1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24332525Smav * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 25287745Sdelphij * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. 26247265Smm * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27286575Smav * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 28289422Smav * Copyright 2013 Saso Kiselkov. All rights reserved. 29296519Smav * Copyright (c) 2014 Integros [integros.com] 30332524Smav * Copyright 2016 Toomas Soome <tsoome@me.com> 31339153Smav * Copyright 2018 Joyent, Inc. 32324010Savg * Copyright (c) 2017 Datto Inc. 33331721Smav * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. 
34168404Spjd */ 35168404Spjd 36168404Spjd/* 37251629Sdelphij * SPA: Storage Pool Allocator 38251629Sdelphij * 39168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 40168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 41168404Spjd * pool. 42168404Spjd */ 43168404Spjd 44168404Spjd#include <sys/zfs_context.h> 45168404Spjd#include <sys/fm/fs/zfs.h> 46168404Spjd#include <sys/spa_impl.h> 47168404Spjd#include <sys/zio.h> 48168404Spjd#include <sys/zio_checksum.h> 49168404Spjd#include <sys/dmu.h> 50168404Spjd#include <sys/dmu_tx.h> 51168404Spjd#include <sys/zap.h> 52168404Spjd#include <sys/zil.h> 53219089Spjd#include <sys/ddt.h> 54168404Spjd#include <sys/vdev_impl.h> 55332525Smav#include <sys/vdev_removal.h> 56332525Smav#include <sys/vdev_indirect_mapping.h> 57332525Smav#include <sys/vdev_indirect_births.h> 58339111Smav#include <sys/vdev_initialize.h> 59168404Spjd#include <sys/metaslab.h> 60219089Spjd#include <sys/metaslab_impl.h> 61168404Spjd#include <sys/uberblock_impl.h> 62168404Spjd#include <sys/txg.h> 63168404Spjd#include <sys/avl.h> 64332525Smav#include <sys/bpobj.h> 65168404Spjd#include <sys/dmu_traverse.h> 66168404Spjd#include <sys/dmu_objset.h> 67168404Spjd#include <sys/unique.h> 68168404Spjd#include <sys/dsl_pool.h> 69168404Spjd#include <sys/dsl_dataset.h> 70168404Spjd#include <sys/dsl_dir.h> 71168404Spjd#include <sys/dsl_prop.h> 72168404Spjd#include <sys/dsl_synctask.h> 73168404Spjd#include <sys/fs/zfs.h> 74185029Spjd#include <sys/arc.h> 75168404Spjd#include <sys/callb.h> 76185029Spjd#include <sys/spa_boot.h> 77219089Spjd#include <sys/zfs_ioctl.h> 78219089Spjd#include <sys/dsl_scan.h> 79248571Smm#include <sys/dmu_send.h> 80248571Smm#include <sys/dsl_destroy.h> 81248571Smm#include <sys/dsl_userhold.h> 82236884Smm#include <sys/zfeature.h> 83219089Spjd#include <sys/zvol.h> 84240868Spjd#include <sys/trim_map.h> 85321610Smav#include <sys/abd.h> 86168404Spjd 87219089Spjd#ifdef _KERNEL 
88219089Spjd#include <sys/callb.h> 89219089Spjd#include <sys/cpupart.h> 90219089Spjd#include <sys/zone.h> 91219089Spjd#endif /* _KERNEL */ 92219089Spjd 93185029Spjd#include "zfs_prop.h" 94185029Spjd#include "zfs_comutil.h" 95168404Spjd 96204073Spjd/* Check hostid on import? */ 97204073Spjdstatic int check_hostid = 1; 98204073Spjd 99251636Sdelphij/* 100251636Sdelphij * The interval, in seconds, at which failed configuration cache file writes 101251636Sdelphij * should be retried. 102251636Sdelphij */ 103332525Smavint zfs_ccw_retry_interval = 300; 104251636Sdelphij 105271785SwillSYSCTL_DECL(_vfs_zfs); 106271785SwillSYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0, 107271785Swill "Check hostid on import?"); 108271785SwillTUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval); 109271785SwillSYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW, 110271785Swill &zfs_ccw_retry_interval, 0, 111271785Swill "Configuration cache file write, retry after failure, interval (seconds)"); 112271785Swill 113219089Spjdtypedef enum zti_modes { 114258631Savg ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 115258631Savg ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ 116258631Savg ZTI_MODE_NULL, /* don't create a taskq */ 117258631Savg ZTI_NMODES 118219089Spjd} zti_modes_t; 119168712Spjd 120258631Savg#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 121258631Savg#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } 122258631Savg#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 123209962Smm 124258631Savg#define ZTI_N(n) ZTI_P(n, 1) 125258631Savg#define ZTI_ONE ZTI_N(1) 126209962Smm 127209962Smmtypedef struct zio_taskq_info { 128258631Savg zti_modes_t zti_mode; 129211931Smm uint_t zti_value; 130258631Savg uint_t zti_count; 131209962Smm} zio_taskq_info_t; 132209962Smm 133209962Smmstatic const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 134219089Spjd "issue", "issue_high", "intr", "intr_high" 135209962Smm}; 136209962Smm 137211931Smm/* 138258631Savg 
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
153211931Smm */ 154211931Smmconst zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 155211931Smm /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 156258631Savg { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 157264670Sdelphij { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ 158358836Smav { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */ 159258631Savg { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 160258631Savg { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 161258631Savg { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 162209962Smm}; 163209962Smm 164248571Smmstatic void spa_sync_version(void *arg, dmu_tx_t *tx); 165248571Smmstatic void spa_sync_props(void *arg, dmu_tx_t *tx); 166185029Spjdstatic boolean_t spa_has_active_shared_spare(spa_t *spa); 167332547Smavstatic int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); 168219089Spjdstatic void spa_vdev_resilver_done(spa_t *spa); 169185029Spjd 170258632Savguint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ 171219089Spjd#ifdef PSRSET_BIND 172219089Spjdid_t zio_taskq_psrset_bind = PS_NONE; 173219089Spjd#endif 174219089Spjd#ifdef SYSDC 175219089Spjdboolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 176314355Savguint_t zio_taskq_basedc = 80; /* base duty cycle */ 177219089Spjd#endif 178219089Spjd 179219089Spjdboolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 180243503Smmextern int zfs_sync_pass_deferred_free; 181219089Spjd 182168404Spjd/* 183332531Smav * Report any spa_load_verify errors found, but do not fail spa_load. 184332531Smav * This is used by zdb to analyze non-idle pools. 185332531Smav */ 186332531Smavboolean_t spa_load_verify_dryrun = B_FALSE; 187332531Smav 188332531Smav/* 189219089Spjd * This (illegal) pool name is used when temporarily importing a spa_t in order 190219089Spjd * to get the vdev stats associated with the imported devices. 
191219089Spjd */ 192219089Spjd#define TRYIMPORT_NAME "$import" 193219089Spjd 194219089Spjd/* 195332536Smav * For debugging purposes: print out vdev tree during pool import. 196332536Smav */ 197332536Smavint spa_load_print_vdev_tree = B_FALSE; 198332536Smav 199332536Smav/* 200332536Smav * A non-zero value for zfs_max_missing_tvds means that we allow importing 201332536Smav * pools with missing top-level vdevs. This is strictly intended for advanced 202332536Smav * pool recovery cases since missing data is almost inevitable. Pools with 203332536Smav * missing devices can only be imported read-only for safety reasons, and their 204332536Smav * fail-mode will be automatically set to "continue". 205332536Smav * 206332536Smav * With 1 missing vdev we should be able to import the pool and mount all 207332536Smav * datasets. User data that was not modified after the missing device has been 208332536Smav * added should be recoverable. This means that snapshots created prior to the 209332536Smav * addition of that device should be completely intact. 210332536Smav * 211332536Smav * With 2 missing vdevs, some datasets may fail to mount since there are 212332536Smav * dataset statistics that are stored as regular metadata. Some data might be 213332536Smav * recoverable if those vdevs were added recently. 214332536Smav * 215332536Smav * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 216332536Smav * may be missing entirely. Chances of data recovery are very low. Note that 217332536Smav * there are also risks of performing an inadvertent rewind as we might be 218332536Smav * missing all the vdevs with the latest uberblocks. 219332536Smav */ 220332536Smavuint64_t zfs_max_missing_tvds = 0; 221332536Smav 222332536Smav/* 223332536Smav * The parameters below are similar to zfs_max_missing_tvds but are only 224332536Smav * intended for a preliminary open of the pool with an untrusted config which 225332536Smav * might be incomplete or out-dated. 
226332536Smav * 227332536Smav * We are more tolerant for pools opened from a cachefile since we could have 228332536Smav * an out-dated cachefile where a device removal was not registered. 229332536Smav * We could have set the limit arbitrarily high but in the case where devices 230332536Smav * are really missing we would want to return the proper error codes; we chose 231332536Smav * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 232332536Smav * and we get a chance to retrieve the trusted config. 233332536Smav */ 234332536Smavuint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 235332547Smav 236332536Smav/* 237332536Smav * In the case where config was assembled by scanning device paths (/dev/dsks 238332536Smav * by default) we are less tolerant since all the existing devices should have 239332536Smav * been detected and we want spa_load to return the right error codes. 240332536Smav */ 241332536Smavuint64_t zfs_max_missing_tvds_scan = 0; 242332536Smav 243332536Smav 244354642SmavSYSCTL_DECL(_vfs_zfs_zio); 245354642SmavSYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_batch_pct, CTLFLAG_RDTUN, 246354642Smav &zio_taskq_batch_pct, 0, 247354642Smav "Percentage of CPUs to run an IO worker thread"); 248332536SmavSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_print_vdev_tree, CTLFLAG_RWTUN, 249332536Smav &spa_load_print_vdev_tree, 0, 250332536Smav "print out vdev tree during pool import"); 251332536SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds, CTLFLAG_RWTUN, 252332536Smav &zfs_max_missing_tvds, 0, 253332536Smav "allow importing pools with missing top-level vdevs"); 254332536SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, 255332536Smav &zfs_max_missing_tvds_cachefile, 0, 256332536Smav "allow importing pools with missing top-level vdevs in cache file"); 257332536SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, 258332536Smav &zfs_max_missing_tvds_scan, 0, 259332536Smav "allow importing 
pools with missing top-level vdevs during scan"); 260332536Smav 261332536Smav/* 262332547Smav * Debugging aid that pauses spa_sync() towards the end. 263332547Smav */ 264332547Smavboolean_t zfs_pause_spa_sync = B_FALSE; 265332547Smav 266332547Smav/* 267168404Spjd * ========================================================================== 268185029Spjd * SPA properties routines 269185029Spjd * ========================================================================== 270185029Spjd */ 271185029Spjd 272185029Spjd/* 273185029Spjd * Add a (source=src, propname=propval) list to an nvlist. 274185029Spjd */ 275185029Spjdstatic void 276185029Spjdspa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 277185029Spjd uint64_t intval, zprop_source_t src) 278185029Spjd{ 279185029Spjd const char *propname = zpool_prop_to_name(prop); 280185029Spjd nvlist_t *propval; 281185029Spjd 282185029Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 283185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 284185029Spjd 285185029Spjd if (strval != NULL) 286185029Spjd VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 287185029Spjd else 288185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 289185029Spjd 290185029Spjd VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 291185029Spjd nvlist_free(propval); 292185029Spjd} 293185029Spjd 294185029Spjd/* 295185029Spjd * Get property values from the spa configuration. 
296185029Spjd */ 297185029Spjdstatic void 298185029Spjdspa_prop_get_config(spa_t *spa, nvlist_t **nvp) 299185029Spjd{ 300236155Smm vdev_t *rvd = spa->spa_root_vdev; 301236884Smm dsl_pool_t *pool = spa->spa_dsl_pool; 302269118Sdelphij uint64_t size, alloc, cap, version; 303185029Spjd zprop_source_t src = ZPROP_SRC_NONE; 304185029Spjd spa_config_dirent_t *dp; 305269118Sdelphij metaslab_class_t *mc = spa_normal_class(spa); 306185029Spjd 307185029Spjd ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 308185029Spjd 309236155Smm if (rvd != NULL) { 310219089Spjd alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 311219089Spjd size = metaslab_class_get_space(spa_normal_class(spa)); 312209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 313209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 314219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 315219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 316219089Spjd size - alloc, src); 317332547Smav spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, 318332547Smav spa->spa_checkpoint_info.sci_dspace, src); 319236155Smm 320269118Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 321269118Sdelphij metaslab_class_fragmentation(mc), src); 322269118Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 323269118Sdelphij metaslab_class_expandable_space(mc), src); 324219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 325219089Spjd (spa_mode(spa) == FREAD), src); 326185029Spjd 327219089Spjd cap = (size == 0) ? 
0 : (alloc * 100 / size); 328209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 329185029Spjd 330219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 331219089Spjd ddt_get_pool_dedup_ratio(spa), src); 332219089Spjd 333209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 334236155Smm rvd->vdev_state, src); 335209962Smm 336209962Smm version = spa_version(spa); 337209962Smm if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 338209962Smm src = ZPROP_SRC_DEFAULT; 339209962Smm else 340209962Smm src = ZPROP_SRC_LOCAL; 341209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 342209962Smm } 343209962Smm 344236884Smm if (pool != NULL) { 345236884Smm /* 346236884Smm * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 347236884Smm * when opening pools before this version freedir will be NULL. 348236884Smm */ 349268079Sdelphij if (pool->dp_free_dir != NULL) { 350236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 351275782Sdelphij dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 352275782Sdelphij src); 353236884Smm } else { 354236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 355236884Smm NULL, 0, src); 356236884Smm } 357268079Sdelphij 358268079Sdelphij if (pool->dp_leak_dir != NULL) { 359268079Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 360275782Sdelphij dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 361275782Sdelphij src); 362268079Sdelphij } else { 363268079Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 364268079Sdelphij NULL, 0, src); 365268079Sdelphij } 366236884Smm } 367236884Smm 368185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 369185029Spjd 370228103Smm if (spa->spa_comment != NULL) { 371228103Smm spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 372228103Smm 0, ZPROP_SRC_LOCAL); 373228103Smm } 374228103Smm 375185029Spjd if (spa->spa_root != NULL) 376185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, 
spa->spa_root, 377185029Spjd 0, ZPROP_SRC_LOCAL); 378185029Spjd 379274337Sdelphij if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 380274337Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 381274337Sdelphij MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 382274337Sdelphij } else { 383274337Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 384274337Sdelphij SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 385274337Sdelphij } 386274337Sdelphij 387185029Spjd if ((dp = list_head(&spa->spa_config_list)) != NULL) { 388185029Spjd if (dp->scd_path == NULL) { 389185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 390185029Spjd "none", 0, ZPROP_SRC_LOCAL); 391185029Spjd } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 392185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 393185029Spjd dp->scd_path, 0, ZPROP_SRC_LOCAL); 394185029Spjd } 395185029Spjd } 396185029Spjd} 397185029Spjd 398185029Spjd/* 399185029Spjd * Get zpool property values. 400185029Spjd */ 401185029Spjdint 402185029Spjdspa_prop_get(spa_t *spa, nvlist_t **nvp) 403185029Spjd{ 404219089Spjd objset_t *mos = spa->spa_meta_objset; 405185029Spjd zap_cursor_t zc; 406185029Spjd zap_attribute_t za; 407185029Spjd int err; 408185029Spjd 409185029Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 410185029Spjd 411185029Spjd mutex_enter(&spa->spa_props_lock); 412185029Spjd 413185029Spjd /* 414185029Spjd * Get properties from the spa config. 415185029Spjd */ 416185029Spjd spa_prop_get_config(spa, nvp); 417185029Spjd 418185029Spjd /* If no pool property object, no more prop to get. */ 419219089Spjd if (mos == NULL || spa->spa_pool_props_object == 0) { 420185029Spjd mutex_exit(&spa->spa_props_lock); 421185029Spjd return (0); 422185029Spjd } 423185029Spjd 424185029Spjd /* 425185029Spjd * Get properties from the MOS pool property object. 
426185029Spjd */ 427185029Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 428185029Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 429185029Spjd zap_cursor_advance(&zc)) { 430185029Spjd uint64_t intval = 0; 431185029Spjd char *strval = NULL; 432185029Spjd zprop_source_t src = ZPROP_SRC_DEFAULT; 433185029Spjd zpool_prop_t prop; 434185029Spjd 435329493Smav if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) 436185029Spjd continue; 437185029Spjd 438185029Spjd switch (za.za_integer_length) { 439185029Spjd case 8: 440185029Spjd /* integer property */ 441185029Spjd if (za.za_first_integer != 442185029Spjd zpool_prop_default_numeric(prop)) 443185029Spjd src = ZPROP_SRC_LOCAL; 444185029Spjd 445185029Spjd if (prop == ZPOOL_PROP_BOOTFS) { 446185029Spjd dsl_pool_t *dp; 447185029Spjd dsl_dataset_t *ds = NULL; 448185029Spjd 449185029Spjd dp = spa_get_dsl(spa); 450248571Smm dsl_pool_config_enter(dp, FTAG); 451339111Smav err = dsl_dataset_hold_obj(dp, 452339111Smav za.za_first_integer, FTAG, &ds); 453339111Smav if (err != 0) { 454248571Smm dsl_pool_config_exit(dp, FTAG); 455185029Spjd break; 456185029Spjd } 457185029Spjd 458307108Smav strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 459185029Spjd KM_SLEEP); 460185029Spjd dsl_dataset_name(ds, strval); 461185029Spjd dsl_dataset_rele(ds, FTAG); 462248571Smm dsl_pool_config_exit(dp, FTAG); 463185029Spjd } else { 464185029Spjd strval = NULL; 465185029Spjd intval = za.za_first_integer; 466185029Spjd } 467185029Spjd 468185029Spjd spa_prop_add_list(*nvp, prop, strval, intval, src); 469185029Spjd 470185029Spjd if (strval != NULL) 471307108Smav kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 472185029Spjd 473185029Spjd break; 474185029Spjd 475185029Spjd case 1: 476185029Spjd /* string property */ 477185029Spjd strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 478185029Spjd err = zap_lookup(mos, spa->spa_pool_props_object, 479185029Spjd za.za_name, 1, za.za_num_integers, strval); 480185029Spjd if (err) { 
481185029Spjd kmem_free(strval, za.za_num_integers); 482185029Spjd break; 483185029Spjd } 484185029Spjd spa_prop_add_list(*nvp, prop, strval, 0, src); 485185029Spjd kmem_free(strval, za.za_num_integers); 486185029Spjd break; 487185029Spjd 488185029Spjd default: 489185029Spjd break; 490185029Spjd } 491185029Spjd } 492185029Spjd zap_cursor_fini(&zc); 493185029Spjd mutex_exit(&spa->spa_props_lock); 494185029Spjdout: 495185029Spjd if (err && err != ENOENT) { 496185029Spjd nvlist_free(*nvp); 497185029Spjd *nvp = NULL; 498185029Spjd return (err); 499185029Spjd } 500185029Spjd 501185029Spjd return (0); 502185029Spjd} 503185029Spjd 504185029Spjd/* 505185029Spjd * Validate the given pool properties nvlist and modify the list 506185029Spjd * for the property values to be set. 507185029Spjd */ 508185029Spjdstatic int 509185029Spjdspa_prop_validate(spa_t *spa, nvlist_t *props) 510185029Spjd{ 511185029Spjd nvpair_t *elem; 512185029Spjd int error = 0, reset_bootfs = 0; 513247187Smm uint64_t objnum = 0; 514236884Smm boolean_t has_feature = B_FALSE; 515185029Spjd 516185029Spjd elem = NULL; 517185029Spjd while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 518185029Spjd uint64_t intval; 519236884Smm char *strval, *slash, *check, *fname; 520236884Smm const char *propname = nvpair_name(elem); 521236884Smm zpool_prop_t prop = zpool_name_to_prop(propname); 522185029Spjd 523236884Smm switch (prop) { 524329493Smav case ZPOOL_PROP_INVAL: 525236884Smm if (!zpool_prop_feature(propname)) { 526249195Smm error = SET_ERROR(EINVAL); 527236884Smm break; 528236884Smm } 529185029Spjd 530236884Smm /* 531236884Smm * Sanitize the input. 
532236884Smm */ 533236884Smm if (nvpair_type(elem) != DATA_TYPE_UINT64) { 534249195Smm error = SET_ERROR(EINVAL); 535236884Smm break; 536236884Smm } 537185029Spjd 538236884Smm if (nvpair_value_uint64(elem, &intval) != 0) { 539249195Smm error = SET_ERROR(EINVAL); 540236884Smm break; 541236884Smm } 542236884Smm 543236884Smm if (intval != 0) { 544249195Smm error = SET_ERROR(EINVAL); 545236884Smm break; 546236884Smm } 547236884Smm 548236884Smm fname = strchr(propname, '@') + 1; 549236884Smm if (zfeature_lookup_name(fname, NULL) != 0) { 550249195Smm error = SET_ERROR(EINVAL); 551236884Smm break; 552236884Smm } 553236884Smm 554236884Smm has_feature = B_TRUE; 555236884Smm break; 556236884Smm 557185029Spjd case ZPOOL_PROP_VERSION: 558185029Spjd error = nvpair_value_uint64(elem, &intval); 559185029Spjd if (!error && 560236884Smm (intval < spa_version(spa) || 561236884Smm intval > SPA_VERSION_BEFORE_FEATURES || 562236884Smm has_feature)) 563249195Smm error = SET_ERROR(EINVAL); 564185029Spjd break; 565185029Spjd 566185029Spjd case ZPOOL_PROP_DELEGATION: 567185029Spjd case ZPOOL_PROP_AUTOREPLACE: 568185029Spjd case ZPOOL_PROP_LISTSNAPS: 569219089Spjd case ZPOOL_PROP_AUTOEXPAND: 570185029Spjd error = nvpair_value_uint64(elem, &intval); 571185029Spjd if (!error && intval > 1) 572249195Smm error = SET_ERROR(EINVAL); 573185029Spjd break; 574185029Spjd 575185029Spjd case ZPOOL_PROP_BOOTFS: 576209962Smm /* 577209962Smm * If the pool version is less than SPA_VERSION_BOOTFS, 578209962Smm * or the pool is still being created (version == 0), 579209962Smm * the bootfs property cannot be set. 
580209962Smm */ 581185029Spjd if (spa_version(spa) < SPA_VERSION_BOOTFS) { 582249195Smm error = SET_ERROR(ENOTSUP); 583185029Spjd break; 584185029Spjd } 585185029Spjd 586185029Spjd /* 587185029Spjd * Make sure the vdev config is bootable 588185029Spjd */ 589185029Spjd if (!vdev_is_bootable(spa->spa_root_vdev)) { 590249195Smm error = SET_ERROR(ENOTSUP); 591185029Spjd break; 592185029Spjd } 593185029Spjd 594185029Spjd reset_bootfs = 1; 595185029Spjd 596185029Spjd error = nvpair_value_string(elem, &strval); 597185029Spjd 598185029Spjd if (!error) { 599236884Smm objset_t *os; 600274337Sdelphij uint64_t propval; 601185029Spjd 602185029Spjd if (strval == NULL || strval[0] == '\0') { 603185029Spjd objnum = zpool_prop_default_numeric( 604185029Spjd ZPOOL_PROP_BOOTFS); 605185029Spjd break; 606185029Spjd } 607185029Spjd 608339111Smav error = dmu_objset_hold(strval, FTAG, &os); 609339111Smav if (error != 0) 610185029Spjd break; 611185029Spjd 612274337Sdelphij /* 613274337Sdelphij * Must be ZPL, and its property settings 614274337Sdelphij * must be supported by GRUB (compression 615274337Sdelphij * is not gzip, and large blocks are not used). 
616274337Sdelphij */ 617219089Spjd 618219089Spjd if (dmu_objset_type(os) != DMU_OST_ZFS) { 619249195Smm error = SET_ERROR(ENOTSUP); 620248571Smm } else if ((error = 621248571Smm dsl_prop_get_int_ds(dmu_objset_ds(os), 622185029Spjd zfs_prop_to_name(ZFS_PROP_COMPRESSION), 623274337Sdelphij &propval)) == 0 && 624274337Sdelphij !BOOTFS_COMPRESS_VALID(propval)) { 625249195Smm error = SET_ERROR(ENOTSUP); 626185029Spjd } else { 627185029Spjd objnum = dmu_objset_id(os); 628185029Spjd } 629219089Spjd dmu_objset_rele(os, FTAG); 630185029Spjd } 631185029Spjd break; 632185029Spjd 633185029Spjd case ZPOOL_PROP_FAILUREMODE: 634185029Spjd error = nvpair_value_uint64(elem, &intval); 635185029Spjd if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 636185029Spjd intval > ZIO_FAILURE_MODE_PANIC)) 637249195Smm error = SET_ERROR(EINVAL); 638185029Spjd 639185029Spjd /* 640185029Spjd * This is a special case which only occurs when 641185029Spjd * the pool has completely failed. This allows 642185029Spjd * the user to change the in-core failmode property 643185029Spjd * without syncing it out to disk (I/Os might 644185029Spjd * currently be blocked). We do this by returning 645185029Spjd * EIO to the caller (spa_prop_set) to trick it 646185029Spjd * into thinking we encountered a property validation 647185029Spjd * error. 
648185029Spjd */ 649185029Spjd if (!error && spa_suspended(spa)) { 650185029Spjd spa->spa_failmode = intval; 651249195Smm error = SET_ERROR(EIO); 652185029Spjd } 653185029Spjd break; 654185029Spjd 655185029Spjd case ZPOOL_PROP_CACHEFILE: 656185029Spjd if ((error = nvpair_value_string(elem, &strval)) != 0) 657185029Spjd break; 658185029Spjd 659185029Spjd if (strval[0] == '\0') 660185029Spjd break; 661185029Spjd 662185029Spjd if (strcmp(strval, "none") == 0) 663185029Spjd break; 664185029Spjd 665185029Spjd if (strval[0] != '/') { 666249195Smm error = SET_ERROR(EINVAL); 667185029Spjd break; 668185029Spjd } 669185029Spjd 670185029Spjd slash = strrchr(strval, '/'); 671185029Spjd ASSERT(slash != NULL); 672185029Spjd 673185029Spjd if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 674185029Spjd strcmp(slash, "/..") == 0) 675249195Smm error = SET_ERROR(EINVAL); 676185029Spjd break; 677219089Spjd 678228103Smm case ZPOOL_PROP_COMMENT: 679228103Smm if ((error = nvpair_value_string(elem, &strval)) != 0) 680228103Smm break; 681228103Smm for (check = strval; *check != '\0'; check++) { 682228103Smm /* 683228103Smm * The kernel doesn't have an easy isprint() 684228103Smm * check. For this kernel check, we merely 685228103Smm * check ASCII apart from DEL. Fix this if 686228103Smm * there is an easy-to-use kernel isprint(). 
687228103Smm */ 688228103Smm if (*check >= 0x7f) { 689249195Smm error = SET_ERROR(EINVAL); 690228103Smm break; 691228103Smm } 692228103Smm } 693228103Smm if (strlen(strval) > ZPROP_MAX_COMMENT) 694228103Smm error = E2BIG; 695228103Smm break; 696228103Smm 697219089Spjd case ZPOOL_PROP_DEDUPDITTO: 698219089Spjd if (spa_version(spa) < SPA_VERSION_DEDUP) 699249195Smm error = SET_ERROR(ENOTSUP); 700219089Spjd else 701219089Spjd error = nvpair_value_uint64(elem, &intval); 702219089Spjd if (error == 0 && 703219089Spjd intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 704249195Smm error = SET_ERROR(EINVAL); 705219089Spjd break; 706185029Spjd } 707185029Spjd 708185029Spjd if (error) 709185029Spjd break; 710185029Spjd } 711185029Spjd 712185029Spjd if (!error && reset_bootfs) { 713185029Spjd error = nvlist_remove(props, 714185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 715185029Spjd 716185029Spjd if (!error) { 717185029Spjd error = nvlist_add_uint64(props, 718185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 719185029Spjd } 720185029Spjd } 721185029Spjd 722185029Spjd return (error); 723185029Spjd} 724185029Spjd 725209962Smmvoid 726209962Smmspa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 727209962Smm{ 728209962Smm char *cachefile; 729209962Smm spa_config_dirent_t *dp; 730209962Smm 731209962Smm if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 732209962Smm &cachefile) != 0) 733209962Smm return; 734209962Smm 735209962Smm dp = kmem_alloc(sizeof (spa_config_dirent_t), 736209962Smm KM_SLEEP); 737209962Smm 738209962Smm if (cachefile[0] == '\0') 739209962Smm dp->scd_path = spa_strdup(spa_config_path); 740209962Smm else if (strcmp(cachefile, "none") == 0) 741209962Smm dp->scd_path = NULL; 742209962Smm else 743209962Smm dp->scd_path = spa_strdup(cachefile); 744209962Smm 745209962Smm list_insert_head(&spa->spa_config_list, dp); 746209962Smm if (need_sync) 747209962Smm spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 
748209962Smm} 749209962Smm 750185029Spjdint 751185029Spjdspa_prop_set(spa_t *spa, nvlist_t *nvp) 752185029Spjd{ 753185029Spjd int error; 754236884Smm nvpair_t *elem = NULL; 755209962Smm boolean_t need_sync = B_FALSE; 756185029Spjd 757185029Spjd if ((error = spa_prop_validate(spa, nvp)) != 0) 758185029Spjd return (error); 759185029Spjd 760209962Smm while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 761236884Smm zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 762209962Smm 763219089Spjd if (prop == ZPOOL_PROP_CACHEFILE || 764219089Spjd prop == ZPOOL_PROP_ALTROOT || 765219089Spjd prop == ZPOOL_PROP_READONLY) 766209962Smm continue; 767209962Smm 768329493Smav if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 769236884Smm uint64_t ver; 770236884Smm 771236884Smm if (prop == ZPOOL_PROP_VERSION) { 772236884Smm VERIFY(nvpair_value_uint64(elem, &ver) == 0); 773236884Smm } else { 774236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 775236884Smm ver = SPA_VERSION_FEATURES; 776236884Smm need_sync = B_TRUE; 777236884Smm } 778236884Smm 779236884Smm /* Save time if the version is already set. */ 780236884Smm if (ver == spa_version(spa)) 781236884Smm continue; 782236884Smm 783236884Smm /* 784236884Smm * In addition to the pool directory object, we might 785236884Smm * create the pool properties object, the features for 786236884Smm * read object, the features for write object, or the 787236884Smm * feature descriptions object. 
788236884Smm */ 789248571Smm error = dsl_sync_task(spa->spa_name, NULL, 790268473Sdelphij spa_sync_version, &ver, 791268473Sdelphij 6, ZFS_SPACE_CHECK_RESERVED); 792236884Smm if (error) 793236884Smm return (error); 794236884Smm continue; 795236884Smm } 796236884Smm 797209962Smm need_sync = B_TRUE; 798209962Smm break; 799209962Smm } 800209962Smm 801236884Smm if (need_sync) { 802248571Smm return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 803268473Sdelphij nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 804236884Smm } 805236884Smm 806236884Smm return (0); 807185029Spjd} 808185029Spjd 809185029Spjd/* 810185029Spjd * If the bootfs property value is dsobj, clear it. 811185029Spjd */ 812185029Spjdvoid 813185029Spjdspa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 814185029Spjd{ 815185029Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 816185029Spjd VERIFY(zap_remove(spa->spa_meta_objset, 817185029Spjd spa->spa_pool_props_object, 818185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 819185029Spjd spa->spa_bootfs = 0; 820185029Spjd } 821185029Spjd} 822185029Spjd 823239620Smm/*ARGSUSED*/ 824239620Smmstatic int 825248571Smmspa_change_guid_check(void *arg, dmu_tx_t *tx) 826239620Smm{ 827248571Smm uint64_t *newguid = arg; 828248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 829239620Smm vdev_t *rvd = spa->spa_root_vdev; 830239620Smm uint64_t vdev_state; 831239620Smm 832332547Smav if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 833332547Smav int error = (spa_has_checkpoint(spa)) ? 
834332547Smav ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 835332547Smav return (SET_ERROR(error)); 836332547Smav } 837332547Smav 838239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 839239620Smm vdev_state = rvd->vdev_state; 840239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 841239620Smm 842239620Smm if (vdev_state != VDEV_STATE_HEALTHY) 843249195Smm return (SET_ERROR(ENXIO)); 844239620Smm 845239620Smm ASSERT3U(spa_guid(spa), !=, *newguid); 846239620Smm 847239620Smm return (0); 848239620Smm} 849239620Smm 850239620Smmstatic void 851248571Smmspa_change_guid_sync(void *arg, dmu_tx_t *tx) 852239620Smm{ 853248571Smm uint64_t *newguid = arg; 854248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 855239620Smm uint64_t oldguid; 856239620Smm vdev_t *rvd = spa->spa_root_vdev; 857239620Smm 858239620Smm oldguid = spa_guid(spa); 859239620Smm 860239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 861239620Smm rvd->vdev_guid = *newguid; 862239620Smm rvd->vdev_guid_sum += (*newguid - oldguid); 863239620Smm vdev_config_dirty(rvd); 864239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 865239620Smm 866248571Smm spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 867239620Smm oldguid, *newguid); 868239620Smm} 869239620Smm 870185029Spjd/* 871228103Smm * Change the GUID for the pool. This is done so that we can later 872228103Smm * re-import a pool built from a clone of our own vdevs. We will modify 873228103Smm * the root vdev's guid, our own pool guid, and then mark all of our 874228103Smm * vdevs dirty. Note that we must make sure that all our vdevs are 875228103Smm * online when we do this, or else any vdevs that weren't present 876228103Smm * would be orphaned from our pool. We are also going to issue a 877228103Smm * sysevent to update any watchers. 
878228103Smm */ 879228103Smmint 880228103Smmspa_change_guid(spa_t *spa) 881228103Smm{ 882239620Smm int error; 883239620Smm uint64_t guid; 884228103Smm 885254074Sdelphij mutex_enter(&spa->spa_vdev_top_lock); 886239620Smm mutex_enter(&spa_namespace_lock); 887239620Smm guid = spa_generate_guid(NULL); 888228103Smm 889248571Smm error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 890268473Sdelphij spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 891228103Smm 892239620Smm if (error == 0) { 893332525Smav spa_write_cachefile(spa, B_FALSE, B_TRUE); 894331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 895239620Smm } 896228103Smm 897239620Smm mutex_exit(&spa_namespace_lock); 898254074Sdelphij mutex_exit(&spa->spa_vdev_top_lock); 899228103Smm 900239620Smm return (error); 901228103Smm} 902228103Smm 903228103Smm/* 904185029Spjd * ========================================================================== 905168404Spjd * SPA state manipulation (open/create/destroy/import/export) 906168404Spjd * ========================================================================== 907168404Spjd */ 908168404Spjd 909168404Spjdstatic int 910168404Spjdspa_error_entry_compare(const void *a, const void *b) 911168404Spjd{ 912339158Smav const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 913339158Smav const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 914168404Spjd int ret; 915168404Spjd 916339158Smav ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 917268123Sdelphij sizeof (zbookmark_phys_t)); 918168404Spjd 919339158Smav return (AVL_ISIGN(ret)); 920168404Spjd} 921168404Spjd 922168404Spjd/* 923168404Spjd * Utility function which retrieves copies of the current logs and 924168404Spjd * re-initializes them in the process. 
925168404Spjd */ 926168404Spjdvoid 927168404Spjdspa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 928168404Spjd{ 929168404Spjd ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 930168404Spjd 931168404Spjd bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 932168404Spjd bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 933168404Spjd 934168404Spjd avl_create(&spa->spa_errlist_scrub, 935168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 936168404Spjd offsetof(spa_error_entry_t, se_avl)); 937168404Spjd avl_create(&spa->spa_errlist_last, 938168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 939168404Spjd offsetof(spa_error_entry_t, se_avl)); 940168404Spjd} 941168404Spjd 942258631Savgstatic void 943258631Savgspa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 944168404Spjd{ 945258631Savg const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 946258631Savg enum zti_modes mode = ztip->zti_mode; 947258631Savg uint_t value = ztip->zti_value; 948258631Savg uint_t count = ztip->zti_count; 949258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 950258631Savg char name[32]; 951258630Savg uint_t flags = 0; 952219089Spjd boolean_t batch = B_FALSE; 953168404Spjd 954258631Savg if (mode == ZTI_MODE_NULL) { 955258631Savg tqs->stqs_count = 0; 956258631Savg tqs->stqs_taskq = NULL; 957258631Savg return; 958258631Savg } 959168404Spjd 960258631Savg ASSERT3U(count, >, 0); 961168404Spjd 962258631Savg tqs->stqs_count = count; 963258631Savg tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 964219089Spjd 965258632Savg switch (mode) { 966258632Savg case ZTI_MODE_FIXED: 967258632Savg ASSERT3U(value, >=, 1); 968258632Savg value = MAX(value, 1); 969258632Savg break; 970219089Spjd 971258632Savg case ZTI_MODE_BATCH: 972258632Savg batch = B_TRUE; 973258632Savg flags |= TASKQ_THREADS_CPU_PCT; 974258632Savg value = zio_taskq_batch_pct; 975258632Savg break; 976219089Spjd 977258632Savg default: 978258632Savg 
panic("unrecognized mode for %s_%s taskq (%u:%u) in " 979258632Savg "spa_activate()", 980258632Savg zio_type_name[t], zio_taskq_types[q], mode, value); 981258632Savg break; 982258632Savg } 983258631Savg 984258632Savg for (uint_t i = 0; i < count; i++) { 985258632Savg taskq_t *tq; 986258631Savg 987258631Savg if (count > 1) { 988258631Savg (void) snprintf(name, sizeof (name), "%s_%s_%u", 989258631Savg zio_type_name[t], zio_taskq_types[q], i); 990258631Savg } else { 991258631Savg (void) snprintf(name, sizeof (name), "%s_%s", 992258631Savg zio_type_name[t], zio_taskq_types[q]); 993258631Savg } 994258631Savg 995219089Spjd#ifdef SYSDC 996258631Savg if (zio_taskq_sysdc && spa->spa_proc != &p0) { 997258631Savg if (batch) 998258631Savg flags |= TASKQ_DC_BATCH; 999219089Spjd 1000258631Savg tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1001258631Savg spa->spa_proc, zio_taskq_basedc, flags); 1002258631Savg } else { 1003258631Savg#endif 1004258632Savg pri_t pri = maxclsyspri; 1005258632Savg /* 1006258632Savg * The write issue taskq can be extremely CPU 1007258632Savg * intensive. Run it at slightly lower priority 1008258632Savg * than the other taskqs. 1009314858Savg * FreeBSD notes: 1010314858Savg * - numerically higher priorities are lower priorities; 1011314858Savg * - if priorities divided by four (RQ_PPQ) are equal 1012314858Savg * then a difference between them is insignificant. 
1013258632Savg */ 1014258632Savg if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) 1015314858Savg#ifdef illumos 1016314858Savg pri--; 1017314858Savg#else 1018314858Savg pri += 4; 1019314858Savg#endif 1020258632Savg 1021258632Savg tq = taskq_create_proc(name, value, pri, 50, 1022258631Savg INT_MAX, spa->spa_proc, flags); 1023258631Savg#ifdef SYSDC 1024258631Savg } 1025258631Savg#endif 1026258631Savg 1027258631Savg tqs->stqs_taskq[i] = tq; 1028219089Spjd } 1029219089Spjd} 1030219089Spjd 1031219089Spjdstatic void 1032258631Savgspa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1033258631Savg{ 1034258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1035258631Savg 1036258631Savg if (tqs->stqs_taskq == NULL) { 1037258631Savg ASSERT0(tqs->stqs_count); 1038258631Savg return; 1039258631Savg } 1040258631Savg 1041258631Savg for (uint_t i = 0; i < tqs->stqs_count; i++) { 1042258631Savg ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1043258631Savg taskq_destroy(tqs->stqs_taskq[i]); 1044258631Savg } 1045258631Savg 1046258631Savg kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1047258631Savg tqs->stqs_taskq = NULL; 1048258631Savg} 1049258631Savg 1050258631Savg/* 1051258631Savg * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1052258631Savg * Note that a type may have multiple discrete taskqs to avoid lock contention 1053258631Savg * on the taskq itself. In that case we choose which taskq at random by using 1054258631Savg * the low bits of gethrtime(). 
1055258631Savg */ 1056258631Savgvoid 1057258631Savgspa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1058258631Savg task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) 1059258631Savg{ 1060258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1061258631Savg taskq_t *tq; 1062258631Savg 1063258631Savg ASSERT3P(tqs->stqs_taskq, !=, NULL); 1064258631Savg ASSERT3U(tqs->stqs_count, !=, 0); 1065258631Savg 1066258631Savg if (tqs->stqs_count == 1) { 1067258631Savg tq = tqs->stqs_taskq[0]; 1068258631Savg } else { 1069267038Sbdrewery#ifdef _KERNEL 1070345123Smav tq = tqs->stqs_taskq[(u_int)(sbinuptime() + curcpu) % 1071345123Smav tqs->stqs_count]; 1072267038Sbdrewery#else 1073267038Sbdrewery tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; 1074267038Sbdrewery#endif 1075258631Savg } 1076258631Savg 1077258631Savg taskq_dispatch_ent(tq, func, arg, flags, ent); 1078258631Savg} 1079258631Savg 1080258631Savgstatic void 1081219089Spjdspa_create_zio_taskqs(spa_t *spa) 1082219089Spjd{ 1083185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 1084185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1085258631Savg spa_taskqs_init(spa, t, q); 1086219089Spjd } 1087219089Spjd } 1088219089Spjd} 1089209962Smm 1090219089Spjd#ifdef _KERNEL 1091219089Spjd#ifdef SPA_PROCESS 1092219089Spjdstatic void 1093219089Spjdspa_thread(void *arg) 1094219089Spjd{ 1095219089Spjd callb_cpr_t cprinfo; 1096209962Smm 1097219089Spjd spa_t *spa = arg; 1098219089Spjd user_t *pu = PTOU(curproc); 1099209962Smm 1100219089Spjd CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1101219089Spjd spa->spa_name); 1102209962Smm 1103219089Spjd ASSERT(curproc != &p0); 1104219089Spjd (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1105219089Spjd "zpool-%s", spa->spa_name); 1106219089Spjd (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1107211931Smm 1108219089Spjd#ifdef PSRSET_BIND 1109219089Spjd /* bind this thread to the requested psrset */ 1110219089Spjd if 
(zio_taskq_psrset_bind != PS_NONE) { 1111219089Spjd pool_lock(); 1112219089Spjd mutex_enter(&cpu_lock); 1113219089Spjd mutex_enter(&pidlock); 1114219089Spjd mutex_enter(&curproc->p_lock); 1115219089Spjd 1116219089Spjd if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1117219089Spjd 0, NULL, NULL) == 0) { 1118219089Spjd curthread->t_bind_pset = zio_taskq_psrset_bind; 1119219089Spjd } else { 1120219089Spjd cmn_err(CE_WARN, 1121219089Spjd "Couldn't bind process for zfs pool \"%s\" to " 1122219089Spjd "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1123219089Spjd } 1124219089Spjd 1125219089Spjd mutex_exit(&curproc->p_lock); 1126219089Spjd mutex_exit(&pidlock); 1127219089Spjd mutex_exit(&cpu_lock); 1128219089Spjd pool_unlock(); 1129219089Spjd } 1130219089Spjd#endif 1131219089Spjd 1132219089Spjd#ifdef SYSDC 1133219089Spjd if (zio_taskq_sysdc) { 1134219089Spjd sysdc_thread_enter(curthread, 100, 0); 1135219089Spjd } 1136219089Spjd#endif 1137219089Spjd 1138219089Spjd spa->spa_proc = curproc; 1139219089Spjd spa->spa_did = curthread->t_did; 1140219089Spjd 1141219089Spjd spa_create_zio_taskqs(spa); 1142219089Spjd 1143219089Spjd mutex_enter(&spa->spa_proc_lock); 1144219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1145219089Spjd 1146219089Spjd spa->spa_proc_state = SPA_PROC_ACTIVE; 1147219089Spjd cv_broadcast(&spa->spa_proc_cv); 1148219089Spjd 1149219089Spjd CALLB_CPR_SAFE_BEGIN(&cprinfo); 1150219089Spjd while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1151219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1152219089Spjd CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1153219089Spjd 1154219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1155219089Spjd spa->spa_proc_state = SPA_PROC_GONE; 1156219089Spjd spa->spa_proc = &p0; 1157219089Spjd cv_broadcast(&spa->spa_proc_cv); 1158219089Spjd CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1159219089Spjd 1160219089Spjd mutex_enter(&curproc->p_lock); 1161219089Spjd lwp_exit(); 
1162219089Spjd} 1163219089Spjd#endif /* SPA_PROCESS */ 1164219089Spjd#endif 1165219089Spjd 1166219089Spjd/* 1167219089Spjd * Activate an uninitialized pool. 1168219089Spjd */ 1169219089Spjdstatic void 1170219089Spjdspa_activate(spa_t *spa, int mode) 1171219089Spjd{ 1172219089Spjd ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1173219089Spjd 1174219089Spjd spa->spa_state = POOL_STATE_ACTIVE; 1175219089Spjd spa->spa_mode = mode; 1176219089Spjd 1177219089Spjd spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 1178219089Spjd spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 1179219089Spjd 1180219089Spjd /* Try to create a covering process */ 1181219089Spjd mutex_enter(&spa->spa_proc_lock); 1182219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1183219089Spjd ASSERT(spa->spa_proc == &p0); 1184219089Spjd spa->spa_did = 0; 1185219089Spjd 1186219089Spjd#ifdef SPA_PROCESS 1187219089Spjd /* Only create a process if we're going to be around a while. */ 1188219089Spjd if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1189219089Spjd if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1190219089Spjd NULL, 0) == 0) { 1191219089Spjd spa->spa_proc_state = SPA_PROC_CREATED; 1192219089Spjd while (spa->spa_proc_state == SPA_PROC_CREATED) { 1193219089Spjd cv_wait(&spa->spa_proc_cv, 1194219089Spjd &spa->spa_proc_lock); 1195209962Smm } 1196219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1197219089Spjd ASSERT(spa->spa_proc != &p0); 1198219089Spjd ASSERT(spa->spa_did != 0); 1199219089Spjd } else { 1200219089Spjd#ifdef _KERNEL 1201219089Spjd cmn_err(CE_WARN, 1202219089Spjd "Couldn't create process for zfs pool \"%s\"\n", 1203219089Spjd spa->spa_name); 1204219089Spjd#endif 1205185029Spjd } 1206168404Spjd } 1207219089Spjd#endif /* SPA_PROCESS */ 1208219089Spjd mutex_exit(&spa->spa_proc_lock); 1209168404Spjd 1210219089Spjd /* If we didn't create a process, we need to create our taskqs. 
*/ 1211219089Spjd ASSERT(spa->spa_proc == &p0); 1212219089Spjd if (spa->spa_proc == &p0) { 1213219089Spjd spa_create_zio_taskqs(spa); 1214219089Spjd } 1215219089Spjd 1216240868Spjd /* 1217240868Spjd * Start TRIM thread. 1218240868Spjd */ 1219240868Spjd trim_thread_create(spa); 1220240868Spjd 1221339111Smav for (size_t i = 0; i < TXG_SIZE; i++) { 1222339111Smav spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1223339111Smav ZIO_FLAG_CANFAIL); 1224339111Smav } 1225332525Smav 1226185029Spjd list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1227185029Spjd offsetof(vdev_t, vdev_config_dirty_node)); 1228286575Smav list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1229286575Smav offsetof(objset_t, os_evicting_node)); 1230185029Spjd list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1231185029Spjd offsetof(vdev_t, vdev_state_dirty_node)); 1232168404Spjd 1233321567Smav txg_list_create(&spa->spa_vdev_txg_list, spa, 1234168404Spjd offsetof(struct vdev, vdev_txg_node)); 1235168404Spjd 1236168404Spjd avl_create(&spa->spa_errlist_scrub, 1237168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 1238168404Spjd offsetof(spa_error_entry_t, se_avl)); 1239168404Spjd avl_create(&spa->spa_errlist_last, 1240168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 1241168404Spjd offsetof(spa_error_entry_t, se_avl)); 1242168404Spjd} 1243168404Spjd 1244168404Spjd/* 1245168404Spjd * Opposite of spa_activate(). 1246168404Spjd */ 1247168404Spjdstatic void 1248168404Spjdspa_deactivate(spa_t *spa) 1249168404Spjd{ 1250168404Spjd ASSERT(spa->spa_sync_on == B_FALSE); 1251168404Spjd ASSERT(spa->spa_dsl_pool == NULL); 1252168404Spjd ASSERT(spa->spa_root_vdev == NULL); 1253209962Smm ASSERT(spa->spa_async_zio_root == NULL); 1254168404Spjd ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1255168404Spjd 1256240868Spjd /* 1257240868Spjd * Stop TRIM thread in case spa_unload() wasn't called directly 1258240868Spjd * before spa_deactivate(). 
1259240868Spjd */ 1260240868Spjd trim_thread_destroy(spa); 1261240868Spjd 1262286575Smav spa_evicting_os_wait(spa); 1263286575Smav 1264168404Spjd txg_list_destroy(&spa->spa_vdev_txg_list); 1265168404Spjd 1266185029Spjd list_destroy(&spa->spa_config_dirty_list); 1267286575Smav list_destroy(&spa->spa_evicting_os_list); 1268185029Spjd list_destroy(&spa->spa_state_dirty_list); 1269168404Spjd 1270185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 1271185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1272258631Savg spa_taskqs_fini(spa, t, q); 1273185029Spjd } 1274168404Spjd } 1275168404Spjd 1276332525Smav for (size_t i = 0; i < TXG_SIZE; i++) { 1277332525Smav ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1278332525Smav VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1279332525Smav spa->spa_txg_zio[i] = NULL; 1280332525Smav } 1281332525Smav 1282168404Spjd metaslab_class_destroy(spa->spa_normal_class); 1283168404Spjd spa->spa_normal_class = NULL; 1284168404Spjd 1285185029Spjd metaslab_class_destroy(spa->spa_log_class); 1286185029Spjd spa->spa_log_class = NULL; 1287185029Spjd 1288168404Spjd /* 1289168404Spjd * If this was part of an import or the open otherwise failed, we may 1290168404Spjd * still have errors left in the queues. Empty them just in case. 
1291168404Spjd */ 1292168404Spjd spa_errlog_drain(spa); 1293168404Spjd 1294168404Spjd avl_destroy(&spa->spa_errlist_scrub); 1295168404Spjd avl_destroy(&spa->spa_errlist_last); 1296168404Spjd 1297168404Spjd spa->spa_state = POOL_STATE_UNINITIALIZED; 1298219089Spjd 1299219089Spjd mutex_enter(&spa->spa_proc_lock); 1300219089Spjd if (spa->spa_proc_state != SPA_PROC_NONE) { 1301219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1302219089Spjd spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1303219089Spjd cv_broadcast(&spa->spa_proc_cv); 1304219089Spjd while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1305219089Spjd ASSERT(spa->spa_proc != &p0); 1306219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1307219089Spjd } 1308219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1309219089Spjd spa->spa_proc_state = SPA_PROC_NONE; 1310219089Spjd } 1311219089Spjd ASSERT(spa->spa_proc == &p0); 1312219089Spjd mutex_exit(&spa->spa_proc_lock); 1313219089Spjd 1314219089Spjd#ifdef SPA_PROCESS 1315219089Spjd /* 1316219089Spjd * We want to make sure spa_thread() has actually exited the ZFS 1317219089Spjd * module, so that the module can't be unloaded out from underneath 1318219089Spjd * it. 1319219089Spjd */ 1320219089Spjd if (spa->spa_did != 0) { 1321219089Spjd thread_join(spa->spa_did); 1322219089Spjd spa->spa_did = 0; 1323219089Spjd } 1324219089Spjd#endif /* SPA_PROCESS */ 1325168404Spjd} 1326168404Spjd 1327168404Spjd/* 1328168404Spjd * Verify a pool configuration, and construct the vdev tree appropriately. This 1329168404Spjd * will create all the necessary vdevs in the appropriate layout, with each vdev 1330168404Spjd * in the CLOSED state. This will prep the pool before open/creation/import. 1331168404Spjd * All vdev validation is done by the vdev_alloc() routine. 
1332168404Spjd */ 1333168404Spjdstatic int 1334168404Spjdspa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1335168404Spjd uint_t id, int atype) 1336168404Spjd{ 1337168404Spjd nvlist_t **child; 1338219089Spjd uint_t children; 1339168404Spjd int error; 1340168404Spjd 1341168404Spjd if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1342168404Spjd return (error); 1343168404Spjd 1344168404Spjd if ((*vdp)->vdev_ops->vdev_op_leaf) 1345168404Spjd return (0); 1346168404Spjd 1347185029Spjd error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1348185029Spjd &child, &children); 1349185029Spjd 1350185029Spjd if (error == ENOENT) 1351185029Spjd return (0); 1352185029Spjd 1353185029Spjd if (error) { 1354168404Spjd vdev_free(*vdp); 1355168404Spjd *vdp = NULL; 1356249195Smm return (SET_ERROR(EINVAL)); 1357168404Spjd } 1358168404Spjd 1359219089Spjd for (int c = 0; c < children; c++) { 1360168404Spjd vdev_t *vd; 1361168404Spjd if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1362168404Spjd atype)) != 0) { 1363168404Spjd vdev_free(*vdp); 1364168404Spjd *vdp = NULL; 1365168404Spjd return (error); 1366168404Spjd } 1367168404Spjd } 1368168404Spjd 1369168404Spjd ASSERT(*vdp != NULL); 1370168404Spjd 1371168404Spjd return (0); 1372168404Spjd} 1373168404Spjd 1374168404Spjd/* 1375168404Spjd * Opposite of spa_load(). 1376168404Spjd */ 1377168404Spjdstatic void 1378168404Spjdspa_unload(spa_t *spa) 1379168404Spjd{ 1380168404Spjd int i; 1381168404Spjd 1382185029Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1383185029Spjd 1384332530Smav spa_load_note(spa, "UNLOADING"); 1385332530Smav 1386168404Spjd /* 1387240868Spjd * Stop TRIM thread. 1388240868Spjd */ 1389240868Spjd trim_thread_destroy(spa); 1390240868Spjd 1391240868Spjd /* 1392168404Spjd * Stop async tasks. 
1393168404Spjd */ 1394168404Spjd spa_async_suspend(spa); 1395168404Spjd 1396339111Smav if (spa->spa_root_vdev) { 1397339111Smav vdev_initialize_stop_all(spa->spa_root_vdev, 1398339111Smav VDEV_INITIALIZE_ACTIVE); 1399339111Smav } 1400339111Smav 1401168404Spjd /* 1402168404Spjd * Stop syncing. 1403168404Spjd */ 1404168404Spjd if (spa->spa_sync_on) { 1405168404Spjd txg_sync_stop(spa->spa_dsl_pool); 1406168404Spjd spa->spa_sync_on = B_FALSE; 1407168404Spjd } 1408168404Spjd 1409168404Spjd /* 1410321529Smav * Even though vdev_free() also calls vdev_metaslab_fini, we need 1411321529Smav * to call it earlier, before we wait for async i/o to complete. 1412321529Smav * This ensures that there is no async metaslab prefetching, by 1413321529Smav * calling taskq_wait(mg_taskq). 1414321529Smav */ 1415321529Smav if (spa->spa_root_vdev != NULL) { 1416339111Smav spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 1417321529Smav for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) 1418321529Smav vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); 1419339111Smav spa_config_exit(spa, SCL_ALL, spa); 1420321529Smav } 1421321529Smav 1422321529Smav /* 1423185029Spjd * Wait for any outstanding async I/O to complete. 
1424168404Spjd */ 1425209962Smm if (spa->spa_async_zio_root != NULL) { 1426272598Sdelphij for (int i = 0; i < max_ncpus; i++) 1427272598Sdelphij (void) zio_wait(spa->spa_async_zio_root[i]); 1428272598Sdelphij kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 1429209962Smm spa->spa_async_zio_root = NULL; 1430209962Smm } 1431168404Spjd 1432332525Smav if (spa->spa_vdev_removal != NULL) { 1433332525Smav spa_vdev_removal_destroy(spa->spa_vdev_removal); 1434332525Smav spa->spa_vdev_removal = NULL; 1435332525Smav } 1436332525Smav 1437332537Smav if (spa->spa_condense_zthr != NULL) { 1438332537Smav ASSERT(!zthr_isrunning(spa->spa_condense_zthr)); 1439332537Smav zthr_destroy(spa->spa_condense_zthr); 1440332537Smav spa->spa_condense_zthr = NULL; 1441332537Smav } 1442332537Smav 1443332547Smav if (spa->spa_checkpoint_discard_zthr != NULL) { 1444332547Smav ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr)); 1445332547Smav zthr_destroy(spa->spa_checkpoint_discard_zthr); 1446332547Smav spa->spa_checkpoint_discard_zthr = NULL; 1447332547Smav } 1448332547Smav 1449332525Smav spa_condense_fini(spa); 1450332525Smav 1451219089Spjd bpobj_close(&spa->spa_deferred_bpobj); 1452219089Spjd 1453339111Smav spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 1454258717Savg 1455168404Spjd /* 1456258717Savg * Close all vdevs. 1457258717Savg */ 1458258717Savg if (spa->spa_root_vdev) 1459258717Savg vdev_free(spa->spa_root_vdev); 1460258717Savg ASSERT(spa->spa_root_vdev == NULL); 1461258717Savg 1462258717Savg /* 1463168404Spjd * Close the dsl pool. 
1464168404Spjd */ 1465168404Spjd if (spa->spa_dsl_pool) { 1466168404Spjd dsl_pool_close(spa->spa_dsl_pool); 1467168404Spjd spa->spa_dsl_pool = NULL; 1468219089Spjd spa->spa_meta_objset = NULL; 1469168404Spjd } 1470168404Spjd 1471219089Spjd ddt_unload(spa); 1472219089Spjd 1473168404Spjd /* 1474209962Smm * Drop and purge level 2 cache 1475209962Smm */ 1476209962Smm spa_l2cache_drop(spa); 1477209962Smm 1478185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1479185029Spjd vdev_free(spa->spa_spares.sav_vdevs[i]); 1480185029Spjd if (spa->spa_spares.sav_vdevs) { 1481185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1482185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1483185029Spjd spa->spa_spares.sav_vdevs = NULL; 1484168404Spjd } 1485185029Spjd if (spa->spa_spares.sav_config) { 1486185029Spjd nvlist_free(spa->spa_spares.sav_config); 1487185029Spjd spa->spa_spares.sav_config = NULL; 1488168404Spjd } 1489185029Spjd spa->spa_spares.sav_count = 0; 1490168404Spjd 1491230514Smm for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 1492230514Smm vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1493185029Spjd vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1494230514Smm } 1495185029Spjd if (spa->spa_l2cache.sav_vdevs) { 1496185029Spjd kmem_free(spa->spa_l2cache.sav_vdevs, 1497185029Spjd spa->spa_l2cache.sav_count * sizeof (void *)); 1498185029Spjd spa->spa_l2cache.sav_vdevs = NULL; 1499185029Spjd } 1500185029Spjd if (spa->spa_l2cache.sav_config) { 1501185029Spjd nvlist_free(spa->spa_l2cache.sav_config); 1502185029Spjd spa->spa_l2cache.sav_config = NULL; 1503185029Spjd } 1504185029Spjd spa->spa_l2cache.sav_count = 0; 1505185029Spjd 1506168404Spjd spa->spa_async_suspended = 0; 1507209962Smm 1508332525Smav spa->spa_indirect_vdevs_loaded = B_FALSE; 1509332525Smav 1510228103Smm if (spa->spa_comment != NULL) { 1511228103Smm spa_strfree(spa->spa_comment); 1512228103Smm spa->spa_comment = NULL; 1513228103Smm } 1514228103Smm 1515339111Smav spa_config_exit(spa, SCL_ALL, spa); 
1516168404Spjd} 1517168404Spjd 1518168404Spjd/* 1519168404Spjd * Load (or re-load) the current list of vdevs describing the active spares for 1520168404Spjd * this pool. When this is called, we have some form of basic information in 1521185029Spjd * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1522185029Spjd * then re-generate a more complete list including status information. 1523168404Spjd */ 1524332525Smavvoid 1525168404Spjdspa_load_spares(spa_t *spa) 1526168404Spjd{ 1527168404Spjd nvlist_t **spares; 1528168404Spjd uint_t nspares; 1529168404Spjd int i; 1530168404Spjd vdev_t *vd, *tvd; 1531168404Spjd 1532332547Smav#ifndef _KERNEL 1533332547Smav /* 1534332547Smav * zdb opens both the current state of the pool and the 1535332547Smav * checkpointed state (if present), with a different spa_t. 1536332547Smav * 1537332547Smav * As spare vdevs are shared among open pools, we skip loading 1538332547Smav * them when we load the checkpointed state of the pool. 1539332547Smav */ 1540332547Smav if (!spa_writeable(spa)) 1541332547Smav return; 1542332547Smav#endif 1543332547Smav 1544185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1545185029Spjd 1546168404Spjd /* 1547168404Spjd * First, close and free any existing spare vdevs. 
1548168404Spjd */ 1549185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1550185029Spjd vd = spa->spa_spares.sav_vdevs[i]; 1551168404Spjd 1552168404Spjd /* Undo the call to spa_activate() below */ 1553185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1554185029Spjd B_FALSE)) != NULL && tvd->vdev_isspare) 1555168404Spjd spa_spare_remove(tvd); 1556168404Spjd vdev_close(vd); 1557168404Spjd vdev_free(vd); 1558168404Spjd } 1559168404Spjd 1560185029Spjd if (spa->spa_spares.sav_vdevs) 1561185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1562185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1563168404Spjd 1564185029Spjd if (spa->spa_spares.sav_config == NULL) 1565168404Spjd nspares = 0; 1566168404Spjd else 1567185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1568168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1569168404Spjd 1570185029Spjd spa->spa_spares.sav_count = (int)nspares; 1571185029Spjd spa->spa_spares.sav_vdevs = NULL; 1572168404Spjd 1573168404Spjd if (nspares == 0) 1574168404Spjd return; 1575168404Spjd 1576168404Spjd /* 1577168404Spjd * Construct the array of vdevs, opening them to get status in the 1578168404Spjd * process. For each spare, there is potentially two different vdev_t 1579168404Spjd * structures associated with it: one in the list of spares (used only 1580168404Spjd * for basic validation purposes) and one in the active vdev 1581168404Spjd * configuration (if it's spared in). During this phase we open and 1582168404Spjd * validate each vdev on the spare list. If the vdev also exists in the 1583168404Spjd * active configuration, then we also mark this vdev as an active spare. 
1584168404Spjd */ 1585185029Spjd spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1586185029Spjd KM_SLEEP); 1587185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1588168404Spjd VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1589168404Spjd VDEV_ALLOC_SPARE) == 0); 1590168404Spjd ASSERT(vd != NULL); 1591168404Spjd 1592185029Spjd spa->spa_spares.sav_vdevs[i] = vd; 1593168404Spjd 1594185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1595185029Spjd B_FALSE)) != NULL) { 1596168404Spjd if (!tvd->vdev_isspare) 1597168404Spjd spa_spare_add(tvd); 1598168404Spjd 1599168404Spjd /* 1600168404Spjd * We only mark the spare active if we were successfully 1601168404Spjd * able to load the vdev. Otherwise, importing a pool 1602168404Spjd * with a bad active spare would result in strange 1603168404Spjd * behavior, because multiple pool would think the spare 1604168404Spjd * is actively in use. 1605168404Spjd * 1606168404Spjd * There is a vulnerability here to an equally bizarre 1607168404Spjd * circumstance, where a dead active spare is later 1608168404Spjd * brought back to life (onlined or otherwise). Given 1609168404Spjd * the rarity of this scenario, and the extra complexity 1610168404Spjd * it adds, we ignore the possibility. 1611168404Spjd */ 1612168404Spjd if (!vdev_is_dead(tvd)) 1613168404Spjd spa_spare_activate(tvd); 1614168404Spjd } 1615168404Spjd 1616185029Spjd vd->vdev_top = vd; 1617209962Smm vd->vdev_aux = &spa->spa_spares; 1618185029Spjd 1619168404Spjd if (vdev_open(vd) != 0) 1620168404Spjd continue; 1621168404Spjd 1622185029Spjd if (vdev_validate_aux(vd) == 0) 1623185029Spjd spa_spare_add(vd); 1624168404Spjd } 1625168404Spjd 1626168404Spjd /* 1627168404Spjd * Recompute the stashed list of spares, with status information 1628168404Spjd * this time. 
1629168404Spjd */ 1630185029Spjd VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1631168404Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1632168404Spjd 1633185029Spjd spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1634185029Spjd KM_SLEEP); 1635185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1636185029Spjd spares[i] = vdev_config_generate(spa, 1637219089Spjd spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1638185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1639185029Spjd ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1640185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1641168404Spjd nvlist_free(spares[i]); 1642185029Spjd kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1643168404Spjd} 1644168404Spjd 1645185029Spjd/* 1646185029Spjd * Load (or re-load) the current list of vdevs describing the active l2cache for 1647185029Spjd * this pool. When this is called, we have some form of basic information in 1648185029Spjd * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1649185029Spjd * then re-generate a more complete list including status information. 1650185029Spjd * Devices which are already active have their details maintained, and are 1651185029Spjd * not re-opened. 1652185029Spjd */ 1653332525Smavvoid 1654185029Spjdspa_load_l2cache(spa_t *spa) 1655185029Spjd{ 1656185029Spjd nvlist_t **l2cache; 1657185029Spjd uint_t nl2cache; 1658185029Spjd int i, j, oldnvdevs; 1659219089Spjd uint64_t guid; 1660185029Spjd vdev_t *vd, **oldvdevs, **newvdevs; 1661185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 1662185029Spjd 1663332547Smav#ifndef _KERNEL 1664332547Smav /* 1665332547Smav * zdb opens both the current state of the pool and the 1666332547Smav * checkpointed state (if present), with a different spa_t. 
1667332547Smav * 1668332547Smav * As L2 caches are part of the ARC which is shared among open 1669332547Smav * pools, we skip loading them when we load the checkpointed 1670332547Smav * state of the pool. 1671332547Smav */ 1672332547Smav if (!spa_writeable(spa)) 1673332547Smav return; 1674332547Smav#endif 1675332547Smav 1676185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1677185029Spjd 1678185029Spjd if (sav->sav_config != NULL) { 1679185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1680185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1681185029Spjd newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1682185029Spjd } else { 1683185029Spjd nl2cache = 0; 1684247187Smm newvdevs = NULL; 1685185029Spjd } 1686185029Spjd 1687185029Spjd oldvdevs = sav->sav_vdevs; 1688185029Spjd oldnvdevs = sav->sav_count; 1689185029Spjd sav->sav_vdevs = NULL; 1690185029Spjd sav->sav_count = 0; 1691185029Spjd 1692185029Spjd /* 1693185029Spjd * Process new nvlist of vdevs. 1694185029Spjd */ 1695185029Spjd for (i = 0; i < nl2cache; i++) { 1696185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1697185029Spjd &guid) == 0); 1698185029Spjd 1699185029Spjd newvdevs[i] = NULL; 1700185029Spjd for (j = 0; j < oldnvdevs; j++) { 1701185029Spjd vd = oldvdevs[j]; 1702185029Spjd if (vd != NULL && guid == vd->vdev_guid) { 1703185029Spjd /* 1704185029Spjd * Retain previous vdev for add/remove ops. 
1705185029Spjd */ 1706185029Spjd newvdevs[i] = vd; 1707185029Spjd oldvdevs[j] = NULL; 1708185029Spjd break; 1709185029Spjd } 1710185029Spjd } 1711185029Spjd 1712185029Spjd if (newvdevs[i] == NULL) { 1713185029Spjd /* 1714185029Spjd * Create new vdev 1715185029Spjd */ 1716185029Spjd VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1717185029Spjd VDEV_ALLOC_L2CACHE) == 0); 1718185029Spjd ASSERT(vd != NULL); 1719185029Spjd newvdevs[i] = vd; 1720185029Spjd 1721185029Spjd /* 1722185029Spjd * Commit this vdev as an l2cache device, 1723185029Spjd * even if it fails to open. 1724185029Spjd */ 1725185029Spjd spa_l2cache_add(vd); 1726185029Spjd 1727185029Spjd vd->vdev_top = vd; 1728185029Spjd vd->vdev_aux = sav; 1729185029Spjd 1730185029Spjd spa_l2cache_activate(vd); 1731185029Spjd 1732185029Spjd if (vdev_open(vd) != 0) 1733185029Spjd continue; 1734185029Spjd 1735185029Spjd (void) vdev_validate_aux(vd); 1736185029Spjd 1737219089Spjd if (!vdev_is_dead(vd)) 1738219089Spjd l2arc_add_vdev(spa, vd); 1739185029Spjd } 1740185029Spjd } 1741185029Spjd 1742185029Spjd /* 1743185029Spjd * Purge vdevs that were dropped 1744185029Spjd */ 1745185029Spjd for (i = 0; i < oldnvdevs; i++) { 1746185029Spjd uint64_t pool; 1747185029Spjd 1748185029Spjd vd = oldvdevs[i]; 1749185029Spjd if (vd != NULL) { 1750230514Smm ASSERT(vd->vdev_isl2cache); 1751230514Smm 1752209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1753209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1754185029Spjd l2arc_remove_vdev(vd); 1755230514Smm vdev_clear_stats(vd); 1756230514Smm vdev_free(vd); 1757185029Spjd } 1758185029Spjd } 1759185029Spjd 1760185029Spjd if (oldvdevs) 1761185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1762185029Spjd 1763185029Spjd if (sav->sav_config == NULL) 1764185029Spjd goto out; 1765185029Spjd 1766185029Spjd sav->sav_vdevs = newvdevs; 1767185029Spjd sav->sav_count = (int)nl2cache; 1768185029Spjd 1769185029Spjd /* 1770185029Spjd * Recompute the stashed list of l2cache 
devices, with status 1771185029Spjd * information this time. 1772185029Spjd */ 1773185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1774185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1775185029Spjd 1776185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1777185029Spjd for (i = 0; i < sav->sav_count; i++) 1778185029Spjd l2cache[i] = vdev_config_generate(spa, 1779219089Spjd sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1780185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1781185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1782185029Spjdout: 1783185029Spjd for (i = 0; i < sav->sav_count; i++) 1784185029Spjd nvlist_free(l2cache[i]); 1785185029Spjd if (sav->sav_count) 1786185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1787185029Spjd} 1788185029Spjd 1789168404Spjdstatic int 1790168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1791168404Spjd{ 1792168404Spjd dmu_buf_t *db; 1793168404Spjd char *packed = NULL; 1794168404Spjd size_t nvsize = 0; 1795168404Spjd int error; 1796168404Spjd *value = NULL; 1797168404Spjd 1798262676Sdelphij error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 1799262676Sdelphij if (error != 0) 1800262676Sdelphij return (error); 1801287744Sdelphij 1802168404Spjd nvsize = *(uint64_t *)db->db_data; 1803168404Spjd dmu_buf_rele(db, FTAG); 1804168404Spjd 1805168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1806209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1807209962Smm DMU_READ_PREFETCH); 1808168404Spjd if (error == 0) 1809168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1810168404Spjd kmem_free(packed, nvsize); 1811168404Spjd 1812168404Spjd return (error); 1813168404Spjd} 1814168404Spjd 1815168404Spjd/* 1816332536Smav * Concrete top-level vdevs that are not missing and are not logs. At every 1817332536Smav * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 
1818332536Smav */ 1819332536Smavstatic uint64_t 1820332536Smavspa_healthy_core_tvds(spa_t *spa) 1821332536Smav{ 1822332536Smav vdev_t *rvd = spa->spa_root_vdev; 1823332536Smav uint64_t tvds = 0; 1824332536Smav 1825332536Smav for (uint64_t i = 0; i < rvd->vdev_children; i++) { 1826332536Smav vdev_t *vd = rvd->vdev_child[i]; 1827332536Smav if (vd->vdev_islog) 1828332536Smav continue; 1829332536Smav if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 1830332536Smav tvds++; 1831332536Smav } 1832332536Smav 1833332536Smav return (tvds); 1834332536Smav} 1835332536Smav 1836332536Smav/* 1837185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1838185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 1839185029Spjd */ 1840185029Spjdstatic void 1841185029Spjdspa_check_removed(vdev_t *vd) 1842185029Spjd{ 1843332536Smav for (uint64_t c = 0; c < vd->vdev_children; c++) 1844185029Spjd spa_check_removed(vd->vdev_child[c]); 1845185029Spjd 1846249188Smm if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 1847332525Smav vdev_is_concrete(vd)) { 1848185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1849331397Smav spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 1850185029Spjd } 1851185029Spjd} 1852185029Spjd 1853332536Smavstatic int 1854332536Smavspa_check_for_missing_logs(spa_t *spa) 1855299441Smav{ 1856332536Smav vdev_t *rvd = spa->spa_root_vdev; 1857299441Smav 1858219089Spjd /* 1859219089Spjd * If we're doing a normal import, then build up any additional 1860332536Smav * diagnostic information about missing log devices. 1861219089Spjd * We'll pass this up to the user for further processing. 
1862219089Spjd */ 1863219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1864219089Spjd nvlist_t **child, *nv; 1865219089Spjd uint64_t idx = 0; 1866219089Spjd 1867219089Spjd child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1868219089Spjd KM_SLEEP); 1869219089Spjd VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1870219089Spjd 1871332536Smav for (uint64_t c = 0; c < rvd->vdev_children; c++) { 1872219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1873219089Spjd 1874332536Smav /* 1875332536Smav * We consider a device as missing only if it failed 1876332536Smav * to open (i.e. offline or faulted is not considered 1877332536Smav * as missing). 1878332536Smav */ 1879332536Smav if (tvd->vdev_islog && 1880332536Smav tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 1881332536Smav child[idx++] = vdev_config_generate(spa, tvd, 1882332536Smav B_FALSE, VDEV_CONFIG_MISSING); 1883332536Smav } 1884219089Spjd } 1885219089Spjd 1886332536Smav if (idx > 0) { 1887332536Smav fnvlist_add_nvlist_array(nv, 1888332536Smav ZPOOL_CONFIG_CHILDREN, child, idx); 1889332536Smav fnvlist_add_nvlist(spa->spa_load_info, 1890332536Smav ZPOOL_CONFIG_MISSING_DEVICES, nv); 1891219089Spjd 1892332536Smav for (uint64_t i = 0; i < idx; i++) 1893219089Spjd nvlist_free(child[i]); 1894219089Spjd } 1895219089Spjd nvlist_free(nv); 1896219089Spjd kmem_free(child, rvd->vdev_children * sizeof (char **)); 1897219089Spjd 1898332536Smav if (idx > 0) { 1899332536Smav spa_load_failed(spa, "some log devices are missing"); 1900332549Smav vdev_dbgmsg_print_tree(rvd, 2); 1901332536Smav return (SET_ERROR(ENXIO)); 1902332536Smav } 1903332536Smav } else { 1904332536Smav for (uint64_t c = 0; c < rvd->vdev_children; c++) { 1905332536Smav vdev_t *tvd = rvd->vdev_child[c]; 1906213197Smm 1907332536Smav if (tvd->vdev_islog && 1908332536Smav tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 1909219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 1910332536Smav spa_load_note(spa, "some log devices are " 
1911332536Smav "missing, ZIL is dropped."); 1912332549Smav vdev_dbgmsg_print_tree(rvd, 2); 1913332536Smav break; 1914219089Spjd } 1915219089Spjd } 1916213197Smm } 1917299441Smav 1918332536Smav return (0); 1919213197Smm} 1920213197Smm 1921213197Smm/* 1922185029Spjd * Check for missing log devices 1923185029Spjd */ 1924248571Smmstatic boolean_t 1925185029Spjdspa_check_logs(spa_t *spa) 1926185029Spjd{ 1927248571Smm boolean_t rv = B_FALSE; 1928286686Smav dsl_pool_t *dp = spa_get_dsl(spa); 1929248571Smm 1930185029Spjd switch (spa->spa_log_state) { 1931185029Spjd case SPA_LOG_MISSING: 1932185029Spjd /* need to recheck in case slog has been restored */ 1933185029Spjd case SPA_LOG_UNKNOWN: 1934286686Smav rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 1935286686Smav zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 1936248571Smm if (rv) 1937219089Spjd spa_set_log_state(spa, SPA_LOG_MISSING); 1938185029Spjd break; 1939185029Spjd } 1940248571Smm return (rv); 1941185029Spjd} 1942185029Spjd 1943219089Spjdstatic boolean_t 1944219089Spjdspa_passivate_log(spa_t *spa) 1945219089Spjd{ 1946219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1947219089Spjd boolean_t slog_found = B_FALSE; 1948219089Spjd 1949219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1950219089Spjd 1951219089Spjd if (!spa_has_slogs(spa)) 1952219089Spjd return (B_FALSE); 1953219089Spjd 1954219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1955219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1956219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1957219089Spjd 1958219089Spjd if (tvd->vdev_islog) { 1959219089Spjd metaslab_group_passivate(mg); 1960219089Spjd slog_found = B_TRUE; 1961219089Spjd } 1962219089Spjd } 1963219089Spjd 1964219089Spjd return (slog_found); 1965219089Spjd} 1966219089Spjd 1967219089Spjdstatic void 1968219089Spjdspa_activate_log(spa_t *spa) 1969219089Spjd{ 1970219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1971219089Spjd 1972219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 
1973219089Spjd 1974219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1975219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1976219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1977219089Spjd 1978219089Spjd if (tvd->vdev_islog) 1979219089Spjd metaslab_group_activate(mg); 1980219089Spjd } 1981219089Spjd} 1982219089Spjd 1983219089Spjdint 1984332525Smavspa_reset_logs(spa_t *spa) 1985219089Spjd{ 1986248571Smm int error; 1987219089Spjd 1988332525Smav error = dmu_objset_find(spa_name(spa), zil_reset, 1989248571Smm NULL, DS_FIND_CHILDREN); 1990248571Smm if (error == 0) { 1991219089Spjd /* 1992219089Spjd * We successfully offlined the log device, sync out the 1993219089Spjd * current txg so that the "stubby" block can be removed 1994219089Spjd * by zil_sync(). 1995219089Spjd */ 1996219089Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1997219089Spjd } 1998219089Spjd return (error); 1999219089Spjd} 2000219089Spjd 2001219089Spjdstatic void 2002219089Spjdspa_aux_check_removed(spa_aux_vdev_t *sav) 2003219089Spjd{ 2004219089Spjd int i; 2005219089Spjd 2006219089Spjd for (i = 0; i < sav->sav_count; i++) 2007219089Spjd spa_check_removed(sav->sav_vdevs[i]); 2008219089Spjd} 2009219089Spjd 2010219089Spjdvoid 2011219089Spjdspa_claim_notify(zio_t *zio) 2012219089Spjd{ 2013219089Spjd spa_t *spa = zio->io_spa; 2014219089Spjd 2015219089Spjd if (zio->io_error) 2016219089Spjd return; 2017219089Spjd 2018219089Spjd mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2019219089Spjd if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 2020219089Spjd spa->spa_claim_max_txg = zio->io_bp->blk_birth; 2021219089Spjd mutex_exit(&spa->spa_props_lock); 2022219089Spjd} 2023219089Spjd 2024219089Spjdtypedef struct spa_load_error { 2025219089Spjd uint64_t sle_meta_count; 2026219089Spjd uint64_t sle_data_count; 2027219089Spjd} spa_load_error_t; 2028219089Spjd 2029219089Spjdstatic void 2030219089Spjdspa_load_verify_done(zio_t *zio) 2031219089Spjd{ 2032219089Spjd blkptr_t *bp = zio->io_bp; 2033219089Spjd 
spa_load_error_t *sle = zio->io_private; 2034219089Spjd dmu_object_type_t type = BP_GET_TYPE(bp); 2035219089Spjd int error = zio->io_error; 2036268720Sdelphij spa_t *spa = zio->io_spa; 2037219089Spjd 2038321610Smav abd_free(zio->io_abd); 2039219089Spjd if (error) { 2040236884Smm if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2041219089Spjd type != DMU_OT_INTENT_LOG) 2042270247Sdelphij atomic_inc_64(&sle->sle_meta_count); 2043219089Spjd else 2044270247Sdelphij atomic_inc_64(&sle->sle_data_count); 2045219089Spjd } 2046268720Sdelphij 2047268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 2048339034Ssef spa->spa_load_verify_ios--; 2049268720Sdelphij cv_broadcast(&spa->spa_scrub_io_cv); 2050268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 2051219089Spjd} 2052219089Spjd 2053268720Sdelphij/* 2054268720Sdelphij * Maximum number of concurrent scrub i/os to create while verifying 2055268720Sdelphij * a pool while importing it. 2056268720Sdelphij */ 2057268720Sdelphijint spa_load_verify_maxinflight = 10000; 2058268720Sdelphijboolean_t spa_load_verify_metadata = B_TRUE; 2059268720Sdelphijboolean_t spa_load_verify_data = B_TRUE; 2060268720Sdelphij 2061268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, 2062268720Sdelphij &spa_load_verify_maxinflight, 0, 2063268720Sdelphij "Maximum number of concurrent scrub I/Os to create while verifying a " 2064268720Sdelphij "pool while importing it"); 2065268720Sdelphij 2066268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, 2067268720Sdelphij &spa_load_verify_metadata, 0, 2068268720Sdelphij "Check metadata on import?"); 2069268720Sdelphij 2070268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, 2071268720Sdelphij &spa_load_verify_data, 0, 2072268720Sdelphij "Check user data on import?"); 2073268720Sdelphij 2074219089Spjd/*ARGSUSED*/ 2075219089Spjdstatic int 2076219089Spjdspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t 
*bp, 2077268123Sdelphij const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2078219089Spjd{ 2079286705Smav if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 2080268720Sdelphij return (0); 2081268720Sdelphij /* 2082268720Sdelphij * Note: normally this routine will not be called if 2083268720Sdelphij * spa_load_verify_metadata is not set. However, it may be useful 2084268720Sdelphij * to manually set the flag after the traversal has begun. 2085268720Sdelphij */ 2086268720Sdelphij if (!spa_load_verify_metadata) 2087268720Sdelphij return (0); 2088321610Smav if (!BP_IS_METADATA(bp) && !spa_load_verify_data) 2089268720Sdelphij return (0); 2090219089Spjd 2091268720Sdelphij zio_t *rio = arg; 2092268720Sdelphij size_t size = BP_GET_PSIZE(bp); 2093268720Sdelphij 2094268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 2095339034Ssef while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight) 2096268720Sdelphij cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2097339034Ssef spa->spa_load_verify_ios++; 2098268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 2099268720Sdelphij 2100321610Smav zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2101268720Sdelphij spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2102268720Sdelphij ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2103268720Sdelphij ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2104219089Spjd return (0); 2105219089Spjd} 2106219089Spjd 2107307045Smav/* ARGSUSED */ 2108307045Smavint 2109307045Smavverify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2110307045Smav{ 2111307108Smav if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2112307045Smav return (SET_ERROR(ENAMETOOLONG)); 2113307045Smav 2114307045Smav return (0); 2115307045Smav} 2116307045Smav 2117219089Spjdstatic int 2118219089Spjdspa_load_verify(spa_t *spa) 2119219089Spjd{ 2120219089Spjd zio_t *rio; 2121219089Spjd spa_load_error_t sle = { 0 }; 2122332550Smav zpool_load_policy_t policy; 2123219089Spjd 
boolean_t verify_ok = B_FALSE; 2124268720Sdelphij int error = 0; 2125219089Spjd 2126332550Smav zpool_get_load_policy(spa->spa_config, &policy); 2127219089Spjd 2128332550Smav if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) 2129219089Spjd return (0); 2130219089Spjd 2131307045Smav dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2132307045Smav error = dmu_objset_find_dp(spa->spa_dsl_pool, 2133307045Smav spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2134307045Smav DS_FIND_CHILDREN); 2135307045Smav dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2136307045Smav if (error != 0) 2137307045Smav return (error); 2138307045Smav 2139219089Spjd rio = zio_root(spa, NULL, &sle, 2140219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2141219089Spjd 2142268720Sdelphij if (spa_load_verify_metadata) { 2143332530Smav if (spa->spa_extreme_rewind) { 2144332530Smav spa_load_note(spa, "performing a complete scan of the " 2145332530Smav "pool since extreme rewind is on. This may take " 2146332530Smav "a very long time.\n (spa_load_verify_data=%u, " 2147332530Smav "spa_load_verify_metadata=%u)", 2148332530Smav spa_load_verify_data, spa_load_verify_metadata); 2149332530Smav } 2150268720Sdelphij error = traverse_pool(spa, spa->spa_verify_min_txg, 2151268720Sdelphij TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, 2152268720Sdelphij spa_load_verify_cb, rio); 2153268720Sdelphij } 2154219089Spjd 2155219089Spjd (void) zio_wait(rio); 2156219089Spjd 2157219089Spjd spa->spa_load_meta_errors = sle.sle_meta_count; 2158219089Spjd spa->spa_load_data_errors = sle.sle_data_count; 2159219089Spjd 2160332531Smav if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2161332531Smav spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2162332531Smav "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2163332531Smav (u_longlong_t)sle.sle_data_count); 2164332531Smav } 2165332531Smav 2166332531Smav if (spa_load_verify_dryrun || 2167332550Smav (!error && sle.sle_meta_count <= 
policy.zlp_maxmeta && 2168332550Smav sle.sle_data_count <= policy.zlp_maxdata)) { 2169219089Spjd int64_t loss = 0; 2170219089Spjd 2171219089Spjd verify_ok = B_TRUE; 2172219089Spjd spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2173219089Spjd spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2174219089Spjd 2175219089Spjd loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2176219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 2177219089Spjd ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 2178219089Spjd VERIFY(nvlist_add_int64(spa->spa_load_info, 2179219089Spjd ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 2180219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 2181219089Spjd ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 2182219089Spjd } else { 2183219089Spjd spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2184219089Spjd } 2185219089Spjd 2186332531Smav if (spa_load_verify_dryrun) 2187332531Smav return (0); 2188332531Smav 2189219089Spjd if (error) { 2190219089Spjd if (error != ENXIO && error != EIO) 2191249195Smm error = SET_ERROR(EIO); 2192219089Spjd return (error); 2193219089Spjd } 2194219089Spjd 2195219089Spjd return (verify_ok ? 0 : EIO); 2196219089Spjd} 2197219089Spjd 2198185029Spjd/* 2199219089Spjd * Find a value in the pool props object. 2200168404Spjd */ 2201219089Spjdstatic void 2202219089Spjdspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2203219089Spjd{ 2204219089Spjd (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2205219089Spjd zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2206219089Spjd} 2207219089Spjd 2208219089Spjd/* 2209219089Spjd * Find a value in the pool directory object. 
2210219089Spjd */ 2211168404Spjdstatic int 2212332530Smavspa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2213168404Spjd{ 2214332530Smav int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2215332530Smav name, sizeof (uint64_t), 1, val); 2216332530Smav 2217332530Smav if (error != 0 && (error != ENOENT || log_enoent)) { 2218332530Smav spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2219332530Smav "[error=%d]", name, error); 2220332530Smav } 2221332530Smav 2222332530Smav return (error); 2223219089Spjd} 2224168404Spjd 2225219089Spjdstatic int 2226219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2227219089Spjd{ 2228219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2229332525Smav return (SET_ERROR(err)); 2230219089Spjd} 2231219089Spjd 2232332537Smavstatic void 2233332537Smavspa_spawn_aux_threads(spa_t *spa) 2234332537Smav{ 2235332537Smav ASSERT(spa_writeable(spa)); 2236332537Smav 2237332537Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2238332537Smav 2239332537Smav spa_start_indirect_condensing_thread(spa); 2240332547Smav 2241332547Smav ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 2242332547Smav spa->spa_checkpoint_discard_zthr = 2243332547Smav zthr_create(spa_checkpoint_discard_thread_check, 2244332547Smav spa_checkpoint_discard_thread, spa); 2245332537Smav} 2246332537Smav 2247219089Spjd/* 2248219089Spjd * Fix up config after a partly-completed split. This is done with the 2249219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2250219089Spjd * pool have that entry in their config, but only the splitting one contains 2251219089Spjd * a list of all the guids of the vdevs that are being split off. 2252219089Spjd * 2253219089Spjd * This function determines what to do with that list: either rejoin 2254219089Spjd * all the disks to the pool, or complete the splitting process. 
To attempt 2255219089Spjd * the rejoin, each disk that is offlined is marked online again, and 2256219089Spjd * we do a reopen() call. If the vdev label for every disk that was 2257219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2258219089Spjd * then we call vdev_split() on each disk, and complete the split. 2259219089Spjd * 2260219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 2261219089Spjd * the original pool. 2262219089Spjd */ 2263219089Spjdstatic void 2264219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 2265219089Spjd{ 2266219089Spjd uint_t extracted; 2267219089Spjd uint64_t *glist; 2268219089Spjd uint_t i, gcount; 2269219089Spjd nvlist_t *nvl; 2270219089Spjd vdev_t **vd; 2271219089Spjd boolean_t attempt_reopen; 2272219089Spjd 2273219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2274219089Spjd return; 2275219089Spjd 2276219089Spjd /* check that the config is complete */ 2277219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2278219089Spjd &glist, &gcount) != 0) 2279219089Spjd return; 2280219089Spjd 2281219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2282219089Spjd 2283219089Spjd /* attempt to online all the vdevs & validate */ 2284219089Spjd attempt_reopen = B_TRUE; 2285219089Spjd for (i = 0; i < gcount; i++) { 2286219089Spjd if (glist[i] == 0) /* vdev is hole */ 2287219089Spjd continue; 2288219089Spjd 2289219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2290219089Spjd if (vd[i] == NULL) { 2291219089Spjd /* 2292219089Spjd * Don't bother attempting to reopen the disks; 2293219089Spjd * just do the split. 
2294219089Spjd */ 2295219089Spjd attempt_reopen = B_FALSE; 2296219089Spjd } else { 2297219089Spjd /* attempt to re-online it */ 2298219089Spjd vd[i]->vdev_offline = B_FALSE; 2299219089Spjd } 2300219089Spjd } 2301219089Spjd 2302219089Spjd if (attempt_reopen) { 2303219089Spjd vdev_reopen(spa->spa_root_vdev); 2304219089Spjd 2305219089Spjd /* check each device to see what state it's in */ 2306219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 2307219089Spjd if (vd[i] != NULL && 2308219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2309219089Spjd break; 2310219089Spjd ++extracted; 2311219089Spjd } 2312219089Spjd } 2313219089Spjd 2314209962Smm /* 2315219089Spjd * If every disk has been moved to the new pool, or if we never 2316219089Spjd * even attempted to look at them, then we split them off for 2317219089Spjd * good. 2318209962Smm */ 2319219089Spjd if (!attempt_reopen || gcount == extracted) { 2320219089Spjd for (i = 0; i < gcount; i++) 2321219089Spjd if (vd[i] != NULL) 2322219089Spjd vdev_split(vd[i]); 2323219089Spjd vdev_reopen(spa->spa_root_vdev); 2324219089Spjd } 2325209962Smm 2326219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 2327219089Spjd} 2328185029Spjd 2329219089Spjdstatic int 2330332536Smavspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 2331219089Spjd{ 2332219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 2333219089Spjd int error; 2334168404Spjd 2335332536Smav spa->spa_load_state = state; 2336168404Spjd 2337332536Smav gethrestime(&spa->spa_loaded_ts); 2338332547Smav error = spa_load_impl(spa, type, &ereport); 2339228103Smm 2340168404Spjd /* 2341286575Smav * Don't count references from objsets that are already closed 2342286575Smav * and are making their way through the eviction process. 
2343286575Smav */ 2344286575Smav spa_evicting_os_wait(spa); 2345219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 2346219089Spjd if (error) { 2347219089Spjd if (error != EEXIST) { 2348219089Spjd spa->spa_loaded_ts.tv_sec = 0; 2349219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 2350219089Spjd } 2351219089Spjd if (error != EBADF) { 2352219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2353219089Spjd } 2354219089Spjd } 2355219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2356219089Spjd spa->spa_ena = 0; 2357168404Spjd 2358219089Spjd return (error); 2359219089Spjd} 2360219089Spjd 2361219089Spjd/* 2362299441Smav * Count the number of per-vdev ZAPs associated with all of the vdevs in the 2363299441Smav * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 2364299441Smav * spa's per-vdev ZAP list. 2365299441Smav */ 2366299441Smavstatic uint64_t 2367299441Smavvdev_count_verify_zaps(vdev_t *vd) 2368299441Smav{ 2369299441Smav spa_t *spa = vd->vdev_spa; 2370299441Smav uint64_t total = 0; 2371299441Smav if (vd->vdev_top_zap != 0) { 2372299441Smav total++; 2373299441Smav ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2374299441Smav spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 2375299441Smav } 2376299441Smav if (vd->vdev_leaf_zap != 0) { 2377299441Smav total++; 2378299441Smav ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2379299441Smav spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 2380299441Smav } 2381299441Smav 2382299441Smav for (uint64_t i = 0; i < vd->vdev_children; i++) { 2383299441Smav total += vdev_count_verify_zaps(vd->vdev_child[i]); 2384299441Smav } 2385299441Smav 2386299441Smav return (total); 2387299441Smav} 2388299441Smav 2389219089Spjdstatic int 2390332536Smavspa_verify_host(spa_t *spa, nvlist_t *mos_config) 2391219089Spjd{ 2392332536Smav uint64_t hostid; 2393332536Smav char *hostname; 2394332536Smav uint64_t myhostid = 0; 2395332536Smav 2396332536Smav if (!spa_is_root(spa) && 
nvlist_lookup_uint64(mos_config, 2397332536Smav ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2398332536Smav hostname = fnvlist_lookup_string(mos_config, 2399332536Smav ZPOOL_CONFIG_HOSTNAME); 2400332536Smav 2401332536Smav myhostid = zone_get_hostid(NULL); 2402332536Smav 2403332536Smav if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 2404332536Smav cmn_err(CE_WARN, "pool '%s' could not be " 2405332536Smav "loaded as it was last accessed by " 2406332536Smav "another system (host: %s hostid: 0x%llx). " 2407332536Smav "See: http://illumos.org/msg/ZFS-8000-EY", 2408332536Smav spa_name(spa), hostname, (u_longlong_t)hostid); 2409332536Smav spa_load_failed(spa, "hostid verification failed: pool " 2410332536Smav "last accessed by host: %s (hostid: 0x%llx)", 2411332536Smav hostname, (u_longlong_t)hostid); 2412332536Smav return (SET_ERROR(EBADF)); 2413332536Smav } 2414332536Smav } 2415332536Smav 2416332536Smav return (0); 2417332536Smav} 2418332536Smav 2419332536Smavstatic int 2420332536Smavspa_ld_parse_config(spa_t *spa, spa_import_type_t type) 2421332536Smav{ 2422219089Spjd int error = 0; 2423332536Smav nvlist_t *nvtree, *nvl, *config = spa->spa_config; 2424332529Smav int parse; 2425219089Spjd vdev_t *rvd; 2426332536Smav uint64_t pool_guid; 2427332536Smav char *comment; 2428219089Spjd 2429332536Smav /* 2430332536Smav * Versioning wasn't explicitly added to the label until later, so if 2431332536Smav * it's not present treat it as the initial version. 
2432332536Smav */ 2433332536Smav if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2434332536Smav &spa->spa_ubsync.ub_version) != 0) 2435332536Smav spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2436332536Smav 2437332536Smav if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 2438332536Smav spa_load_failed(spa, "invalid config provided: '%s' missing", 2439332536Smav ZPOOL_CONFIG_POOL_GUID); 2440332536Smav return (SET_ERROR(EINVAL)); 2441332536Smav } 2442332536Smav 2443332547Smav /* 2444332547Smav * If we are doing an import, ensure that the pool is not already 2445332547Smav * imported by checking if its pool guid already exists in the 2446332547Smav * spa namespace. 2447332547Smav * 2448332547Smav * The only case that we allow an already imported pool to be 2449332547Smav * imported again, is when the pool is checkpointed and we want to 2450332547Smav * look at its checkpointed state from userland tools like zdb. 2451332547Smav */ 2452332547Smav#ifdef _KERNEL 2453332547Smav if ((spa->spa_load_state == SPA_LOAD_IMPORT || 2454332547Smav spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 2455332547Smav spa_guid_exists(pool_guid, 0)) { 2456332547Smav#else 2457332547Smav if ((spa->spa_load_state == SPA_LOAD_IMPORT || 2458332547Smav spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 2459332547Smav spa_guid_exists(pool_guid, 0) && 2460332547Smav !spa_importing_readonly_checkpoint(spa)) { 2461332547Smav#endif 2462332536Smav spa_load_failed(spa, "a pool with guid %llu is already open", 2463332536Smav (u_longlong_t)pool_guid); 2464332536Smav return (SET_ERROR(EEXIST)); 2465332536Smav } 2466332536Smav 2467332536Smav spa->spa_config_guid = pool_guid; 2468332536Smav 2469332536Smav nvlist_free(spa->spa_load_info); 2470332536Smav spa->spa_load_info = fnvlist_alloc(); 2471332536Smav 2472332536Smav ASSERT(spa->spa_comment == NULL); 2473332536Smav if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2474332536Smav spa->spa_comment = 
spa_strdup(comment); 2475332536Smav 2476332536Smav (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2477332536Smav &spa->spa_config_txg); 2478332536Smav 2479332536Smav if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 2480332536Smav spa->spa_config_splitting = fnvlist_dup(nvl); 2481332536Smav 2482332530Smav if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 2483332530Smav spa_load_failed(spa, "invalid config provided: '%s' missing", 2484332530Smav ZPOOL_CONFIG_VDEV_TREE); 2485249195Smm return (SET_ERROR(EINVAL)); 2486332530Smav } 2487219089Spjd 2488219089Spjd /* 2489209962Smm * Create "The Godfather" zio to hold all async IOs 2490209962Smm */ 2491272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2492272598Sdelphij KM_SLEEP); 2493272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 2494272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2495272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2496272598Sdelphij ZIO_FLAG_GODFATHER); 2497272598Sdelphij } 2498209962Smm 2499209962Smm /* 2500168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2501168404Spjd * value that will be returned by spa_version() since parsing the 2502168404Spjd * configuration requires knowing the version number. 2503168404Spjd */ 2504185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2505332536Smav parse = (type == SPA_IMPORT_EXISTING ? 
2506332536Smav VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2507332529Smav error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 2508185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2509168404Spjd 2510332530Smav if (error != 0) { 2511332530Smav spa_load_failed(spa, "unable to parse config [error=%d]", 2512332530Smav error); 2513219089Spjd return (error); 2514332530Smav } 2515168404Spjd 2516168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2517284304Savg ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2518284304Savg ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2519168404Spjd 2520219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2521219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2522219089Spjd } 2523219089Spjd 2524332529Smav return (0); 2525332529Smav} 2526332529Smav 2527332536Smav/* 2528332536Smav * Recursively open all vdevs in the vdev tree. This function is called twice: 2529332536Smav * first with the untrusted config, then with the trusted config. 2530332536Smav */ 2531332529Smavstatic int 2532332529Smavspa_ld_open_vdevs(spa_t *spa) 2533332529Smav{ 2534332529Smav int error = 0; 2535332529Smav 2536332536Smav /* 2537332536Smav * spa_missing_tvds_allowed defines how many top-level vdevs can be 2538332536Smav * missing/unopenable for the root vdev to be still considered openable. 
2539332536Smav */ 2540332536Smav if (spa->spa_trust_config) { 2541332536Smav spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 2542332536Smav } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 2543332536Smav spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 2544332536Smav } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 2545332536Smav spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 2546332536Smav } else { 2547332536Smav spa->spa_missing_tvds_allowed = 0; 2548332536Smav } 2549332536Smav 2550332536Smav spa->spa_missing_tvds_allowed = 2551332536Smav MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 2552332536Smav 2553185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2554332529Smav error = vdev_open(spa->spa_root_vdev); 2555185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2556332536Smav 2557332536Smav if (spa->spa_missing_tvds != 0) { 2558332536Smav spa_load_note(spa, "vdev tree has %lld missing top-level " 2559332536Smav "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 2560332536Smav if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) { 2561332536Smav /* 2562332536Smav * Although theoretically we could allow users to open 2563332536Smav * incomplete pools in RW mode, we'd need to add a lot 2564332536Smav * of extra logic (e.g. adjust pool space to account 2565332536Smav * for missing vdevs). 2566332536Smav * This limitation also prevents users from accidentally 2567332536Smav * opening the pool in RW mode during data recovery and 2568332536Smav * damaging it further. 
2569332536Smav */ 2570332536Smav spa_load_note(spa, "pools with missing top-level " 2571332536Smav "vdevs can only be opened in read-only mode."); 2572332536Smav error = SET_ERROR(ENXIO); 2573332536Smav } else { 2574332536Smav spa_load_note(spa, "current settings allow for maximum " 2575332536Smav "%lld missing top-level vdevs at this stage.", 2576332536Smav (u_longlong_t)spa->spa_missing_tvds_allowed); 2577332536Smav } 2578332536Smav } 2579332530Smav if (error != 0) { 2580332530Smav spa_load_failed(spa, "unable to open vdev tree [error=%d]", 2581332530Smav error); 2582332530Smav } 2583332536Smav if (spa->spa_missing_tvds != 0 || error != 0) 2584332536Smav vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 2585168404Spjd 2586332529Smav return (error); 2587332529Smav} 2588332529Smav 2589332536Smav/* 2590332536Smav * We need to validate the vdev labels against the configuration that 2591332536Smav * we have in hand. This function is called twice: first with an untrusted 2592332536Smav * config, then with a trusted config. The validation is more strict when the 2593332536Smav * config is trusted. 
2594332536Smav */ 2595332529Smavstatic int 2596332536Smavspa_ld_validate_vdevs(spa_t *spa) 2597332529Smav{ 2598332529Smav int error = 0; 2599332529Smav vdev_t *rvd = spa->spa_root_vdev; 2600332529Smav 2601332536Smav spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2602332536Smav error = vdev_validate(rvd); 2603332536Smav spa_config_exit(spa, SCL_ALL, FTAG); 2604168404Spjd 2605332536Smav if (error != 0) { 2606332536Smav spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 2607332536Smav return (error); 2608332536Smav } 2609219089Spjd 2610332536Smav if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 2611332536Smav spa_load_failed(spa, "cannot open vdev tree after invalidating " 2612332536Smav "some vdevs"); 2613332536Smav vdev_dbgmsg_print_tree(rvd, 2); 2614332536Smav return (SET_ERROR(ENXIO)); 2615168404Spjd } 2616168404Spjd 2617332529Smav return (0); 2618332529Smav} 2619332529Smav 2620332547Smavstatic void 2621332547Smavspa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 2622332547Smav{ 2623332547Smav spa->spa_state = POOL_STATE_ACTIVE; 2624332547Smav spa->spa_ubsync = spa->spa_uberblock; 2625332547Smav spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2626332547Smav TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2627332547Smav spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2628332547Smav spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2629332547Smav spa->spa_claim_max_txg = spa->spa_first_txg; 2630332547Smav spa->spa_prev_software_version = ub->ub_software_version; 2631332547Smav} 2632332547Smav 2633332529Smavstatic int 2634332536Smavspa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 2635332529Smav{ 2636332529Smav vdev_t *rvd = spa->spa_root_vdev; 2637332529Smav nvlist_t *label; 2638332529Smav uberblock_t *ub = &spa->spa_uberblock; 2639332529Smav 2640168404Spjd /* 2641332547Smav * If we are opening the checkpointed state of the pool by 2642332547Smav * rewinding to it, at this point we will have written the 2643332547Smav * checkpointed uberblock to the vdev labels, so searching 2644332547Smav * the labels will find the right uberblock. However, if 2645332547Smav * we are opening the checkpointed state read-only, we have 2646332547Smav * not modified the labels. Therefore, we must ignore the 2647332547Smav * labels and continue using the spa_uberblock that was set 2648332547Smav * by spa_ld_checkpoint_rewind. 2649332547Smav * 2650332547Smav * Note that it would be fine to ignore the labels when 2651332547Smav * rewinding (opening writeable) as well. However, if we 2652332547Smav * crash just after writing the labels, we will end up 2653332547Smav * searching the labels. Doing so in the common case means 2654332547Smav * that this code path gets exercised normally, rather than 2655332547Smav * just in the edge case. 2656332547Smav */ 2657332547Smav if (ub->ub_checkpoint_txg != 0 && 2658332547Smav spa_importing_readonly_checkpoint(spa)) { 2659332547Smav spa_ld_select_uberblock_done(spa, ub); 2660332547Smav return (0); 2661332547Smav } 2662332547Smav 2663332547Smav /* 2664168404Spjd * Find the best uberblock. 2665168404Spjd */ 2666236884Smm vdev_uberblock_load(rvd, ub, &label); 2667168404Spjd 2668168404Spjd /* 2669168404Spjd * If we weren't able to find a single valid uberblock, return failure. 
2670168404Spjd */ 2671236884Smm if (ub->ub_txg == 0) { 2672236884Smm nvlist_free(label); 2673332530Smav spa_load_failed(spa, "no valid uberblock found"); 2674219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2675236884Smm } 2676168404Spjd 2677332530Smav spa_load_note(spa, "using uberblock with txg=%llu", 2678332530Smav (u_longlong_t)ub->ub_txg); 2679332530Smav 2680168404Spjd /* 2681236884Smm * If the pool has an unsupported version we can't open it. 2682168404Spjd */ 2683236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2684236884Smm nvlist_free(label); 2685332530Smav spa_load_failed(spa, "version %llu is not supported", 2686332530Smav (u_longlong_t)ub->ub_version); 2687219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2688236884Smm } 2689168404Spjd 2690236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2691236884Smm nvlist_t *features; 2692236884Smm 2693236884Smm /* 2694236884Smm * If we weren't able to find what's necessary for reading the 2695236884Smm * MOS in the label, return failure. 2696236884Smm */ 2697332530Smav if (label == NULL) { 2698332530Smav spa_load_failed(spa, "label config unavailable"); 2699332530Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2700332530Smav ENXIO)); 2701332530Smav } 2702332530Smav 2703332530Smav if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 2704332530Smav &features) != 0) { 2705236884Smm nvlist_free(label); 2706332530Smav spa_load_failed(spa, "invalid label: '%s' missing", 2707332530Smav ZPOOL_CONFIG_FEATURES_FOR_READ); 2708236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2709236884Smm ENXIO)); 2710236884Smm } 2711236884Smm 2712236884Smm /* 2713236884Smm * Update our in-core representation with the definitive values 2714236884Smm * from the label. 
2715236884Smm */ 2716236884Smm nvlist_free(spa->spa_label_features); 2717236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2718236884Smm } 2719236884Smm 2720236884Smm nvlist_free(label); 2721236884Smm 2722168404Spjd /* 2723236884Smm * Look through entries in the label nvlist's features_for_read. If 2724236884Smm * there is a feature listed there which we don't understand then we 2725236884Smm * cannot open a pool. 2726236884Smm */ 2727236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2728236884Smm nvlist_t *unsup_feat; 2729236884Smm 2730236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2731236884Smm 0); 2732236884Smm 2733236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2734236884Smm NULL); nvp != NULL; 2735236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2736236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2737236884Smm VERIFY(nvlist_add_string(unsup_feat, 2738236884Smm nvpair_name(nvp), "") == 0); 2739236884Smm } 2740236884Smm } 2741236884Smm 2742236884Smm if (!nvlist_empty(unsup_feat)) { 2743236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2744236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2745236884Smm nvlist_free(unsup_feat); 2746332530Smav spa_load_failed(spa, "some features are unsupported"); 2747236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2748236884Smm ENOTSUP)); 2749236884Smm } 2750236884Smm 2751236884Smm nvlist_free(unsup_feat); 2752236884Smm } 2753236884Smm 2754219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2755219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2756332536Smav spa_try_repair(spa, spa->spa_config); 2757219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2758219089Spjd nvlist_free(spa->spa_config_splitting); 2759219089Spjd spa->spa_config_splitting = NULL; 2760168404Spjd } 2761168404Spjd 2762168404Spjd /* 2763168404Spjd * Initialize internal SPA structures. 
2764168404Spjd */ 2765332547Smav spa_ld_select_uberblock_done(spa, ub); 2766219089Spjd 2767332529Smav return (0); 2768332529Smav} 2769332525Smav 2770332529Smavstatic int 2771332529Smavspa_ld_open_rootbp(spa_t *spa) 2772332529Smav{ 2773332529Smav int error = 0; 2774332529Smav vdev_t *rvd = spa->spa_root_vdev; 2775332529Smav 2776236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2777332530Smav if (error != 0) { 2778332530Smav spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 2779332530Smav "[error=%d]", error); 2780219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2781332530Smav } 2782168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2783168404Spjd 2784332529Smav return (0); 2785332529Smav} 2786332529Smav 2787332529Smavstatic int 2788332547Smavspa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 2789332536Smav boolean_t reloading) 2790332529Smav{ 2791332536Smav vdev_t *mrvd, *rvd = spa->spa_root_vdev; 2792332536Smav nvlist_t *nv, *mos_config, *policy; 2793332536Smav int error = 0, copy_error; 2794332536Smav uint64_t healthy_tvds, healthy_tvds_mos; 2795332536Smav uint64_t mos_config_txg; 2796332529Smav 2797332530Smav if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 2798332530Smav != 0) 2799219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2800168404Spjd 2801332525Smav /* 2802332536Smav * If we're assembling a pool from a split, the config provided is 2803332536Smav * already trusted so there is nothing to do. 
2804332525Smav */ 2805332536Smav if (type == SPA_IMPORT_ASSEMBLE) 2806332536Smav return (0); 2807332525Smav 2808332536Smav healthy_tvds = spa_healthy_core_tvds(spa); 2809332536Smav 2810332536Smav if (load_nvlist(spa, spa->spa_config_object, &mos_config) 2811332536Smav != 0) { 2812332536Smav spa_load_failed(spa, "unable to retrieve MOS config"); 2813332536Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2814332536Smav } 2815332536Smav 2816332536Smav /* 2817332536Smav * If we are doing an open, pool owner wasn't verified yet, thus do 2818332536Smav * the verification here. 2819332536Smav */ 2820332536Smav if (spa->spa_load_state == SPA_LOAD_OPEN) { 2821332536Smav error = spa_verify_host(spa, mos_config); 2822332536Smav if (error != 0) { 2823332525Smav nvlist_free(mos_config); 2824332536Smav return (error); 2825332525Smav } 2826332536Smav } 2827332525Smav 2828332536Smav nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 2829332536Smav 2830332536Smav spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2831332536Smav 2832332536Smav /* 2833332536Smav * Build a new vdev tree from the trusted config 2834332536Smav */ 2835332536Smav VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 2836332536Smav 2837332536Smav /* 2838332536Smav * Vdev paths in the MOS may be obsolete. If the untrusted config was 2839332536Smav * obtained by scanning /dev/dsk, then it will have the right vdev 2840332536Smav * paths. We update the trusted MOS config with this information. 2841332536Smav * We first try to copy the paths with vdev_copy_path_strict, which 2842332536Smav * succeeds only when both configs have exactly the same vdev tree. 2843332536Smav * If that fails, we fall back to a more flexible method that has a 2844332536Smav * best effort policy. 
2845332536Smav */ 2846332536Smav copy_error = vdev_copy_path_strict(rvd, mrvd); 2847332536Smav if (copy_error != 0 || spa_load_print_vdev_tree) { 2848332536Smav spa_load_note(spa, "provided vdev tree:"); 2849332536Smav vdev_dbgmsg_print_tree(rvd, 2); 2850332536Smav spa_load_note(spa, "MOS vdev tree:"); 2851332536Smav vdev_dbgmsg_print_tree(mrvd, 2); 2852332536Smav } 2853332536Smav if (copy_error != 0) { 2854332536Smav spa_load_note(spa, "vdev_copy_path_strict failed, falling " 2855332536Smav "back to vdev_copy_path_relaxed"); 2856332536Smav vdev_copy_path_relaxed(rvd, mrvd); 2857332536Smav } 2858332536Smav 2859332536Smav vdev_close(rvd); 2860332536Smav vdev_free(rvd); 2861332536Smav spa->spa_root_vdev = mrvd; 2862332536Smav rvd = mrvd; 2863332536Smav spa_config_exit(spa, SCL_ALL, FTAG); 2864332536Smav 2865332536Smav /* 2866332536Smav * We will use spa_config if we decide to reload the spa or if spa_load 2867332536Smav * fails and we rewind. We must thus regenerate the config using the 2868332550Smav * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 2869332550Smav * pass settings on how to load the pool and is not stored in the MOS. 2870332550Smav * We copy it over to our new, trusted config. 
2871332536Smav */ 2872332536Smav mos_config_txg = fnvlist_lookup_uint64(mos_config, 2873332536Smav ZPOOL_CONFIG_POOL_TXG); 2874332536Smav nvlist_free(mos_config); 2875332536Smav mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 2876332550Smav if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 2877332536Smav &policy) == 0) 2878332550Smav fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 2879332536Smav spa_config_set(spa, mos_config); 2880332536Smav spa->spa_config_source = SPA_CONFIG_SRC_MOS; 2881332536Smav 2882332536Smav /* 2883332536Smav * Now that we got the config from the MOS, we should be more strict 2884332536Smav * in checking blkptrs and can make assumptions about the consistency 2885332536Smav * of the vdev tree. spa_trust_config must be set to true before opening 2886332536Smav * vdevs in order for them to be writeable. 2887332536Smav */ 2888332536Smav spa->spa_trust_config = B_TRUE; 2889332536Smav 2890332536Smav /* 2891332536Smav * Open and validate the new vdev tree 2892332536Smav */ 2893332536Smav error = spa_ld_open_vdevs(spa); 2894332536Smav if (error != 0) 2895332536Smav return (error); 2896332536Smav 2897332536Smav error = spa_ld_validate_vdevs(spa); 2898332536Smav if (error != 0) 2899332536Smav return (error); 2900332536Smav 2901332536Smav if (copy_error != 0 || spa_load_print_vdev_tree) { 2902332536Smav spa_load_note(spa, "final vdev tree:"); 2903332536Smav vdev_dbgmsg_print_tree(rvd, 2); 2904332536Smav } 2905332536Smav 2906332536Smav if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 2907332536Smav !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 2908332525Smav /* 2909332536Smav * Sanity check to make sure that we are indeed loading the 2910332536Smav * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 2911332536Smav * in the config provided and they happened to be the only ones 2912332536Smav * to have the latest uberblock, we could involuntarily perform 2913332536Smav * an extreme rewind. 
2914332525Smav */ 2915332536Smav healthy_tvds_mos = spa_healthy_core_tvds(spa); 2916332536Smav if (healthy_tvds_mos - healthy_tvds >= 2917332536Smav SPA_SYNC_MIN_VDEVS) { 2918332536Smav spa_load_note(spa, "config provided misses too many " 2919332536Smav "top-level vdevs compared to MOS (%lld vs %lld). ", 2920332536Smav (u_longlong_t)healthy_tvds, 2921332536Smav (u_longlong_t)healthy_tvds_mos); 2922332536Smav spa_load_note(spa, "vdev tree:"); 2923332536Smav vdev_dbgmsg_print_tree(rvd, 2); 2924332536Smav if (reloading) { 2925332536Smav spa_load_failed(spa, "config was already " 2926332536Smav "provided from MOS. Aborting."); 2927332536Smav return (spa_vdev_err(rvd, 2928332536Smav VDEV_AUX_CORRUPT_DATA, EIO)); 2929332536Smav } 2930332536Smav spa_load_note(spa, "spa must be reloaded using MOS " 2931332536Smav "config"); 2932332536Smav return (SET_ERROR(EAGAIN)); 2933332530Smav } 2934332525Smav } 2935332525Smav 2936332536Smav error = spa_check_for_missing_logs(spa); 2937332536Smav if (error != 0) 2938332536Smav return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2939332536Smav 2940332536Smav if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 2941332536Smav spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 2942332536Smav "guid sum (%llu != %llu)", 2943332536Smav (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 2944332536Smav (u_longlong_t)rvd->vdev_guid_sum); 2945332536Smav return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2946332536Smav ENXIO)); 2947332536Smav } 2948332536Smav 2949332529Smav return (0); 2950332529Smav} 2951332529Smav 2952332529Smavstatic int 2953332529Smavspa_ld_open_indirect_vdev_metadata(spa_t *spa) 2954332529Smav{ 2955332529Smav int error = 0; 2956332529Smav vdev_t *rvd = spa->spa_root_vdev; 2957332529Smav 2958332525Smav /* 2959332525Smav * Everything that we read before spa_remove_init() must be stored 2960332525Smav * on concreted vdevs. Therefore we do this as early as possible. 
2961332525Smav */ 2962332530Smav error = spa_remove_init(spa); 2963332530Smav if (error != 0) { 2964332530Smav spa_load_failed(spa, "spa_remove_init failed [error=%d]", 2965332530Smav error); 2966332525Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2967332530Smav } 2968332525Smav 2969332529Smav /* 2970332529Smav * Retrieve information needed to condense indirect vdev mappings. 2971332529Smav */ 2972332529Smav error = spa_condense_init(spa); 2973332529Smav if (error != 0) { 2974332530Smav spa_load_failed(spa, "spa_condense_init failed [error=%d]", 2975332530Smav error); 2976332529Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 2977332529Smav } 2978332529Smav 2979332529Smav return (0); 2980332529Smav} 2981332529Smav 2982332529Smavstatic int 2983332530Smavspa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 2984332529Smav{ 2985332529Smav int error = 0; 2986332529Smav vdev_t *rvd = spa->spa_root_vdev; 2987332529Smav 2988236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2989236884Smm boolean_t missing_feat_read = B_FALSE; 2990238926Smm nvlist_t *unsup_feat, *enabled_feat; 2991236884Smm 2992236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2993332530Smav &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 2994236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2995236884Smm } 2996236884Smm 2997236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2998332530Smav &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 2999236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3000236884Smm } 3001236884Smm 3002236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 3003332530Smav &spa->spa_feat_desc_obj, B_TRUE) != 0) { 3004236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3005236884Smm } 3006236884Smm 3007238926Smm enabled_feat = fnvlist_alloc(); 3008238926Smm unsup_feat = fnvlist_alloc(); 3009236884Smm 3010259813Sdelphij if (!spa_features_check(spa, B_FALSE, 3011238926Smm 
unsup_feat, enabled_feat)) 3012236884Smm missing_feat_read = B_TRUE; 3013236884Smm 3014332530Smav if (spa_writeable(spa) || 3015332530Smav spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 3016259813Sdelphij if (!spa_features_check(spa, B_TRUE, 3017238926Smm unsup_feat, enabled_feat)) { 3018332529Smav *missing_feat_writep = B_TRUE; 3019238926Smm } 3020236884Smm } 3021236884Smm 3022238926Smm fnvlist_add_nvlist(spa->spa_load_info, 3023238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 3024238926Smm 3025236884Smm if (!nvlist_empty(unsup_feat)) { 3026238926Smm fnvlist_add_nvlist(spa->spa_load_info, 3027238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 3028236884Smm } 3029236884Smm 3030238926Smm fnvlist_free(enabled_feat); 3031238926Smm fnvlist_free(unsup_feat); 3032236884Smm 3033236884Smm if (!missing_feat_read) { 3034236884Smm fnvlist_add_boolean(spa->spa_load_info, 3035236884Smm ZPOOL_CONFIG_CAN_RDONLY); 3036236884Smm } 3037236884Smm 3038236884Smm /* 3039236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 3040236884Smm * twofold: to determine whether the pool is available for 3041236884Smm * import in read-write mode and (if it is not) whether the 3042236884Smm * pool is available for import in read-only mode. If the pool 3043236884Smm * is available for import in read-write mode, it is displayed 3044236884Smm * as available in userland; if it is not available for import 3045236884Smm * in read-only mode, it is displayed as unavailable in 3046236884Smm * userland. If the pool is available for import in read-only 3047236884Smm * mode but not read-write mode, it is displayed as unavailable 3048236884Smm * in userland with a special note that the pool is actually 3049236884Smm * available for open in read-only mode. 
3050236884Smm * 3051236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 3052236884Smm * missing a feature for write, we must first determine whether 3053236884Smm * the pool can be opened read-only before returning to 3054236884Smm * userland in order to know whether to display the 3055236884Smm * abovementioned note. 3056236884Smm */ 3057332529Smav if (missing_feat_read || (*missing_feat_writep && 3058236884Smm spa_writeable(spa))) { 3059332530Smav spa_load_failed(spa, "pool uses unsupported features"); 3060236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 3061236884Smm ENOTSUP)); 3062236884Smm } 3063260150Sdelphij 3064260150Sdelphij /* 3065260150Sdelphij * Load refcounts for ZFS features from disk into an in-memory 3066260150Sdelphij * cache during SPA initialization. 3067260150Sdelphij */ 3068260150Sdelphij for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 3069260150Sdelphij uint64_t refcount; 3070260150Sdelphij 3071260150Sdelphij error = feature_get_refcount_from_disk(spa, 3072260150Sdelphij &spa_feature_table[i], &refcount); 3073260150Sdelphij if (error == 0) { 3074260150Sdelphij spa->spa_feat_refcount_cache[i] = refcount; 3075260150Sdelphij } else if (error == ENOTSUP) { 3076260150Sdelphij spa->spa_feat_refcount_cache[i] = 3077260150Sdelphij SPA_FEATURE_DISABLED; 3078260150Sdelphij } else { 3079332530Smav spa_load_failed(spa, "error getting refcount " 3080332530Smav "for feature %s [error=%d]", 3081332530Smav spa_feature_table[i].fi_guid, error); 3082260150Sdelphij return (spa_vdev_err(rvd, 3083260150Sdelphij VDEV_AUX_CORRUPT_DATA, EIO)); 3084260150Sdelphij } 3085260150Sdelphij } 3086236884Smm } 3087236884Smm 3088260150Sdelphij if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 3089260150Sdelphij if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 3090332530Smav &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 3091260150Sdelphij return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3092260150Sdelphij } 3093260150Sdelphij 
3094332529Smav return (0); 3095332529Smav} 3096332529Smav 3097332529Smavstatic int 3098332529Smavspa_ld_load_special_directories(spa_t *spa) 3099332529Smav{ 3100332529Smav int error = 0; 3101332529Smav vdev_t *rvd = spa->spa_root_vdev; 3102332529Smav 3103236884Smm spa->spa_is_initializing = B_TRUE; 3104236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 3105236884Smm spa->spa_is_initializing = B_FALSE; 3106332530Smav if (error != 0) { 3107332530Smav spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 3108236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3109332530Smav } 3110236884Smm 3111332529Smav return (0); 3112332529Smav} 3113168404Spjd 3114332529Smavstatic int 3115332529Smavspa_ld_get_props(spa_t *spa) 3116332529Smav{ 3117332529Smav int error = 0; 3118332529Smav uint64_t obj; 3119332529Smav vdev_t *rvd = spa->spa_root_vdev; 3120332529Smav 3121289422Smav /* Grab the secret checksum salt from the MOS. */ 3122289422Smav error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 3123289422Smav DMU_POOL_CHECKSUM_SALT, 1, 3124289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes), 3125289422Smav spa->spa_cksum_salt.zcs_bytes); 3126289422Smav if (error == ENOENT) { 3127289422Smav /* Generate a new salt for subsequent use */ 3128289422Smav (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 3129289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes)); 3130289422Smav } else if (error != 0) { 3131332530Smav spa_load_failed(spa, "unable to retrieve checksum salt from " 3132332530Smav "MOS [error=%d]", error); 3133289422Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3134289422Smav } 3135289422Smav 3136332530Smav if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 3137219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3138219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 3139332530Smav if (error != 0) { 3140332530Smav spa_load_failed(spa, "error opening 
deferred-frees bpobj " 3141332530Smav "[error=%d]", error); 3142219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3143332530Smav } 3144168404Spjd 3145168404Spjd /* 3146168404Spjd * Load the bit that tells us to use the new accounting function 3147168404Spjd * (raid-z deflation). If we have an older pool, this will not 3148168404Spjd * be present. 3149168404Spjd */ 3150332530Smav error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 3151219089Spjd if (error != 0 && error != ENOENT) 3152219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3153168404Spjd 3154219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 3155332530Smav &spa->spa_creation_version, B_FALSE); 3156219089Spjd if (error != 0 && error != ENOENT) 3157219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3158219089Spjd 3159168404Spjd /* 3160168404Spjd * Load the persistent error log. If we have an older pool, this will 3161168404Spjd * not be present. 3162168404Spjd */ 3163332530Smav error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 3164332530Smav B_FALSE); 3165219089Spjd if (error != 0 && error != ENOENT) 3166219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3167168404Spjd 3168219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 3169332530Smav &spa->spa_errlog_scrub, B_FALSE); 3170219089Spjd if (error != 0 && error != ENOENT) 3171219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3172168404Spjd 3173168404Spjd /* 3174168404Spjd * Load the history object. If we have an older pool, this 3175168404Spjd * will not be present. 3176168404Spjd */ 3177332530Smav error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 3178219089Spjd if (error != 0 && error != ENOENT) 3179219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3180168404Spjd 3181168404Spjd /* 3182299441Smav * Load the per-vdev ZAP map. 
If we have an older pool, this will not 3183299441Smav * be present; in this case, defer its creation to a later time to 3184299441Smav * avoid dirtying the MOS this early / out of sync context. See 3185299441Smav * spa_sync_config_object. 3186299441Smav */ 3187299441Smav 3188299441Smav /* The sentinel is only available in the MOS config. */ 3189299441Smav nvlist_t *mos_config; 3190332530Smav if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 3191332530Smav spa_load_failed(spa, "unable to retrieve MOS config"); 3192299441Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3193332530Smav } 3194299441Smav 3195299441Smav error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 3196332530Smav &spa->spa_all_vdev_zaps, B_FALSE); 3197299441Smav 3198321540Smav if (error == ENOENT) { 3199321540Smav VERIFY(!nvlist_exists(mos_config, 3200321540Smav ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 3201321540Smav spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 3202321540Smav ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 3203321540Smav } else if (error != 0) { 3204299441Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3205321540Smav } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 3206299441Smav /* 3207299441Smav * An older version of ZFS overwrote the sentinel value, so 3208299441Smav * we have orphaned per-vdev ZAPs in the MOS. Defer their 3209299441Smav * destruction to later; see spa_sync_config_object. 3210299441Smav */ 3211299441Smav spa->spa_avz_action = AVZ_ACTION_DESTROY; 3212299441Smav /* 3213299441Smav * We're assuming that no vdevs have had their ZAPs created 3214299441Smav * before this. Better be sure of it. 
3215299441Smav */ 3216299441Smav ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 3217299441Smav } 3218299441Smav nvlist_free(mos_config); 3219299441Smav 3220332529Smav spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3221332529Smav 3222332530Smav error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 3223332530Smav B_FALSE); 3224332529Smav if (error && error != ENOENT) 3225332529Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3226332529Smav 3227332529Smav if (error == 0) { 3228332529Smav uint64_t autoreplace; 3229332529Smav 3230332529Smav spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 3231332529Smav spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 3232332529Smav spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 3233332529Smav spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 3234332529Smav spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 3235332529Smav spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 3236332529Smav &spa->spa_dedup_ditto); 3237332529Smav 3238332529Smav spa->spa_autoreplace = (autoreplace != 0); 3239332529Smav } 3240332529Smav 3241332536Smav /* 3242332536Smav * If we are importing a pool with missing top-level vdevs, 3243332536Smav * we enforce that the pool doesn't panic or get suspended on 3244332536Smav * error since the likelihood of missing data is extremely high. 
3245332536Smav */ 3246332536Smav if (spa->spa_missing_tvds > 0 && 3247332536Smav spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 3248332536Smav spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 3249332536Smav spa_load_note(spa, "forcing failmode to 'continue' " 3250332536Smav "as some top level vdevs are missing"); 3251332536Smav spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 3252332536Smav } 3253332536Smav 3254332529Smav return (0); 3255332529Smav} 3256332529Smav /* Look up the hot-spare and L2ARC objects (DMU_POOL_SPARES, DMU_POOL_L2CACHE) in the MOS pool directory. ENOENT from spa_dir_prop() is tolerated; any other lookup error, or a failing load_nvlist(), fails the load through spa_vdev_err() with VDEV_AUX_CORRUPT_DATA. When an object exists and type is not SPA_IMPORT_ASSEMBLE, its nvlist is loaded and spa_load_spares() or spa_load_l2cache() is called with SCL_ALL held as writer; for SPA_IMPORT_ASSEMBLE only sav_sync is set. Returns 0 on success. */ 3257332529Smavstatic int 3258332529Smavspa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 3259332529Smav{ 3260332529Smav int error = 0; 3261332529Smav vdev_t *rvd = spa->spa_root_vdev; 3262332529Smav 3263299441Smav /* 3264219089Spjd * If we're assembling the pool from the split-off vdevs of 3265219089Spjd * an existing pool, we don't want to attach the spares & cache 3266219089Spjd * devices. That is what the SPA_IMPORT_ASSEMBLE checks below are for. 3267219089Spjd */ 3268219089Spjd 3269219089Spjd /* 3270168404Spjd * Load any hot spares for this pool. 3271168404Spjd */ 3272332530Smav error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 3273332530Smav B_FALSE); 3274219089Spjd if (error != 0 && error != ENOENT) 3275219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3276219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 3277185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 3278185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 3279332530Smav &spa->spa_spares.sav_config) != 0) { 3280332530Smav spa_load_failed(spa, "error loading spares nvlist"); 3281219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3282332530Smav } 3283168404Spjd 3284185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3285168404Spjd spa_load_spares(spa); 3286185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3287219089Spjd } else if (error == 0) { 3288219089Spjd spa->spa_spares.sav_sync = B_TRUE; 3289168404Spjd } 3290168404Spjd 3291185029Spjd /* 3292185029Spjd * Load any level 2 ARC devices for this pool.
3293185029Spjd */ 3294219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 3295332530Smav &spa->spa_l2cache.sav_object, B_FALSE); 3296219089Spjd if (error != 0 && error != ENOENT) 3297219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3298219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 3299185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 3300185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 3301332530Smav &spa->spa_l2cache.sav_config) != 0) { 3302332530Smav spa_load_failed(spa, "error loading l2cache nvlist"); 3303219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3304332530Smav } 3305185029Spjd 3306185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3307185029Spjd spa_load_l2cache(spa); 3308185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3309219089Spjd } else if (error == 0) { 3310219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3311185029Spjd } 3312185029Spjd 3313332529Smav return (0); 3314332529Smav} 3315213197Smm /* Load the per-vdev metadata with vdev_load() on the root vdev, then call vdev_dtl_reassess() on it with SCL_ALL held as writer. Before that, if spa_autoreplace is set and this is not a SPA_LOAD_TRYIMPORT, the removed-device checks are run. Returns 0, or the result of spa_vdev_err() (carrying the vdev_load() error) if vdev_load() fails. */ 3316332529Smavstatic int 3317332530Smavspa_ld_load_vdev_metadata(spa_t *spa) 3318332529Smav{ 3319332529Smav int error = 0; 3320332529Smav vdev_t *rvd = spa->spa_root_vdev; 3321185029Spjd 3322168404Spjd /* 3323185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 3324185029Spjd * the ZFS DE that it should not issue any faults for unopenable 3325185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 3326185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 3327185029Spjd * over. 3328185029Spjd */ 3329332530Smav if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 3330185029Spjd spa_check_removed(spa->spa_root_vdev); 3331219089Spjd /* 3332219089Spjd * For the import case, this is done in spa_import(), because 3333219089Spjd * at this point we're using the spare definitions from 3334219089Spjd * the MOS config, not necessarily from the userland config.
3335219089Spjd */ 3336332530Smav if (spa->spa_load_state != SPA_LOAD_IMPORT) { 3337219089Spjd spa_aux_check_removed(&spa->spa_spares); 3338219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 3339219089Spjd } 3340219089Spjd } 3341185029Spjd 3342185029Spjd /* 3343332529Smav * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 3344168404Spjd */ 3345332525Smav error = vdev_load(rvd); 3346332525Smav if (error != 0) { 3347332530Smav spa_load_failed(spa, "vdev_load failed [error=%d]", error); 3348332525Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3349332525Smav } 3350168404Spjd 3351168404Spjd /* 3352332529Smav * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 3353168404Spjd */ 3354185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3355168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 3356185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3357168404Spjd 3358332529Smav return (0); 3359332529Smav} 3360332529Smav /* Load the dedup tables with ddt_load(). On failure the ddt_load() error is logged, but the load is failed through spa_vdev_err() with VDEV_AUX_CORRUPT_DATA and EIO rather than with that error. Returns 0 on success. */ 3361332529Smavstatic int 3362332529Smavspa_ld_load_dedup_tables(spa_t *spa) 3363332529Smav{ 3364332529Smav int error = 0; 3365332529Smav vdev_t *rvd = spa->spa_root_vdev; 3366332529Smav 3367219089Spjd error = ddt_load(spa); 3368332530Smav if (error != 0) { 3369332530Smav spa_load_failed(spa, "ddt_load failed [error=%d]", error); 3370219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3371332530Smav } 3372219089Spjd 3373332529Smav return (0); 3374332529Smav} 3375219089Spjd /* Check the intent logs with spa_check_logs(), but only when the pool is writeable and is not being assembled from a split (SPA_IMPORT_ASSEMBLE). If the check reports a problem while top-level vdevs are already missing (spa_missing_tvds != 0), only a note is logged and the load continues. Otherwise *ereport is set to FM_EREPORT_ZFS_LOG_REPLAY and the result of spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO) is returned. Returns 0 when the load may continue. */ 3376332529Smavstatic int 3377332529Smavspa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) 3378332529Smav{ 3379332529Smav vdev_t *rvd = spa->spa_root_vdev; 3380332529Smav 3381332530Smav if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 3382332530Smav boolean_t missing = spa_check_logs(spa); 3383332530Smav if (missing) { 3384332536Smav if (spa->spa_missing_tvds != 0) { 3385332536Smav spa_load_note(spa, "spa_check_logs failed " 3386332536Smav "so dropping the logs"); 3387332536Smav } else {
3388332536Smav *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 3389332536Smav spa_load_failed(spa, "spa_check_logs failed"); 3390332536Smav return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 3391332536Smav ENXIO)); 3392332536Smav } 3393332530Smav } 3394168404Spjd } 3395168404Spjd 3396332529Smav return (0); 3397332529Smav} 3398332529Smav /* Run spa_load_verify() unless the load state is SPA_LOAD_TRYIMPORT. A failure is logged and returned through spa_vdev_err() with VDEV_AUX_CORRUPT_DATA and the spa_load_verify() error code. Returns 0 otherwise. */ 3399332529Smavstatic int 3400332530Smavspa_ld_verify_pool_data(spa_t *spa) 3401332529Smav{ 3402332529Smav int error = 0; 3403332529Smav vdev_t *rvd = spa->spa_root_vdev; 3404332529Smav 3405332529Smav /* 3406332529Smav * We've successfully opened the pool, verify that we're ready 3407332529Smav * to start pushing transactions. 3408332529Smav */ 3409332530Smav if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 3410332529Smav error = spa_load_verify(spa); 3411332529Smav if (error != 0) { 3412332530Smav spa_load_failed(spa, "spa_load_verify failed " 3413332530Smav "[error=%d]", error); 3414332529Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3415332529Smav error)); 3416332529Smav } 3417332529Smav } 3418332529Smav 3419332529Smav return (0); 3420332529Smav} 3421332529Smav /* Claim outstanding ZIL blocks: zil_claim is run through dmu_objset_find_dp(), starting at dp_root_dir_obj with DS_FIND_CHILDREN, inside one tx assigned to spa_first_txg(). spa_claiming is B_TRUE for the duration, and the log state is set to SPA_LOG_GOOD afterwards. The return value of dmu_objset_find_dp() is deliberately discarded. */ 3422332529Smavstatic void 3423332529Smavspa_ld_claim_log_blocks(spa_t *spa) 3424332529Smav{ 3425332529Smav dmu_tx_t *tx; 3426332529Smav dsl_pool_t *dp = spa_get_dsl(spa); 3427332529Smav 3428332529Smav /* 3429332529Smav * Claim log blocks that haven't been committed yet. 3430332529Smav * This must all happen in a single txg. 3431332529Smav * Note: spa_claim_max_txg is updated by spa_claim_notify(), 3432332529Smav * invoked from zil_claim_log_block()'s i/o done callback. 3433332529Smav * Price of rollback is that we abandon the log.
3434332529Smav */ 3435332529Smav spa->spa_claiming = B_TRUE; 3436332529Smav 3437332529Smav tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 3438332529Smav (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 3439332529Smav zil_claim, tx, DS_FIND_CHILDREN); 3440332529Smav dmu_tx_commit(tx); 3441332529Smav 3442332529Smav spa->spa_claiming = B_FALSE; 3443332529Smav 3444332529Smav spa_set_log_state(spa, SPA_LOG_GOOD); 3445332529Smav} 3446332529Smav /* Decide whether the pool config must be rewritten after a load and, if so, queue SPA_ASYNC_CONFIG_UPDATE. An update is requested when the caller asks for it (update_config_cache), when config_cache_txg differs from spa_config_txg, when the load state is SPA_LOAD_IMPORT or SPA_LOAD_RECOVER, for ZFS_IMPORT_VERBATIM imports, or when any child of the root vdev has vdev_ms_array == 0. */ 3447332529Smavstatic void 3448332536Smavspa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 3449332547Smav boolean_t update_config_cache) 3450332529Smav{ 3451332529Smav vdev_t *rvd = spa->spa_root_vdev; 3452332529Smav int need_update = B_FALSE; 3453332529Smav 3454332529Smav /* 3455332529Smav * If the config cache is stale, or we have uninitialized 3456332529Smav * metaslabs (see spa_vdev_add()), then update the config. 3457332529Smav * 3458332529Smav * If this is a verbatim import, trust the current 3459332529Smav * in-core spa_config and update the disk labels. 3460332529Smav */ 3461332547Smav if (update_config_cache || config_cache_txg != spa->spa_config_txg || 3462332530Smav spa->spa_load_state == SPA_LOAD_IMPORT || 3463332530Smav spa->spa_load_state == SPA_LOAD_RECOVER || 3464332529Smav (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 3465332529Smav need_update = B_TRUE; 3466332529Smav 3467332529Smav for (int c = 0; c < rvd->vdev_children; c++) 3468332529Smav if (rvd->vdev_child[c]->vdev_ms_array == 0) 3469332529Smav need_update = B_TRUE; 3470332529Smav 3471332529Smav /* 3472332529Smav * Update the config cache asynchronously in case we're the 3473332529Smav * root pool, in which case the config cache isn't writable yet.
3474332529Smav */ 3475332529Smav if (need_update) 3476332529Smav spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 3477332529Smav} 3478332529Smav /* Reset the spa so the load sequence can be run again: spa_unload(), spa_deactivate(), then spa_activate() with the mode the pool had on entry. spa_async_suspended is saved first and restored afterwards (see the comment in the body). */ 3479332536Smavstatic void 3480332536Smavspa_ld_prepare_for_reload(spa_t *spa) 3481332536Smav{ 3482332536Smav int mode = spa->spa_mode; 3483332536Smav int async_suspended = spa->spa_async_suspended; 3484332536Smav 3485332536Smav spa_unload(spa); 3486332536Smav spa_deactivate(spa); 3487332536Smav spa_activate(spa, mode); 3488332536Smav 3489332536Smav /* 3490332536Smav * We save the value of spa_async_suspended as it gets reset to 0 by 3491332536Smav * spa_unload(). We want to restore it back to the original value before 3492332536Smav * returning as we might be calling spa_async_resume() later. 3493332536Smav */ 3494332536Smav spa->spa_async_suspended = async_suspended; 3495332536Smav} 3496332536Smav /* Look up DMU_POOL_ZPOOL_CHECKPOINT in the MOS pool directory. If it exists, record the checkpointed uberblock's ub_txg in spa_checkpoint_txg and its ub_timestamp in spa_checkpoint_info.sci_timestamp. A missing entry (ENOENT) is not an error and leaves both fields untouched. Returns 0, or the zap_lookup() error. The caller must hold spa_namespace_lock, and spa_checkpoint_txg must still be 0. */ 3497332529Smavstatic int 3498332547Smavspa_ld_read_checkpoint_txg(spa_t *spa) 3499332529Smav{ 3500332547Smav uberblock_t checkpoint; 3501332529Smav int error = 0; 3502332529Smav 3503332547Smav ASSERT0(spa->spa_checkpoint_txg); 3504332530Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3505332547Smav 3506332547Smav error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 3507332547Smav DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 3508332547Smav sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 3509332547Smav 3510332547Smav if (error == ENOENT) 3511332547Smav return (0); 3512332547Smav 3513332547Smav if (error != 0) 3514332547Smav return (error); 3515332547Smav 3516332547Smav ASSERT3U(checkpoint.ub_txg, !=, 0); 3517332547Smav ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 3518332547Smav ASSERT3U(checkpoint.ub_timestamp, !=, 0); 3519332547Smav spa->spa_checkpoint_txg = checkpoint.ub_txg; 3520332547Smav spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 3521332547Smav 3522332547Smav return (0); 3523332547Smav} 3524332547Smav /* First stage of a load: parse the provided config into a vdev tree, open the vdevs, validate their labels (skipped for SPA_IMPORT_ASSEMBLE), select an uberblock and open the root blkptr so the MOS can be read. The provided config is marked untrusted unless type is SPA_IMPORT_ASSEMBLE. Returns 0, or the error of the first step that fails. The caller must hold spa_namespace_lock. */ 3525332547Smavstatic int 3526332547Smavspa_ld_mos_init(spa_t *spa,
spa_import_type_t type) 3527332547Smav{ 3528332547Smav int error = 0; 3529332547Smav 3530332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3531332536Smav ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 3532332530Smav 3533332529Smav /* 3534332536Smav * Never trust the config that is provided unless we are assembling 3535332536Smav * a pool following a split. 3536332536Smav * This means don't trust blkptrs and the vdev tree in general. This 3537332536Smav * also effectively puts the spa in read-only mode since 3538332536Smav * spa_writeable() checks for spa_trust_config to be true. 3539332536Smav * We will later load a trusted config from the MOS. 3540332529Smav */ 3541332536Smav if (type != SPA_IMPORT_ASSEMBLE) 3542332536Smav spa->spa_trust_config = B_FALSE; 3543332529Smav 3544332529Smav /* 3545332529Smav * Parse the config provided to create a vdev tree. 3546332529Smav */ 3547332536Smav error = spa_ld_parse_config(spa, type); 3548332529Smav if (error != 0) 3549332529Smav return (error); 3550332529Smav 3551332529Smav /* 3552332529Smav * Now that we have the vdev tree, try to open each vdev. This involves 3553332529Smav * opening the underlying physical device, retrieving its geometry and 3554332529Smav * probing the vdev with a dummy I/O. The state of each vdev will be set 3555332529Smav * based on the success of those operations. After this we'll be ready 3556332529Smav * to read from the vdevs. 3557332529Smav */ 3558332529Smav error = spa_ld_open_vdevs(spa); 3559332529Smav if (error != 0) 3560332529Smav return (error); 3561332529Smav 3562332529Smav /* 3563332529Smav * Read the label of each vdev and make sure that the GUIDs stored 3564332529Smav * there match the GUIDs in the config provided. 3565332536Smav * If we're assembling a new pool that's been split off from an 3566332536Smav * existing pool, the labels haven't yet been updated so we skip 3567332536Smav * validation for now. 
3568332529Smav */ 3569332536Smav if (type != SPA_IMPORT_ASSEMBLE) { 3570332536Smav error = spa_ld_validate_vdevs(spa); 3571332536Smav if (error != 0) 3572332536Smav return (error); 3573332536Smav } 3574332529Smav 3575332529Smav /* 3576332547Smav * Read all vdev labels to find the best uberblock (i.e. latest, 3577332547Smav * unless spa_load_max_txg is set) and store it in spa_uberblock. We 3578332547Smav * get the list of features required to read blkptrs in the MOS from 3579332547Smav * the vdev label with the best uberblock and verify that our version 3580332547Smav * of zfs supports them all. 3581332529Smav */ 3582332536Smav error = spa_ld_select_uberblock(spa, type); 3583332529Smav if (error != 0) 3584332529Smav return (error); 3585332529Smav 3586332529Smav /* 3587332529Smav * Pass that uberblock to the dsl_pool layer which will open the root 3588332529Smav * blkptr. This blkptr points to the latest version of the MOS and will 3589332529Smav * allow us to read its contents. 3590332529Smav */ 3591332529Smav error = spa_ld_open_rootbp(spa); 3592332529Smav if (error != 0) 3593332529Smav return (error); 3594332529Smav 3595332547Smav return (0); 3596332547Smav} 3597332547Smav /* Replace the pool's in-core uberblock with the checkpointed one stored under DMU_POOL_ZPOOL_CHECKPOINT in the MOS. The copy gets ub_txg set to one past the current uberblock's txg and ub_timestamp set from gethrestime_sec(). If the pool is writeable, the new uberblock is also written to the vdev labels with vdev_config_sync(), and spa_last_synced_guid is updated on success. Returns 0 on success, ZFS_ERR_NO_CHECKPOINT if the lookup yields ENOENT, or the zap_lookup() or vdev_config_sync() error. The caller must hold spa_namespace_lock and have ZFS_IMPORT_CHECKPOINT set in spa_import_flags. */ 3598332547Smavstatic int 3599332547Smavspa_ld_checkpoint_rewind(spa_t *spa) 3600332547Smav{ 3601332547Smav uberblock_t checkpoint; 3602332547Smav int error = 0; 3603332547Smav 3604332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3605332547Smav ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 3606332547Smav 3607332547Smav error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 3608332547Smav DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 3609332547Smav sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 3610332547Smav 3611332547Smav if (error != 0) { 3612332547Smav spa_load_failed(spa, "unable to retrieve checkpointed " 3613332547Smav "uberblock from the MOS config [error=%d]", error); 3614332547Smav 3615332547Smav if (error == ENOENT) 3616332547Smav error =
ZFS_ERR_NO_CHECKPOINT; 3617332547Smav 3618332547Smav return (error); 3619332547Smav } 3620332547Smav 3621332547Smav ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 3622332547Smav ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 3623332547Smav 3624332529Smav /* 3625332547Smav * We need to update the txg and timestamp of the checkpointed 3626332547Smav * uberblock to be higher than the latest one. This ensures that 3627332547Smav * the checkpointed uberblock is selected if we were to close and 3628332547Smav * reopen the pool right after we've written it in the vdev labels. 3629332547Smav * (also see block comment in vdev_uberblock_compare) 3630332547Smav */ 3631332547Smav checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 3632332547Smav checkpoint.ub_timestamp = gethrestime_sec(); 3633332547Smav 3634332547Smav /* 3635332547Smav * Set current uberblock to be the checkpointed uberblock. 3636332547Smav */ 3637332547Smav spa->spa_uberblock = checkpoint; 3638332547Smav 3639332547Smav /* 3640332547Smav * If we are doing a normal rewind, then the pool is open for 3641332547Smav * writing and we sync the "updated" checkpointed uberblock to 3642332547Smav * disk. Once this is done, we've basically rewound the whole 3643332547Smav * pool and there is no way back. 3644332547Smav * 3645332547Smav * There are cases when we don't want to attempt to sync the 3646332547Smav * checkpointed uberblock to disk because we are opening a 3647332547Smav * pool as read-only. Specifically, verifying the checkpointed 3648332547Smav * state with zdb, and importing the checkpointed state to get 3649332547Smav * a "preview" of its content.
3650332547Smav */ 3651332547Smav if (spa_writeable(spa)) { 3652332547Smav vdev_t *rvd = spa->spa_root_vdev; 3653332547Smav 3654332547Smav spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3655332547Smav vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 3656332547Smav int svdcount = 0; 3657332547Smav int children = rvd->vdev_children; 3658332547Smav int c0 = spa_get_random(children); 3659332547Smav /* Starting at child index c0, collect up to SPA_SYNC_MIN_VDEVS children of the root vdev that have a metaslab array, are not log devices and are concrete; these are the vdevs handed to vdev_config_sync() below. */ 3660332547Smav for (int c = 0; c < children; c++) { 3661332547Smav vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 3662332547Smav 3663332547Smav /* Stop when revisiting the first vdev */ 3664332547Smav if (c > 0 && svd[0] == vd) 3665332547Smav break; 3666332547Smav 3667332547Smav if (vd->vdev_ms_array == 0 || vd->vdev_islog || 3668332547Smav !vdev_is_concrete(vd)) 3669332547Smav continue; 3670332547Smav 3671332547Smav svd[svdcount++] = vd; 3672332547Smav if (svdcount == SPA_SYNC_MIN_VDEVS) 3673332547Smav break; 3674332547Smav } 3675332547Smav error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 3676332547Smav if (error == 0) 3677332547Smav spa->spa_last_synced_guid = rvd->vdev_guid; 3678332547Smav spa_config_exit(spa, SCL_ALL, FTAG); 3679332547Smav 3680332547Smav if (error != 0) { 3681332547Smav spa_load_failed(spa, "failed to write checkpointed " 3682332547Smav "uberblock to the vdev labels [error=%d]", error); 3683332547Smav return (error); 3684332547Smav } 3685332547Smav } 3686332547Smav 3687332547Smav return (0); 3688332547Smav} 3689332547Smav /* Run spa_ld_mos_init() followed by spa_ld_trusted_config(). If the latter returns EAGAIN, the spa is reset with spa_ld_prepare_for_reload() and both steps are repeated once; in that case *update_config_cache is set to B_TRUE when the pointer is non-NULL. Returns 0, or the first error encountered. */ 3690332547Smavstatic int 3691332547Smavspa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 3692332547Smav boolean_t *update_config_cache) 3693332547Smav{ 3694332547Smav int error; 3695332547Smav 3696332547Smav /* 3697332547Smav * Parse the config for pool, open and validate vdevs, 3698332547Smav * select an uberblock, and use that uberblock to open 3699332547Smav * the MOS.
3700332547Smav */ 3701332547Smav error = spa_ld_mos_init(spa, type); 3702332547Smav if (error != 0) 3703332547Smav return (error); 3704332547Smav 3705332547Smav /* 3706332536Smav * Retrieve the trusted config stored in the MOS and use it to create 3707332536Smav * a new, exact version of the vdev tree, then reopen all vdevs. 3708332529Smav */ 3709332547Smav error = spa_ld_trusted_config(spa, type, B_FALSE); 3710332536Smav if (error == EAGAIN) { 3711332547Smav if (update_config_cache != NULL) 3712332547Smav *update_config_cache = B_TRUE; 3713332547Smav 3714332536Smav /* 3715332536Smav * Redo the loading process with the trusted config if it is 3716332536Smav * too different from the untrusted config. 3717332536Smav */ 3718332536Smav spa_ld_prepare_for_reload(spa); 3719332547Smav spa_load_note(spa, "RELOADING"); 3720332547Smav error = spa_ld_mos_init(spa, type); 3721332547Smav if (error != 0) 3722332547Smav return (error); 3723332547Smav 3724332547Smav error = spa_ld_trusted_config(spa, type, B_TRUE); 3725332547Smav if (error != 0) 3726332547Smav return (error); 3727332547Smav 3728332536Smav } else if (error != 0) { 3729332529Smav return (error); 3730332536Smav } 3731332529Smav 3732332547Smav return (0); 3733332547Smav} 3734332547Smav 3735332547Smav/* 3736332547Smav * Load an existing storage pool, using the config provided. This config 3737332547Smav * describes which vdevs are part of the pool and is later validated against 3738332547Smav * partial configs present in each vdev's label and an entire copy of the 3739332547Smav * config stored in the MOS. 
3740332547Smav */ 3741332547Smavstatic int 3742332547Smavspa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) 3743332547Smav{ 3744332547Smav int error = 0; 3745332547Smav boolean_t missing_feat_write = B_FALSE; 3746332547Smav boolean_t checkpoint_rewind = 3747332547Smav (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 3748332547Smav boolean_t update_config_cache = B_FALSE; 3749332547Smav 3750332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3751332547Smav ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 3752332547Smav 3753332547Smav spa_load_note(spa, "LOADING"); 3754332547Smav 3755332547Smav error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 3756332547Smav if (error != 0) 3757332547Smav return (error); 3758332547Smav 3759332529Smav /* 3760332547Smav * If we are rewinding to the checkpoint then we need to repeat 3761332547Smav * everything we've done so far in this function but this time 3762332547Smav * selecting the checkpointed uberblock and using that to open 3763332547Smav * the MOS. 3764332547Smav */ 3765332547Smav if (checkpoint_rewind) { 3766332547Smav /* 3767332547Smav * If we are rewinding to the checkpoint, update the config cache 3768332547Smav * anyway. 3769332547Smav */ 3770332547Smav update_config_cache = B_TRUE; 3771332547Smav 3772332547Smav /* 3773332547Smav * Extract the checkpointed uberblock from the current MOS 3774332547Smav * and use this as the pool's uberblock from now on. If the 3775332547Smav * pool is imported as writeable we also write the checkpoint 3776332547Smav * uberblock to the labels, making the rewind permanent. 3777332547Smav */ 3778332547Smav error = spa_ld_checkpoint_rewind(spa); 3779332547Smav if (error != 0) 3780332547Smav return (error); 3781332547Smav 3782332547Smav /* 3783332547Smav * Redo the loading process again with the 3784332547Smav * checkpointed uberblock.
3785332547Smav */ 3786332547Smav spa_ld_prepare_for_reload(spa); 3787332547Smav spa_load_note(spa, "LOADING checkpointed uberblock"); /* update_config_cache was already forced to B_TRUE in this branch, so no out-parameter is passed (NULL) on this second pass. */ 3788332547Smav error = spa_ld_mos_with_trusted_config(spa, type, NULL); 3789332547Smav if (error != 0) 3790332547Smav return (error); 3791332547Smav } 3792332547Smav 3793332547Smav /* 3794332547Smav * Retrieve the checkpoint txg if the pool has a checkpoint. 3795332547Smav */ 3796332547Smav error = spa_ld_read_checkpoint_txg(spa); 3797332547Smav if (error != 0) 3798332547Smav return (error); 3799332547Smav 3800332547Smav /* 3801332529Smav * Retrieve the mapping of indirect vdevs. Those vdevs were removed 3802332529Smav * from the pool and their contents were re-mapped to other vdevs. Note 3803332529Smav * that everything that we read before this step must have been 3804332529Smav * rewritten on concrete vdevs after the last device removal was 3805332529Smav * initiated. Otherwise we could be reading from indirect vdevs before 3806332529Smav * we have loaded their mappings. 3807332529Smav */ 3808332529Smav error = spa_ld_open_indirect_vdev_metadata(spa); 3809332529Smav if (error != 0) 3810332529Smav return (error); 3811332529Smav 3812332529Smav /* 3813332529Smav * Retrieve the full list of active features from the MOS and check if 3814332529Smav * they are all supported. 3815332529Smav */ 3816332530Smav error = spa_ld_check_features(spa, &missing_feat_write); 3817332529Smav if (error != 0) 3818332529Smav return (error); 3819332529Smav 3820332529Smav /* 3821332529Smav * Load several special directories from the MOS needed by the dsl_pool 3822332529Smav * layer. 3823332529Smav */ 3824332529Smav error = spa_ld_load_special_directories(spa); 3825332529Smav if (error != 0) 3826332529Smav return (error); 3827332529Smav 3828332529Smav /* 3829332529Smav * Retrieve pool properties from the MOS.
3830332529Smav */ 3831332529Smav error = spa_ld_get_props(spa); 3832332529Smav if (error != 0) 3833332529Smav return (error); 3834332529Smav 3835332529Smav /* 3836332529Smav * Retrieve the list of auxiliary devices - cache devices and spares - 3837332529Smav * and open them. 3838332529Smav */ 3839332529Smav error = spa_ld_open_aux_vdevs(spa, type); 3840332529Smav if (error != 0) 3841332529Smav return (error); 3842332529Smav 3843332529Smav /* 3844332529Smav * Load the metadata for all vdevs. Also check if unopenable devices 3845332529Smav * should be autoreplaced. 3846332529Smav */ 3847332530Smav error = spa_ld_load_vdev_metadata(spa); 3848332529Smav if (error != 0) 3849332529Smav return (error); 3850332529Smav 3851332529Smav error = spa_ld_load_dedup_tables(spa); 3852332529Smav if (error != 0) 3853332529Smav return (error); 3854332529Smav 3855332529Smav /* 3856332529Smav * Verify the logs now to make sure we don't have any unexpected errors 3857332529Smav * when we claim log blocks later. 3858332529Smav */ 3859332529Smav error = spa_ld_verify_logs(spa, type, ereport); 3860332529Smav if (error != 0) 3861332529Smav return (error); 3862332529Smav 3863236884Smm if (missing_feat_write) { 3864332536Smav ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 3865236884Smm 3866236884Smm /* 3867236884Smm * At this point, we know that we can open the pool in 3868236884Smm * read-only mode but not read-write mode. We now have enough 3869236884Smm * information and can return to userland. 3870236884Smm */ 3871332529Smav return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 3872332529Smav ENOTSUP)); 3873236884Smm } 3874236884Smm 3875219089Spjd /* 3876332529Smav * Traverse the last txgs to make sure the pool was left off in a safe 3877332529Smav * state. When performing an extreme rewind, we verify the whole pool, 3878332529Smav * which can take a very long time. 
3879219089Spjd */ 3880332530Smav error = spa_ld_verify_pool_data(spa); 3881332529Smav if (error != 0) 3882332529Smav return (error); 3883219089Spjd 3884332529Smav /* 3885332529Smav * Calculate the deflated space for the pool. This must be done before 3886332529Smav * we write anything to the pool because we'd need to update the space 3887332529Smav * accounting using the deflated sizes. 3888332529Smav */ 3889332529Smav spa_update_dspace(spa); 3890332529Smav 3891332529Smav /* 3892332529Smav * We have now retrieved all the information we needed to open the 3893332529Smav * pool. If we are importing the pool in read-write mode, a few 3894332529Smav * additional steps must be performed to finish the import. 3895332529Smav */ 3896332536Smav if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 3897219089Spjd spa->spa_load_max_txg == UINT64_MAX)) { 3898332536Smav uint64_t config_cache_txg = spa->spa_config_txg; 3899168404Spjd 3900332536Smav ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 3901332536Smav 3902332525Smav /* 3903332547Smav * In case of a checkpoint rewind, log the original txg 3904332547Smav * of the checkpointed uberblock. 3905332547Smav */ 3906332547Smav if (checkpoint_rewind) { 3907332547Smav spa_history_log_internal(spa, "checkpoint rewind", 3908332547Smav NULL, "rewound state to txg=%llu", 3909332547Smav (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 3910332547Smav } 3911332547Smav 3912332547Smav /* 3913332529Smav * Traverse the ZIL and claim all blocks. 3914332529Smav */ 3915332529Smav spa_ld_claim_log_blocks(spa); 3916209962Smm 3917168404Spjd /* 3918332529Smav * Kick-off the syncing thread. 3919168404Spjd */ 3920168404Spjd spa->spa_sync_on = B_TRUE; 3921168404Spjd txg_sync_start(spa->spa_dsl_pool); 3922168404Spjd 3923168404Spjd /* 3924219089Spjd * Wait for all claims to sync. We sync up to the highest 3925219089Spjd * claimed log block birth time so that claimed log blocks 3926219089Spjd * don't appear to be from the future. 
spa_claim_max_txg 3927332529Smav * will have been set for us by ZIL traversal operations 3928332529Smav * performed above. 3929168404Spjd */ 3930219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 3931168404Spjd 3932168404Spjd /* 3933332529Smav * Check if we need to request an update of the config. On the 3934332529Smav * next sync, we would update the config stored in vdev labels 3935332529Smav * and the cachefile (by default /etc/zfs/zpool.cache). 3936168404Spjd */ 3937332536Smav spa_ld_check_for_config_update(spa, config_cache_txg, 3938332547Smav update_config_cache); 3939168404Spjd 3940168404Spjd /* 3941208683Spjd * Check all DTLs to see if anything needs resilvering. 3942208683Spjd */ 3943219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 3944332529Smav vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) 3945208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 3946219089Spjd 3947219089Spjd /* 3948248571Smm * Log the fact that we booted up (so that we can detect if 3949248571Smm * we rebooted in the middle of an operation). 3950248571Smm */ 3951248571Smm spa_history_log_version(spa, "open"); 3952248571Smm 3953248571Smm /* 3954219089Spjd * Delete any inconsistent datasets. 3955219089Spjd */ 3956219089Spjd (void) dmu_objset_find(spa_name(spa), 3957219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 3958219089Spjd 3959219089Spjd /* 3960219089Spjd * Clean up any stale temporary dataset userrefs. 
3961219089Spjd */ 3962219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 3963332525Smav 3964332525Smav spa_restart_removal(spa); 3965332525Smav 3966332537Smav spa_spawn_aux_threads(spa); 3967339111Smav 3968339111Smav spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3969339111Smav vdev_initialize_restart(spa->spa_root_vdev); 3970339111Smav spa_config_exit(spa, SCL_CONFIG, FTAG); 3971168404Spjd } 3972168404Spjd 3973332530Smav spa_load_note(spa, "LOADED"); 3974332530Smav 3975219089Spjd return (0); 3976219089Spjd} 3977168404Spjd /* Unload and deactivate the pool, lower spa_load_max_txg to one less than the txg of the current spa_uberblock, re-activate the pool with its previous mode, suspend async tasks and call spa_load() again with SPA_IMPORT_EXISTING. Returns the result of that spa_load(). */ 3978219089Spjdstatic int 3979332536Smavspa_load_retry(spa_t *spa, spa_load_state_t state) 3980219089Spjd{ 3981219089Spjd int mode = spa->spa_mode; 3982219089Spjd 3983219089Spjd spa_unload(spa); 3984219089Spjd spa_deactivate(spa); 3985219089Spjd 3986268720Sdelphij spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 3987219089Spjd 3988219089Spjd spa_activate(spa, mode); 3989219089Spjd spa_async_suspend(spa); 3990219089Spjd 3991332530Smav spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 3992332530Smav (u_longlong_t)spa->spa_load_max_txg); 3993332530Smav 3994332536Smav return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 3995168404Spjd} 3996168404Spjd 3997236884Smm/* 3998236884Smm * If spa_load() fails this function will try loading prior txgs. If 3999236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 4000236884Smm * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 4001236884Smm * function will not rewind the pool and will return the same error as 4002236884Smm * spa_load().
4003236884Smm */ 4004219089Spjdstatic int 4005332536Smavspa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 4006332536Smav int rewind_flags) 4007219089Spjd{ 4008236884Smm nvlist_t *loadinfo = NULL; 4009219089Spjd nvlist_t *config = NULL; 4010219089Spjd int load_error, rewind_error; 4011219089Spjd uint64_t safe_rewind_txg; 4012219089Spjd uint64_t min_txg; 4013219089Spjd 4014219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 4015219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 4016219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 4017219089Spjd } else { 4018219089Spjd spa->spa_load_max_txg = max_request; 4019268720Sdelphij if (max_request != UINT64_MAX) 4020268720Sdelphij spa->spa_extreme_rewind = B_TRUE; 4021219089Spjd } 4022219089Spjd 4023332536Smav load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 4024219089Spjd if (load_error == 0) 4025219089Spjd return (0); 4026332547Smav if (load_error == ZFS_ERR_NO_CHECKPOINT) { 4027332547Smav /* 4028332547Smav * When attempting checkpoint-rewind on a pool with no 4029332547Smav * checkpoint, we should not attempt to load uberblocks 4030332547Smav * from previous txgs when spa_load fails.
4031332547Smav */ 4032332547Smav ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4033332547Smav return (load_error); 4034332547Smav } 4035219089Spjd 4036219089Spjd if (spa->spa_root_vdev != NULL) 4037219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4038219089Spjd 4039219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 4040219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 4041219089Spjd 4042219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 4043219089Spjd nvlist_free(config); 4044219089Spjd return (load_error); 4045219089Spjd } 4046219089Spjd 4047236884Smm if (state == SPA_LOAD_RECOVER) { 4048236884Smm /* Price of rolling back is discarding txgs, including log */ 4049219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 4050236884Smm } else { 4051236884Smm /* 4052236884Smm * If we aren't rolling back save the load info from our first 4053236884Smm * import attempt so that we can restore it after attempting 4054236884Smm * to rewind. 4055236884Smm */ 4056236884Smm loadinfo = spa->spa_load_info; 4057236884Smm spa->spa_load_info = fnvlist_alloc(); 4058236884Smm } 4059219089Spjd 4060219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 4061219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 4062219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
4063219089Spjd TXG_INITIAL : safe_rewind_txg; 4064219089Spjd 4065219089Spjd /* 4066219089Spjd * Continue as long as we're finding errors, we're still within 4067219089Spjd * the acceptable rewind range, and we're still finding uberblocks 4068219089Spjd */ 4069219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 4070219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 4071219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 4072219089Spjd spa->spa_extreme_rewind = B_TRUE; 4073332536Smav rewind_error = spa_load_retry(spa, state); 4074219089Spjd } 4075219089Spjd 4076219089Spjd spa->spa_extreme_rewind = B_FALSE; 4077219089Spjd spa->spa_load_max_txg = UINT64_MAX; 4078219089Spjd 4079219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 4080219089Spjd spa_config_set(spa, config); 4081325535Savg else 4082325535Savg nvlist_free(config); 4083219089Spjd 4084236884Smm if (state == SPA_LOAD_RECOVER) { 4085236884Smm ASSERT3P(loadinfo, ==, NULL); 4086236884Smm return (rewind_error); 4087236884Smm } else { 4088236884Smm /* Store the rewind info as part of the initial load info */ 4089236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 4090236884Smm spa->spa_load_info); 4091236884Smm 4092236884Smm /* Restore the initial load info */ 4093236884Smm fnvlist_free(spa->spa_load_info); 4094236884Smm spa->spa_load_info = loadinfo; 4095236884Smm 4096236884Smm return (load_error); 4097236884Smm } 4098219089Spjd} 4099219089Spjd 4100168404Spjd/* 4101168404Spjd * Pool Open/Import 4102168404Spjd * 4103168404Spjd * The import case is identical to an open except that the configuration is sent 4104168404Spjd * down from userland, instead of grabbed from the configuration cache. For the 4105168404Spjd * case of an open, the pool configuration will exist in the 4106185029Spjd * POOL_STATE_UNINITIALIZED state. 
4107168404Spjd * 4108168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 4109168404Spjd * the same time open the pool, without having to keep around the spa_t in some 4110168404Spjd * ambiguous state. 4111168404Spjd */ 4112168404Spjdstatic int 4113219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 4114219089Spjd nvlist_t **config) 4115168404Spjd{ 4116168404Spjd spa_t *spa; 4117219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 4118168404Spjd int error; 4119168404Spjd int locked = B_FALSE; 4120219089Spjd int firstopen = B_FALSE; 4121168404Spjd 4122168404Spjd *spapp = NULL; 4123168404Spjd 4124168404Spjd /* 4125168404Spjd * As disgusting as this is, we need to support recursive calls to this 4126168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 4127168404Spjd * up calling spa_open() again. The real fix is to figure out how to 4128168404Spjd * avoid dsl_dir_open() calling this in the first place. 4129168404Spjd */ 4130168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 4131168404Spjd mutex_enter(&spa_namespace_lock); 4132168404Spjd locked = B_TRUE; 4133168404Spjd } 4134168404Spjd 4135168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 4136168404Spjd if (locked) 4137168404Spjd mutex_exit(&spa_namespace_lock); 4138249195Smm return (SET_ERROR(ENOENT)); 4139168404Spjd } 4140219089Spjd 4141168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 4142332550Smav zpool_load_policy_t policy; 4143168404Spjd 4144219089Spjd firstopen = B_TRUE; 4145219089Spjd 4146332550Smav zpool_get_load_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 4147219089Spjd &policy); 4148332550Smav if (policy.zlp_rewind & ZPOOL_DO_REWIND) 4149219089Spjd state = SPA_LOAD_RECOVER; 4150219089Spjd 4151209962Smm spa_activate(spa, spa_mode_global); 4152168404Spjd 4153219089Spjd if (state != SPA_LOAD_RECOVER) 4154219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4155332536Smav spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 4156168404Spjd 4157332530Smav zfs_dbgmsg("spa_open_common: opening %s", pool); 4158332550Smav error = spa_load_best(spa, state, policy.zlp_txg, 4159332550Smav policy.zlp_rewind); 4160219089Spjd 4161168404Spjd if (error == EBADF) { 4162168404Spjd /* 4163168404Spjd * If vdev_validate() returns failure (indicated by 4164168404Spjd * EBADF), it indicates that one of the vdevs indicates 4165168404Spjd * that the pool has been exported or destroyed. If 4166168404Spjd * this is the case, the config cache is out of sync and 4167168404Spjd * we should remove the pool from the namespace. 4168168404Spjd */ 4169168404Spjd spa_unload(spa); 4170168404Spjd spa_deactivate(spa); 4171332525Smav spa_write_cachefile(spa, B_TRUE, B_TRUE); 4172168404Spjd spa_remove(spa); 4173168404Spjd if (locked) 4174168404Spjd mutex_exit(&spa_namespace_lock); 4175249195Smm return (SET_ERROR(ENOENT)); 4176168404Spjd } 4177168404Spjd 4178168404Spjd if (error) { 4179168404Spjd /* 4180168404Spjd * We can't open the pool, but we still have useful 4181168404Spjd * information: the state of each vdev after the 4182168404Spjd * attempted vdev_open(). Return this to the user. 
4183168404Spjd */ 4184219089Spjd if (config != NULL && spa->spa_config) { 4185219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 4186219089Spjd KM_SLEEP) == 0); 4187219089Spjd VERIFY(nvlist_add_nvlist(*config, 4188219089Spjd ZPOOL_CONFIG_LOAD_INFO, 4189219089Spjd spa->spa_load_info) == 0); 4190219089Spjd } 4191168404Spjd spa_unload(spa); 4192168404Spjd spa_deactivate(spa); 4193219089Spjd spa->spa_last_open_failed = error; 4194168404Spjd if (locked) 4195168404Spjd mutex_exit(&spa_namespace_lock); 4196168404Spjd *spapp = NULL; 4197168404Spjd return (error); 4198168404Spjd } 4199168404Spjd } 4200168404Spjd 4201168404Spjd spa_open_ref(spa, tag); 4202185029Spjd 4203219089Spjd if (config != NULL) 4204219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4205219089Spjd 4206219089Spjd /* 4207219089Spjd * If we've recovered the pool, pass back any information we 4208219089Spjd * gathered while doing the load. 4209219089Spjd */ 4210219089Spjd if (state == SPA_LOAD_RECOVER) { 4211219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 4212219089Spjd spa->spa_load_info) == 0); 4213219089Spjd } 4214219089Spjd 4215219089Spjd if (locked) { 4216219089Spjd spa->spa_last_open_failed = 0; 4217219089Spjd spa->spa_last_ubsync_txg = 0; 4218219089Spjd spa->spa_load_txg = 0; 4219168404Spjd mutex_exit(&spa_namespace_lock); 4220219089Spjd#ifdef __FreeBSD__ 4221219089Spjd#ifdef _KERNEL 4222219089Spjd if (firstopen) 4223249047Savg zvol_create_minors(spa->spa_name); 4224219089Spjd#endif 4225219089Spjd#endif 4226219089Spjd } 4227168404Spjd 4228168404Spjd *spapp = spa; 4229168404Spjd 4230168404Spjd return (0); 4231168404Spjd} 4232168404Spjd 4233168404Spjdint 4234219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 4235219089Spjd nvlist_t **config) 4236219089Spjd{ 4237219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 4238219089Spjd} 4239219089Spjd 4240219089Spjdint 4241168404Spjdspa_open(const char *name, spa_t 
**spapp, void *tag) 4242168404Spjd{ 4243219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 4244168404Spjd} 4245168404Spjd 4246168404Spjd/* 4247168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 4248168404Spjd * preventing it from being exported or destroyed. 4249168404Spjd */ 4250168404Spjdspa_t * 4251168404Spjdspa_inject_addref(char *name) 4252168404Spjd{ 4253168404Spjd spa_t *spa; 4254168404Spjd 4255168404Spjd mutex_enter(&spa_namespace_lock); 4256168404Spjd if ((spa = spa_lookup(name)) == NULL) { 4257168404Spjd mutex_exit(&spa_namespace_lock); 4258168404Spjd return (NULL); 4259168404Spjd } 4260168404Spjd spa->spa_inject_ref++; 4261168404Spjd mutex_exit(&spa_namespace_lock); 4262168404Spjd 4263168404Spjd return (spa); 4264168404Spjd} 4265168404Spjd 4266168404Spjdvoid 4267168404Spjdspa_inject_delref(spa_t *spa) 4268168404Spjd{ 4269168404Spjd mutex_enter(&spa_namespace_lock); 4270168404Spjd spa->spa_inject_ref--; 4271168404Spjd mutex_exit(&spa_namespace_lock); 4272168404Spjd} 4273168404Spjd 4274185029Spjd/* 4275185029Spjd * Add spares device information to the nvlist. 
4276185029Spjd */ 4277168404Spjdstatic void 4278168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 4279168404Spjd{ 4280168404Spjd nvlist_t **spares; 4281168404Spjd uint_t i, nspares; 4282168404Spjd nvlist_t *nvroot; 4283168404Spjd uint64_t guid; 4284168404Spjd vdev_stat_t *vs; 4285168404Spjd uint_t vsc; 4286168404Spjd uint64_t pool; 4287168404Spjd 4288209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 4289209962Smm 4290185029Spjd if (spa->spa_spares.sav_count == 0) 4291168404Spjd return; 4292168404Spjd 4293168404Spjd VERIFY(nvlist_lookup_nvlist(config, 4294168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 4295185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 4296168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 4297168404Spjd if (nspares != 0) { 4298168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 4299168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4300168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 4301168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 4302168404Spjd 4303168404Spjd /* 4304168404Spjd * Go through and find any spares which have since been 4305168404Spjd * repurposed as an active spare. If this is the case, update 4306168404Spjd * their status appropriately. 4307168404Spjd */ 4308168404Spjd for (i = 0; i < nspares; i++) { 4309168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 4310168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 4311185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 4312185029Spjd pool != 0ULL) { 4313168404Spjd VERIFY(nvlist_lookup_uint64_array( 4314219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 4315168404Spjd (uint64_t **)&vs, &vsc) == 0); 4316168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 4317168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 4318168404Spjd } 4319168404Spjd } 4320168404Spjd } 4321168404Spjd} 4322168404Spjd 4323185029Spjd/* 4324185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 
4325185029Spjd */ 4326185029Spjdstatic void 4327185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 4328185029Spjd{ 4329185029Spjd nvlist_t **l2cache; 4330185029Spjd uint_t i, j, nl2cache; 4331185029Spjd nvlist_t *nvroot; 4332185029Spjd uint64_t guid; 4333185029Spjd vdev_t *vd; 4334185029Spjd vdev_stat_t *vs; 4335185029Spjd uint_t vsc; 4336185029Spjd 4337209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 4338209962Smm 4339185029Spjd if (spa->spa_l2cache.sav_count == 0) 4340185029Spjd return; 4341185029Spjd 4342185029Spjd VERIFY(nvlist_lookup_nvlist(config, 4343185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 4344185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 4345185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 4346185029Spjd if (nl2cache != 0) { 4347185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 4348185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4349185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 4350185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 4351185029Spjd 4352185029Spjd /* 4353185029Spjd * Update level 2 cache device stats. 
4354185029Spjd */ 4355185029Spjd 4356185029Spjd for (i = 0; i < nl2cache; i++) { 4357185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 4358185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 4359185029Spjd 4360185029Spjd vd = NULL; 4361185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 4362185029Spjd if (guid == 4363185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 4364185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 4365185029Spjd break; 4366185029Spjd } 4367185029Spjd } 4368185029Spjd ASSERT(vd != NULL); 4369185029Spjd 4370185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 4371219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 4372219089Spjd == 0); 4373185029Spjd vdev_get_stats(vd, vs); 4374185029Spjd } 4375185029Spjd } 4376185029Spjd} 4377185029Spjd 4378236884Smmstatic void 4379346676Smavspa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 4380236884Smm{ 4381236884Smm zap_cursor_t zc; 4382236884Smm zap_attribute_t za; 4383236884Smm 4384253993Smav /* We may be unable to read features if pool is suspended. 
*/ 4385253993Smav if (spa_suspended(spa)) 4386346676Smav return; 4387253993Smav 4388236884Smm if (spa->spa_feat_for_read_obj != 0) { 4389236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 4390236884Smm spa->spa_feat_for_read_obj); 4391236884Smm zap_cursor_retrieve(&zc, &za) == 0; 4392236884Smm zap_cursor_advance(&zc)) { 4393236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 4394236884Smm za.za_num_integers == 1); 4395346676Smav VERIFY0(nvlist_add_uint64(features, za.za_name, 4396236884Smm za.za_first_integer)); 4397236884Smm } 4398236884Smm zap_cursor_fini(&zc); 4399236884Smm } 4400236884Smm 4401236884Smm if (spa->spa_feat_for_write_obj != 0) { 4402236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 4403236884Smm spa->spa_feat_for_write_obj); 4404236884Smm zap_cursor_retrieve(&zc, &za) == 0; 4405236884Smm zap_cursor_advance(&zc)) { 4406236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 4407236884Smm za.za_num_integers == 1); 4408346676Smav VERIFY0(nvlist_add_uint64(features, za.za_name, 4409236884Smm za.za_first_integer)); 4410236884Smm } 4411236884Smm zap_cursor_fini(&zc); 4412236884Smm } 4413346676Smav} 4414236884Smm 4415346676Smavstatic void 4416346676Smavspa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 4417346676Smav{ 4418346676Smav int i; 4419346676Smav 4420346676Smav for (i = 0; i < SPA_FEATURES; i++) { 4421346676Smav zfeature_info_t feature = spa_feature_table[i]; 4422346676Smav uint64_t refcount; 4423346676Smav 4424346676Smav if (feature_get_refcount(spa, &feature, &refcount) != 0) 4425346676Smav continue; 4426346676Smav 4427346676Smav VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 4428346676Smav } 4429236884Smm} 4430236884Smm 4431346676Smav/* 4432346676Smav * Store a list of pool features and their reference counts in the 4433346676Smav * config. 
4434346676Smav * 4435346676Smav * The first time this is called on a spa, allocate a new nvlist, fetch 4436346676Smav * the pool features and reference counts from disk, then save the list 4437346676Smav * in the spa. In subsequent calls on the same spa use the saved nvlist 4438346676Smav * and refresh its values from the cached reference counts. This 4439346676Smav * ensures we don't block here on I/O on a suspended pool so 'zpool 4440346676Smav * clear' can resume the pool. 4441346676Smav */ 4442346676Smavstatic void 4443346676Smavspa_add_feature_stats(spa_t *spa, nvlist_t *config) 4444346676Smav{ 4445346676Smav nvlist_t *features; 4446346676Smav 4447346676Smav ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 4448346676Smav 4449346676Smav mutex_enter(&spa->spa_feat_stats_lock); 4450346676Smav features = spa->spa_feat_stats; 4451346676Smav 4452346676Smav if (features != NULL) { 4453346676Smav spa_feature_stats_from_cache(spa, features); 4454346676Smav } else { 4455346676Smav VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 4456346676Smav spa->spa_feat_stats = features; 4457346676Smav spa_feature_stats_from_disk(spa, features); 4458346676Smav } 4459346676Smav 4460346676Smav VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 4461346676Smav features)); 4462346676Smav 4463346676Smav mutex_exit(&spa->spa_feat_stats_lock); 4464346676Smav} 4465346676Smav 4466168404Spjdint 4467236884Smmspa_get_stats(const char *name, nvlist_t **config, 4468236884Smm char *altroot, size_t buflen) 4469168404Spjd{ 4470168404Spjd int error; 4471168404Spjd spa_t *spa; 4472168404Spjd 4473168404Spjd *config = NULL; 4474219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 4475168404Spjd 4476209962Smm if (spa != NULL) { 4477209962Smm /* 4478209962Smm * This still leaves a window of inconsistency where the spares 4479209962Smm * or l2cache devices could change and the config would be 4480209962Smm * self-inconsistent. 
4481209962Smm */ 4482209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4483168404Spjd 4484209962Smm if (*config != NULL) { 4485219089Spjd uint64_t loadtimes[2]; 4486219089Spjd 4487219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 4488219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 4489219089Spjd VERIFY(nvlist_add_uint64_array(*config, 4490219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 4491219089Spjd 4492185029Spjd VERIFY(nvlist_add_uint64(*config, 4493209962Smm ZPOOL_CONFIG_ERRCOUNT, 4494209962Smm spa_get_errlog_size(spa)) == 0); 4495185029Spjd 4496209962Smm if (spa_suspended(spa)) 4497209962Smm VERIFY(nvlist_add_uint64(*config, 4498209962Smm ZPOOL_CONFIG_SUSPENDED, 4499209962Smm spa->spa_failmode) == 0); 4500209962Smm 4501209962Smm spa_add_spares(spa, *config); 4502209962Smm spa_add_l2cache(spa, *config); 4503236884Smm spa_add_feature_stats(spa, *config); 4504209962Smm } 4505168404Spjd } 4506168404Spjd 4507168404Spjd /* 4508168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 4509168404Spjd * and call spa_lookup() directly. 4510168404Spjd */ 4511168404Spjd if (altroot) { 4512168404Spjd if (spa == NULL) { 4513168404Spjd mutex_enter(&spa_namespace_lock); 4514168404Spjd spa = spa_lookup(name); 4515168404Spjd if (spa) 4516168404Spjd spa_altroot(spa, altroot, buflen); 4517168404Spjd else 4518168404Spjd altroot[0] = '\0'; 4519168404Spjd spa = NULL; 4520168404Spjd mutex_exit(&spa_namespace_lock); 4521168404Spjd } else { 4522168404Spjd spa_altroot(spa, altroot, buflen); 4523168404Spjd } 4524168404Spjd } 4525168404Spjd 4526209962Smm if (spa != NULL) { 4527209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 4528168404Spjd spa_close(spa, FTAG); 4529209962Smm } 4530168404Spjd 4531168404Spjd return (error); 4532168404Spjd} 4533168404Spjd 4534168404Spjd/* 4535185029Spjd * Validate that the auxiliary device array is well formed. We must have an 4536185029Spjd * array of nvlists, each which describes a valid leaf vdev. 
If this is an 4537185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 4538185029Spjd * specified, as long as they are well-formed. 4539168404Spjd */ 4540168404Spjdstatic int 4541185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 4542185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 4543185029Spjd vdev_labeltype_t label) 4544168404Spjd{ 4545185029Spjd nvlist_t **dev; 4546185029Spjd uint_t i, ndev; 4547168404Spjd vdev_t *vd; 4548168404Spjd int error; 4549168404Spjd 4550185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4551185029Spjd 4552168404Spjd /* 4553185029Spjd * It's acceptable to have no devs specified. 4554168404Spjd */ 4555185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 4556168404Spjd return (0); 4557168404Spjd 4558185029Spjd if (ndev == 0) 4559249195Smm return (SET_ERROR(EINVAL)); 4560168404Spjd 4561168404Spjd /* 4562185029Spjd * Make sure the pool is formatted with a version that supports this 4563185029Spjd * device type. 4564168404Spjd */ 4565185029Spjd if (spa_version(spa) < version) 4566249195Smm return (SET_ERROR(ENOTSUP)); 4567168404Spjd 4568168404Spjd /* 4569185029Spjd * Set the pending device list so we correctly handle device in-use 4570168404Spjd * checking. 4571168404Spjd */ 4572185029Spjd sav->sav_pending = dev; 4573185029Spjd sav->sav_npending = ndev; 4574168404Spjd 4575185029Spjd for (i = 0; i < ndev; i++) { 4576185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 4577168404Spjd mode)) != 0) 4578168404Spjd goto out; 4579168404Spjd 4580168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 4581168404Spjd vdev_free(vd); 4582249195Smm error = SET_ERROR(EINVAL); 4583168404Spjd goto out; 4584168404Spjd } 4585168404Spjd 4586185029Spjd /* 4587185029Spjd * The L2ARC currently only supports disk devices in 4588185029Spjd * kernel context. For user-level testing, we allow it. 
4589185029Spjd */ 4590185029Spjd#ifdef _KERNEL 4591185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 4592185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 4593249195Smm error = SET_ERROR(ENOTBLK); 4594230514Smm vdev_free(vd); 4595185029Spjd goto out; 4596185029Spjd } 4597185029Spjd#endif 4598168404Spjd vd->vdev_top = vd; 4599168404Spjd 4600168404Spjd if ((error = vdev_open(vd)) == 0 && 4601185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 4602185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 4603168404Spjd vd->vdev_guid) == 0); 4604168404Spjd } 4605168404Spjd 4606168404Spjd vdev_free(vd); 4607168404Spjd 4608185029Spjd if (error && 4609185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 4610168404Spjd goto out; 4611168404Spjd else 4612168404Spjd error = 0; 4613168404Spjd } 4614168404Spjd 4615168404Spjdout: 4616185029Spjd sav->sav_pending = NULL; 4617185029Spjd sav->sav_npending = 0; 4618168404Spjd return (error); 4619168404Spjd} 4620168404Spjd 4621185029Spjdstatic int 4622185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 4623185029Spjd{ 4624185029Spjd int error; 4625185029Spjd 4626185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4627185029Spjd 4628185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 4629185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 4630185029Spjd VDEV_LABEL_SPARE)) != 0) { 4631185029Spjd return (error); 4632185029Spjd } 4633185029Spjd 4634185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 4635185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 4636185029Spjd VDEV_LABEL_L2CACHE)); 4637185029Spjd} 4638185029Spjd 4639185029Spjdstatic void 4640185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 4641185029Spjd const char *config) 4642185029Spjd{ 4643185029Spjd int i; 4644185029Spjd 4645185029Spjd if (sav->sav_config != 
NULL) { 4646185029Spjd nvlist_t **olddevs; 4647185029Spjd uint_t oldndevs; 4648185029Spjd nvlist_t **newdevs; 4649185029Spjd 4650185029Spjd /* 4651185029Spjd * Generate new dev list by concatentating with the 4652185029Spjd * current dev list. 4653185029Spjd */ 4654185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 4655185029Spjd &olddevs, &oldndevs) == 0); 4656185029Spjd 4657185029Spjd newdevs = kmem_alloc(sizeof (void *) * 4658185029Spjd (ndevs + oldndevs), KM_SLEEP); 4659185029Spjd for (i = 0; i < oldndevs; i++) 4660185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 4661185029Spjd KM_SLEEP) == 0); 4662185029Spjd for (i = 0; i < ndevs; i++) 4663185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 4664185029Spjd KM_SLEEP) == 0); 4665185029Spjd 4666185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 4667185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 4668185029Spjd 4669185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 4670185029Spjd config, newdevs, ndevs + oldndevs) == 0); 4671185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 4672185029Spjd nvlist_free(newdevs[i]); 4673185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 4674185029Spjd } else { 4675185029Spjd /* 4676185029Spjd * Generate a new dev list. 
4677185029Spjd */ 4678185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 4679185029Spjd KM_SLEEP) == 0); 4680185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 4681185029Spjd devs, ndevs) == 0); 4682185029Spjd } 4683185029Spjd} 4684185029Spjd 4685168404Spjd/* 4686185029Spjd * Stop and drop level 2 ARC devices 4687185029Spjd */ 4688185029Spjdvoid 4689185029Spjdspa_l2cache_drop(spa_t *spa) 4690185029Spjd{ 4691185029Spjd vdev_t *vd; 4692185029Spjd int i; 4693185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 4694185029Spjd 4695185029Spjd for (i = 0; i < sav->sav_count; i++) { 4696185029Spjd uint64_t pool; 4697185029Spjd 4698185029Spjd vd = sav->sav_vdevs[i]; 4699185029Spjd ASSERT(vd != NULL); 4700185029Spjd 4701209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 4702209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 4703185029Spjd l2arc_remove_vdev(vd); 4704185029Spjd } 4705185029Spjd} 4706185029Spjd 4707185029Spjd/* 4708168404Spjd * Pool Creation 4709168404Spjd */ 4710168404Spjdint 4711185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 4712248571Smm nvlist_t *zplprops) 4713168404Spjd{ 4714168404Spjd spa_t *spa; 4715185029Spjd char *altroot = NULL; 4716168404Spjd vdev_t *rvd; 4717168404Spjd dsl_pool_t *dp; 4718168404Spjd dmu_tx_t *tx; 4719219089Spjd int error = 0; 4720168404Spjd uint64_t txg = TXG_INITIAL; 4721185029Spjd nvlist_t **spares, **l2cache; 4722185029Spjd uint_t nspares, nl2cache; 4723219089Spjd uint64_t version, obj; 4724236884Smm boolean_t has_features; 4725333194Savg char *poolname; 4726333194Savg nvlist_t *nvl; 4727168404Spjd 4728333194Savg if (nvlist_lookup_string(props, 4729333194Savg zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) 4730333194Savg poolname = (char *)pool; 4731333194Savg 4732168404Spjd /* 4733168404Spjd * If this pool already exists, return failure. 
4734168404Spjd */ 4735168404Spjd mutex_enter(&spa_namespace_lock); 4736333194Savg if (spa_lookup(poolname) != NULL) { 4737168404Spjd mutex_exit(&spa_namespace_lock); 4738249195Smm return (SET_ERROR(EEXIST)); 4739168404Spjd } 4740168404Spjd 4741168404Spjd /* 4742168404Spjd * Allocate a new spa_t structure. 4743168404Spjd */ 4744333194Savg nvl = fnvlist_alloc(); 4745333194Savg fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 4746185029Spjd (void) nvlist_lookup_string(props, 4747185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4748333194Savg spa = spa_add(poolname, nvl, altroot); 4749333194Savg fnvlist_free(nvl); 4750209962Smm spa_activate(spa, spa_mode_global); 4751168404Spjd 4752185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 4753185029Spjd spa_deactivate(spa); 4754185029Spjd spa_remove(spa); 4755185029Spjd mutex_exit(&spa_namespace_lock); 4756185029Spjd return (error); 4757185029Spjd } 4758185029Spjd 4759333194Savg /* 4760333194Savg * Temporary pool names should never be written to disk. 
4761333194Savg */ 4762333194Savg if (poolname != pool) 4763333194Savg spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 4764333194Savg 4765236884Smm has_features = B_FALSE; 4766236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 4767236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 4768236884Smm if (zpool_prop_feature(nvpair_name(elem))) 4769236884Smm has_features = B_TRUE; 4770236884Smm } 4771236884Smm 4772236884Smm if (has_features || nvlist_lookup_uint64(props, 4773236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 4774185029Spjd version = SPA_VERSION; 4775236884Smm } 4776236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 4777219089Spjd 4778219089Spjd spa->spa_first_txg = txg; 4779219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 4780185029Spjd spa->spa_uberblock.ub_version = version; 4781168404Spjd spa->spa_ubsync = spa->spa_uberblock; 4782307277Smav spa->spa_load_state = SPA_LOAD_CREATE; 4783332525Smav spa->spa_removing_phys.sr_state = DSS_NONE; 4784332525Smav spa->spa_removing_phys.sr_removing_vdev = -1; 4785332525Smav spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 4786338403Smav spa->spa_indirect_vdevs_loaded = B_TRUE; 4787168404Spjd 4788168404Spjd /* 4789209962Smm * Create "The Godfather" zio to hold all async IOs 4790209962Smm */ 4791272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 4792272598Sdelphij KM_SLEEP); 4793272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 4794272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 4795272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 4796272598Sdelphij ZIO_FLAG_GODFATHER); 4797272598Sdelphij } 4798209962Smm 4799209962Smm /* 4800168404Spjd * Create the root vdev. 
4801168404Spjd */ 4802185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4803168404Spjd 4804168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 4805168404Spjd 4806168404Spjd ASSERT(error != 0 || rvd != NULL); 4807168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 4808168404Spjd 4809185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 4810249195Smm error = SET_ERROR(EINVAL); 4811168404Spjd 4812168404Spjd if (error == 0 && 4813168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 4814185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 4815168404Spjd VDEV_ALLOC_ADD)) == 0) { 4816219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 4817254591Sgibbs vdev_ashift_optimize(rvd->vdev_child[c]); 4818219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 4819219089Spjd vdev_expand(rvd->vdev_child[c], txg); 4820219089Spjd } 4821168404Spjd } 4822168404Spjd 4823185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4824168404Spjd 4825168404Spjd if (error != 0) { 4826168404Spjd spa_unload(spa); 4827168404Spjd spa_deactivate(spa); 4828168404Spjd spa_remove(spa); 4829168404Spjd mutex_exit(&spa_namespace_lock); 4830168404Spjd return (error); 4831168404Spjd } 4832168404Spjd 4833168404Spjd /* 4834168404Spjd * Get the list of spares, if specified. 
4835168404Spjd */ 4836168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4837168404Spjd &spares, &nspares) == 0) { 4838185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 4839168404Spjd KM_SLEEP) == 0); 4840185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4841168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4842185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4843168404Spjd spa_load_spares(spa); 4844185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4845185029Spjd spa->spa_spares.sav_sync = B_TRUE; 4846168404Spjd } 4847168404Spjd 4848185029Spjd /* 4849185029Spjd * Get the list of level 2 cache devices, if specified. 4850185029Spjd */ 4851185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4852185029Spjd &l2cache, &nl2cache) == 0) { 4853185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4854185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 4855185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4856185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4857185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4858185029Spjd spa_load_l2cache(spa); 4859185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4860185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 4861185029Spjd } 4862185029Spjd 4863236884Smm spa->spa_is_initializing = B_TRUE; 4864185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 4865168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 4866236884Smm spa->spa_is_initializing = B_FALSE; 4867168404Spjd 4868219089Spjd /* 4869219089Spjd * Create DDTs (dedup tables). 4870219089Spjd */ 4871219089Spjd ddt_create(spa); 4872219089Spjd 4873219089Spjd spa_update_dspace(spa); 4874219089Spjd 4875168404Spjd tx = dmu_tx_create_assigned(dp, txg); 4876168404Spjd 4877168404Spjd /* 4878168404Spjd * Create the pool config object. 
4879168404Spjd */ 4880168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 4881185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 4882168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 4883168404Spjd 4884168404Spjd if (zap_add(spa->spa_meta_objset, 4885168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 4886168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 4887168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 4888168404Spjd } 4889168404Spjd 4890236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 4891236884Smm spa_feature_create_zap_objects(spa, tx); 4892236884Smm 4893219089Spjd if (zap_add(spa->spa_meta_objset, 4894219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 4895219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 4896219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 4897219089Spjd } 4898219089Spjd 4899185029Spjd /* Newly created pools with the right version are always deflated. */ 4900185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 4901185029Spjd spa->spa_deflate = TRUE; 4902185029Spjd if (zap_add(spa->spa_meta_objset, 4903185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 4904185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 4905185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 4906185029Spjd } 4907168404Spjd } 4908168404Spjd 4909168404Spjd /* 4910219089Spjd * Create the deferred-free bpobj. Turn off compression 4911168404Spjd * because sync-to-convergence takes longer if the blocksize 4912168404Spjd * keeps changing. 
4913168404Spjd */ 4914219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 4915219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj, 4916168404Spjd ZIO_COMPRESS_OFF, tx); 4917168404Spjd if (zap_add(spa->spa_meta_objset, 4918219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 4919219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) { 4920219089Spjd cmn_err(CE_PANIC, "failed to add bpobj"); 4921168404Spjd } 4922219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 4923219089Spjd spa->spa_meta_objset, obj)); 4924168404Spjd 4925168404Spjd /* 4926168404Spjd * Create the pool's history object. 4927168404Spjd */ 4928185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 4929185029Spjd spa_history_create_obj(spa, tx); 4930168404Spjd 4931185029Spjd /* 4932289422Smav * Generate some random noise for salted checksums to operate on. 4933289422Smav */ 4934289422Smav (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4935289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes)); 4936289422Smav 4937289422Smav /* 4938185029Spjd * Set pool properties. 4939185029Spjd */ 4940185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 4941185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4942185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 4943219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 4944219089Spjd 4945209962Smm if (props != NULL) { 4946209962Smm spa_configfile_set(spa, props, B_FALSE); 4947248571Smm spa_sync_props(props, tx); 4948209962Smm } 4949185029Spjd 4950168404Spjd dmu_tx_commit(tx); 4951168404Spjd 4952168404Spjd spa->spa_sync_on = B_TRUE; 4953168404Spjd txg_sync_start(spa->spa_dsl_pool); 4954168404Spjd 4955168404Spjd /* 4956168404Spjd * We explicitly wait for the first transaction to complete so that our 4957168404Spjd * bean counters are appropriately updated. 
4958168404Spjd */ 4959168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 4960168404Spjd 4961332537Smav spa_spawn_aux_threads(spa); 4962332537Smav 4963332525Smav spa_write_cachefile(spa, B_FALSE, B_TRUE); 4964331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 4965168404Spjd 4966248571Smm spa_history_log_version(spa, "create"); 4967185029Spjd 4968286575Smav /* 4969286575Smav * Don't count references from objsets that are already closed 4970286575Smav * and are making their way through the eviction process. 4971286575Smav */ 4972286575Smav spa_evicting_os_wait(spa); 4973208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 4974307277Smav spa->spa_load_state = SPA_LOAD_NONE; 4975208442Smm 4976168404Spjd mutex_exit(&spa_namespace_lock); 4977168404Spjd 4978168404Spjd return (0); 4979168404Spjd} 4980168404Spjd 4981241286Savg#ifdef _KERNEL 4982277300Ssmh#ifdef illumos 4983185029Spjd/* 4984219089Spjd * Get the root pool information from the root disk, then import the root pool 4985219089Spjd * during the system boot up time. 4986185029Spjd */ 4987219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 4988219089Spjd 4989219089Spjdstatic nvlist_t * 4990219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 4991185029Spjd{ 4992219089Spjd nvlist_t *config; 4993185029Spjd nvlist_t *nvtop, *nvroot; 4994185029Spjd uint64_t pgid; 4995185029Spjd 4996219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 4997219089Spjd return (NULL); 4998219089Spjd 4999168404Spjd /* 5000185029Spjd * Add this top-level vdev to the child array. 
5001168404Spjd */ 5002219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 5003219089Spjd &nvtop) == 0); 5004219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5005219089Spjd &pgid) == 0); 5006219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 5007168404Spjd 5008185029Spjd /* 5009185029Spjd * Put this pool's top-level vdevs into a root vdev. 5010185029Spjd */ 5011185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5012219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 5013219089Spjd VDEV_TYPE_ROOT) == 0); 5014185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 5015185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 5016185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 5017185029Spjd &nvtop, 1) == 0); 5018168404Spjd 5019168404Spjd /* 5020185029Spjd * Replace the existing vdev_tree with the new root vdev in 5021185029Spjd * this pool's configuration (remove the old, add the new). 5022168404Spjd */ 5023185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 5024185029Spjd nvlist_free(nvroot); 5025219089Spjd return (config); 5026185029Spjd} 5027168404Spjd 5028185029Spjd/* 5029219089Spjd * Walk the vdev tree and see if we can find a device with "better" 5030219089Spjd * configuration. A configuration is "better" if the label on that 5031219089Spjd * device has a more recent txg. 
5032185029Spjd */ 5033219089Spjdstatic void 5034219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 5035185029Spjd{ 5036219089Spjd for (int c = 0; c < vd->vdev_children; c++) 5037219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 5038185029Spjd 5039219089Spjd if (vd->vdev_ops->vdev_op_leaf) { 5040219089Spjd nvlist_t *label; 5041219089Spjd uint64_t label_txg; 5042185029Spjd 5043219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 5044219089Spjd &label) != 0) 5045219089Spjd return; 5046185029Spjd 5047219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 5048219089Spjd &label_txg) == 0); 5049168404Spjd 5050219089Spjd /* 5051219089Spjd * Do we have a better boot device? 5052219089Spjd */ 5053219089Spjd if (label_txg > *txg) { 5054219089Spjd *txg = label_txg; 5055219089Spjd *avd = vd; 5056185029Spjd } 5057219089Spjd nvlist_free(label); 5058185029Spjd } 5059185029Spjd} 5060185029Spjd 5061185029Spjd/* 5062185029Spjd * Import a root pool. 5063185029Spjd * 5064185029Spjd * For x86. devpath_list will consist of devid and/or physpath name of 5065185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 5066185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 5067185029Spjd * 5068185029Spjd * For Sparc, devpath_list consists the physpath name of the booting device 5069185029Spjd * no matter the rootpool is a single device pool or a mirrored pool. 5070185029Spjd * e.g. 5071185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 5072185029Spjd */ 5073185029Spjdint 5074185029Spjdspa_import_rootpool(char *devpath, char *devid) 5075185029Spjd{ 5076219089Spjd spa_t *spa; 5077219089Spjd vdev_t *rvd, *bvd, *avd = NULL; 5078219089Spjd nvlist_t *config, *nvtop; 5079219089Spjd uint64_t guid, txg; 5080185029Spjd char *pname; 5081185029Spjd int error; 5082185029Spjd 5083185029Spjd /* 5084219089Spjd * Read the label from the boot device and generate a configuration. 
5085185029Spjd */ 5086219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 5087219089Spjd#if defined(_OBP) && defined(_KERNEL) 5088219089Spjd if (config == NULL) { 5089219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) { 5090219089Spjd /* iscsi boot */ 5091219089Spjd get_iscsi_bootpath_phy(devpath); 5092219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 5093219089Spjd } 5094219089Spjd } 5095219089Spjd#endif 5096219089Spjd if (config == NULL) { 5097236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 5098219089Spjd devpath); 5099249195Smm return (SET_ERROR(EIO)); 5100219089Spjd } 5101185029Spjd 5102219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 5103219089Spjd &pname) == 0); 5104219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 5105185029Spjd 5106209962Smm mutex_enter(&spa_namespace_lock); 5107209962Smm if ((spa = spa_lookup(pname)) != NULL) { 5108209962Smm /* 5109209962Smm * Remove the existing root pool from the namespace so that we 5110209962Smm * can replace it with the correct config we just read in. 5111209962Smm */ 5112209962Smm spa_remove(spa); 5113209962Smm } 5114185029Spjd 5115219089Spjd spa = spa_add(pname, config, NULL); 5116209962Smm spa->spa_is_root = B_TRUE; 5117219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 5118331721Smav if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 5119331721Smav &spa->spa_ubsync.ub_version) != 0) 5120331721Smav spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 5121209962Smm 5122219089Spjd /* 5123219089Spjd * Build up a vdev tree based on the boot device's label config. 
5124219089Spjd */ 5125219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 5126219089Spjd &nvtop) == 0); 5127219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5128219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 5129219089Spjd VDEV_ALLOC_ROOTPOOL); 5130219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 5131219089Spjd if (error) { 5132209962Smm mutex_exit(&spa_namespace_lock); 5133219089Spjd nvlist_free(config); 5134219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 5135219089Spjd pname); 5136219089Spjd return (error); 5137209962Smm } 5138209962Smm 5139219089Spjd /* 5140219089Spjd * Get the boot vdev. 5141219089Spjd */ 5142219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 5143219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 5144219089Spjd (u_longlong_t)guid); 5145249195Smm error = SET_ERROR(ENOENT); 5146219089Spjd goto out; 5147219089Spjd } 5148209962Smm 5149219089Spjd /* 5150219089Spjd * Determine if there is a better boot device. 5151219089Spjd */ 5152219089Spjd avd = bvd; 5153219089Spjd spa_alt_rootvdev(rvd, &avd, &txg); 5154219089Spjd if (avd != bvd) { 5155219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 5156219089Spjd "try booting from '%s'", avd->vdev_path); 5157249195Smm error = SET_ERROR(EINVAL); 5158219089Spjd goto out; 5159219089Spjd } 5160209962Smm 5161219089Spjd /* 5162219089Spjd * If the boot device is part of a spare vdev then ensure that 5163219089Spjd * we're booting off the active spare. 5164219089Spjd */ 5165219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 5166219089Spjd !bvd->vdev_isspare) { 5167219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 5168219089Spjd "try booting from '%s'", 5169219089Spjd bvd->vdev_parent-> 5170219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 5171249195Smm error = SET_ERROR(EINVAL); 5172219089Spjd goto out; 5173219089Spjd } 5174209962Smm 5175219089Spjd error = 0; 5176219089Spjdout: 5177219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5178219089Spjd vdev_free(rvd); 5179219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 5180209962Smm mutex_exit(&spa_namespace_lock); 5181209962Smm 5182219089Spjd nvlist_free(config); 5183219089Spjd return (error); 5184185029Spjd} 5185185029Spjd 5186277300Ssmh#else /* !illumos */ 5187241286Savg 5188243502Savgextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 5189243502Savg uint64_t *count); 5190241286Savg 5191241286Savgstatic nvlist_t * 5192241286Savgspa_generate_rootconf(const char *name) 5193241286Savg{ 5194243502Savg nvlist_t **configs, **tops; 5195241286Savg nvlist_t *config; 5196243502Savg nvlist_t *best_cfg, *nvtop, *nvroot; 5197243502Savg uint64_t *holes; 5198243502Savg uint64_t best_txg; 5199243213Savg uint64_t nchildren; 5200241286Savg uint64_t pgid; 5201243502Savg uint64_t count; 5202243502Savg uint64_t i; 5203243502Savg uint_t nholes; 5204241286Savg 5205243502Savg if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 5206241286Savg return (NULL); 5207241286Savg 5208243502Savg ASSERT3U(count, !=, 0); 5209243502Savg best_txg = 0; 5210243502Savg for (i = 0; i < count; i++) { 5211243502Savg uint64_t txg; 5212243502Savg 5213243502Savg VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 5214243502Savg &txg) == 0); 5215243502Savg if (txg > best_txg) { 5216243502Savg best_txg = txg; 5217243502Savg best_cfg = configs[i]; 5218243502Savg } 5219243502Savg } 5220243502Savg 5221245945Savg nchildren = 1; 5222245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 5223243502Savg holes = NULL; 5224243502Savg nvlist_lookup_uint64_array(best_cfg, 
ZPOOL_CONFIG_HOLE_ARRAY, 5225243502Savg &holes, &nholes); 5226243502Savg 5227244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 5228243502Savg for (i = 0; i < nchildren; i++) { 5229243502Savg if (i >= count) 5230243502Savg break; 5231243502Savg if (configs[i] == NULL) 5232243502Savg continue; 5233243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 5234243502Savg &nvtop) == 0); 5235243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP); 5236243213Savg } 5237243502Savg for (i = 0; holes != NULL && i < nholes; i++) { 5238243502Savg if (i >= nchildren) 5239243502Savg continue; 5240243502Savg if (tops[holes[i]] != NULL) 5241243502Savg continue; 5242243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 5243243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 5244243502Savg VDEV_TYPE_HOLE) == 0); 5245243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 5246243502Savg holes[i]) == 0); 5247243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 5248243502Savg 0) == 0); 5249243502Savg } 5250243502Savg for (i = 0; i < nchildren; i++) { 5251243502Savg if (tops[i] != NULL) 5252243502Savg continue; 5253243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 5254243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 5255243502Savg VDEV_TYPE_MISSING) == 0); 5256243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 5257243502Savg i) == 0); 5258243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 5259243502Savg 0) == 0); 5260243502Savg } 5261243213Savg 5262243213Savg /* 5263243502Savg * Create pool config based on the best vdev config. 5264241286Savg */ 5265243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 5266241286Savg 5267241286Savg /* 5268241286Savg * Put this pool's top-level vdevs into a root vdev. 
5269241286Savg */ 5270243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5271243502Savg &pgid) == 0); 5272241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5273241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 5274241286Savg VDEV_TYPE_ROOT) == 0); 5275241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 5276241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 5277241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 5278243502Savg tops, nchildren) == 0); 5279241286Savg 5280241286Savg /* 5281241286Savg * Replace the existing vdev_tree with the new root vdev in 5282241286Savg * this pool's configuration (remove the old, add the new). 5283241286Savg */ 5284241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 5285243502Savg 5286243502Savg /* 5287243502Savg * Drop vdev config elements that should not be present at pool level. 5288243502Savg */ 5289243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 5290243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 5291243502Savg 5292243502Savg for (i = 0; i < count; i++) 5293243502Savg nvlist_free(configs[i]); 5294243502Savg kmem_free(configs, count * sizeof(void *)); 5295243502Savg for (i = 0; i < nchildren; i++) 5296243502Savg nvlist_free(tops[i]); 5297243502Savg kmem_free(tops, nchildren * sizeof(void *)); 5298241286Savg nvlist_free(nvroot); 5299241286Savg return (config); 5300241286Savg} 5301241286Savg 5302241286Savgint 5303241286Savgspa_import_rootpool(const char *name) 5304241286Savg{ 5305241286Savg spa_t *spa; 5306241286Savg vdev_t *rvd, *bvd, *avd = NULL; 5307241286Savg nvlist_t *config, *nvtop; 5308241286Savg uint64_t txg; 5309241286Savg char *pname; 5310241286Savg int error; 5311241286Savg 5312241286Savg /* 5313241286Savg * Read the label from the boot device and generate a configuration. 
5314241286Savg */ 5315241286Savg config = spa_generate_rootconf(name); 5316243213Savg 5317243213Savg mutex_enter(&spa_namespace_lock); 5318243213Savg if (config != NULL) { 5319243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 5320243213Savg &pname) == 0 && strcmp(name, pname) == 0); 5321243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 5322243213Savg == 0); 5323243213Savg 5324243213Savg if ((spa = spa_lookup(pname)) != NULL) { 5325243213Savg /* 5326323746Savg * The pool could already be imported, 5327323746Savg * e.g., after reboot -r. 5328323746Savg */ 5329323746Savg if (spa->spa_state == POOL_STATE_ACTIVE) { 5330323746Savg mutex_exit(&spa_namespace_lock); 5331323746Savg nvlist_free(config); 5332323746Savg return (0); 5333323746Savg } 5334323746Savg 5335323746Savg /* 5336243213Savg * Remove the existing root pool from the namespace so 5337243213Savg * that we can replace it with the correct config 5338243213Savg * we just read in. 5339243213Savg */ 5340243213Savg spa_remove(spa); 5341243213Savg } 5342243213Savg spa = spa_add(pname, config, NULL); 5343243501Savg 5344243501Savg /* 5345243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 5346243501Savg * via spa_version(). 
5347243501Savg */ 5348243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 5349243501Savg &spa->spa_ubsync.ub_version) != 0) 5350243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 5351243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 5352287100Savg mutex_exit(&spa_namespace_lock); 5353287100Savg nvlist_free(config); 5354241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 5355241286Savg name); 5356241286Savg return (EIO); 5357243213Savg } else { 5358243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 5359241286Savg } 5360241286Savg spa->spa_is_root = B_TRUE; 5361241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 5362241286Savg 5363241286Savg /* 5364241286Savg * Build up a vdev tree based on the boot device's label config. 5365241286Savg */ 5366241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 5367241286Savg &nvtop) == 0); 5368241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5369241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 5370241286Savg VDEV_ALLOC_ROOTPOOL); 5371241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 5372241286Savg if (error) { 5373241286Savg mutex_exit(&spa_namespace_lock); 5374241286Savg nvlist_free(config); 5375241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 5376241286Savg pname); 5377241286Savg return (error); 5378241286Savg } 5379241286Savg 5380241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5381241286Savg vdev_free(rvd); 5382241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 5383241286Savg mutex_exit(&spa_namespace_lock); 5384241286Savg 5385243213Savg nvlist_free(config); 5386243213Savg return (0); 5387241286Savg} 5388241286Savg 5389277300Ssmh#endif /* illumos */ 5390277300Ssmh#endif /* _KERNEL */ 5391219089Spjd 5392209962Smm/* 5393209962Smm * Import a non-root pool into the system. 
5394209962Smm */ 5395185029Spjdint 5396219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 5397185029Spjd{ 5398209962Smm spa_t *spa; 5399209962Smm char *altroot = NULL; 5400219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 5401332550Smav zpool_load_policy_t policy; 5402219089Spjd uint64_t mode = spa_mode_global; 5403219089Spjd uint64_t readonly = B_FALSE; 5404209962Smm int error; 5405209962Smm nvlist_t *nvroot; 5406209962Smm nvlist_t **spares, **l2cache; 5407209962Smm uint_t nspares, nl2cache; 5408209962Smm 5409209962Smm /* 5410209962Smm * If a pool with this name exists, return failure. 5411209962Smm */ 5412209962Smm mutex_enter(&spa_namespace_lock); 5413219089Spjd if (spa_lookup(pool) != NULL) { 5414209962Smm mutex_exit(&spa_namespace_lock); 5415249195Smm return (SET_ERROR(EEXIST)); 5416209962Smm } 5417209962Smm 5418209962Smm /* 5419209962Smm * Create and initialize the spa structure. 5420209962Smm */ 5421209962Smm (void) nvlist_lookup_string(props, 5422209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5423219089Spjd (void) nvlist_lookup_uint64(props, 5424219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 5425219089Spjd if (readonly) 5426219089Spjd mode = FREAD; 5427219089Spjd spa = spa_add(pool, config, altroot); 5428219089Spjd spa->spa_import_flags = flags; 5429209962Smm 5430209962Smm /* 5431219089Spjd * Verbatim import - Take a pool and insert it into the namespace 5432219089Spjd * as if it had been loaded at boot. 
5433219089Spjd */ 5434219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 5435219089Spjd if (props != NULL) 5436219089Spjd spa_configfile_set(spa, props, B_FALSE); 5437219089Spjd 5438332525Smav spa_write_cachefile(spa, B_FALSE, B_TRUE); 5439331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 5440332530Smav zfs_dbgmsg("spa_import: verbatim import of %s", pool); 5441219089Spjd mutex_exit(&spa_namespace_lock); 5442219089Spjd return (0); 5443219089Spjd } 5444219089Spjd 5445219089Spjd spa_activate(spa, mode); 5446219089Spjd 5447219089Spjd /* 5448209962Smm * Don't start async tasks until we know everything is healthy. 5449209962Smm */ 5450209962Smm spa_async_suspend(spa); 5451209962Smm 5452332550Smav zpool_get_load_policy(config, &policy); 5453332550Smav if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5454219089Spjd state = SPA_LOAD_RECOVER; 5455219089Spjd 5456332536Smav spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 5457332536Smav 5458332536Smav if (state != SPA_LOAD_RECOVER) { 5459219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5460332536Smav zfs_dbgmsg("spa_import: importing %s", pool); 5461332536Smav } else { 5462332536Smav zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 5463332550Smav "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 5464332536Smav } 5465332550Smav error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 5466209962Smm 5467219089Spjd /* 5468219089Spjd * Propagate anything learned while loading the pool and pass it 5469219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 5470219089Spjd */ 5471219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 5472219089Spjd spa->spa_load_info) == 0); 5473219089Spjd 5474209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5475209962Smm /* 5476209962Smm * Toss any existing sparelist, as it doesn't have any validity 5477209962Smm * anymore, and conflicts with spa_has_spare(). 
5478209962Smm */ 5479209962Smm if (spa->spa_spares.sav_config) { 5480209962Smm nvlist_free(spa->spa_spares.sav_config); 5481209962Smm spa->spa_spares.sav_config = NULL; 5482209962Smm spa_load_spares(spa); 5483209962Smm } 5484209962Smm if (spa->spa_l2cache.sav_config) { 5485209962Smm nvlist_free(spa->spa_l2cache.sav_config); 5486209962Smm spa->spa_l2cache.sav_config = NULL; 5487209962Smm spa_load_l2cache(spa); 5488209962Smm } 5489209962Smm 5490209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 5491209962Smm &nvroot) == 0); 5492209962Smm if (error == 0) 5493209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 5494209962Smm VDEV_ALLOC_SPARE); 5495209962Smm if (error == 0) 5496209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 5497209962Smm VDEV_ALLOC_L2CACHE); 5498209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5499209962Smm 5500209962Smm if (props != NULL) 5501209962Smm spa_configfile_set(spa, props, B_FALSE); 5502209962Smm 5503209962Smm if (error != 0 || (props && spa_writeable(spa) && 5504209962Smm (error = spa_prop_set(spa, props)))) { 5505209962Smm spa_unload(spa); 5506209962Smm spa_deactivate(spa); 5507209962Smm spa_remove(spa); 5508209962Smm mutex_exit(&spa_namespace_lock); 5509209962Smm return (error); 5510209962Smm } 5511209962Smm 5512209962Smm spa_async_resume(spa); 5513209962Smm 5514209962Smm /* 5515209962Smm * Override any spares and level 2 cache devices as specified by 5516209962Smm * the user, as these may have correct device names/devids, etc. 
5517209962Smm */ 5518209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5519209962Smm &spares, &nspares) == 0) { 5520209962Smm if (spa->spa_spares.sav_config) 5521209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 5522209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 5523209962Smm else 5524209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 5525209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 5526209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 5527209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 5528209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5529209962Smm spa_load_spares(spa); 5530209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5531209962Smm spa->spa_spares.sav_sync = B_TRUE; 5532209962Smm } 5533209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5534209962Smm &l2cache, &nl2cache) == 0) { 5535209962Smm if (spa->spa_l2cache.sav_config) 5536209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 5537209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 5538209962Smm else 5539209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 5540209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 5541209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 5542209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 5543209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5544209962Smm spa_load_l2cache(spa); 5545209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5546209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 5547209962Smm } 5548209962Smm 5549219089Spjd /* 5550219089Spjd * Check for any removed devices. 5551219089Spjd */ 5552219089Spjd if (spa->spa_autoreplace) { 5553219089Spjd spa_aux_check_removed(&spa->spa_spares); 5554219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 5555219089Spjd } 5556219089Spjd 5557209962Smm if (spa_writeable(spa)) { 5558209962Smm /* 5559209962Smm * Update the config cache to include the newly-imported pool. 
5560209962Smm */ 5561209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5562209962Smm } 5563209962Smm 5564219089Spjd /* 5565219089Spjd * It's possible that the pool was expanded while it was exported. 5566219089Spjd * We kick off an async task to handle this for us. 5567219089Spjd */ 5568219089Spjd spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 5569219089Spjd 5570248571Smm spa_history_log_version(spa, "import"); 5571209962Smm 5572331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 5573287745Sdelphij 5574287745Sdelphij mutex_exit(&spa_namespace_lock); 5575287745Sdelphij 5576219089Spjd#ifdef __FreeBSD__ 5577219089Spjd#ifdef _KERNEL 5578219089Spjd zvol_create_minors(pool); 5579219089Spjd#endif 5580219089Spjd#endif 5581209962Smm return (0); 5582185029Spjd} 5583185029Spjd 5584168404Spjdnvlist_t * 5585168404Spjdspa_tryimport(nvlist_t *tryconfig) 5586168404Spjd{ 5587168404Spjd nvlist_t *config = NULL; 5588332536Smav char *poolname, *cachefile; 5589168404Spjd spa_t *spa; 5590168404Spjd uint64_t state; 5591208443Smm int error; 5592332550Smav zpool_load_policy_t policy; 5593168404Spjd 5594168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 5595168404Spjd return (NULL); 5596168404Spjd 5597168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 5598168404Spjd return (NULL); 5599168404Spjd 5600168404Spjd /* 5601168404Spjd * Create and initialize the spa structure. 5602168404Spjd */ 5603168404Spjd mutex_enter(&spa_namespace_lock); 5604219089Spjd spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 5605209962Smm spa_activate(spa, FREAD); 5606168404Spjd 5607168404Spjd /* 5608332550Smav * Rewind pool if a max txg was provided. 
5609168404Spjd */ 5610332550Smav zpool_get_load_policy(spa->spa_config, &policy); 5611332550Smav if (policy.zlp_txg != UINT64_MAX) { 5612332550Smav spa->spa_load_max_txg = policy.zlp_txg; 5613332536Smav spa->spa_extreme_rewind = B_TRUE; 5614332536Smav zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 5615332550Smav poolname, (longlong_t)policy.zlp_txg); 5616332536Smav } else { 5617332536Smav zfs_dbgmsg("spa_tryimport: importing %s", poolname); 5618332536Smav } 5619168404Spjd 5620332536Smav if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 5621332536Smav == 0) { 5622332536Smav zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 5623332536Smav spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5624332536Smav } else { 5625332536Smav spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 5626332536Smav } 5627332536Smav 5628332536Smav error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 5629332536Smav 5630168404Spjd /* 5631168404Spjd * If 'tryconfig' was at least parsable, return the current config. 5632168404Spjd */ 5633168404Spjd if (spa->spa_root_vdev != NULL) { 5634168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5635168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 5636168404Spjd poolname) == 0); 5637168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5638168404Spjd state) == 0); 5639168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 5640168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 5641236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 5642236884Smm spa->spa_load_info) == 0); 5643168404Spjd 5644168404Spjd /* 5645185029Spjd * If the bootfs property exists on this pool then we 5646185029Spjd * copy it out so that external consumers can tell which 5647185029Spjd * pools are bootable. 
5648168404Spjd */ 5649208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 5650185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 5651185029Spjd 5652185029Spjd /* 5653185029Spjd * We have to play games with the name since the 5654185029Spjd * pool was opened as TRYIMPORT_NAME. 5655185029Spjd */ 5656185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 5657185029Spjd spa->spa_bootfs, tmpname) == 0) { 5658185029Spjd char *cp; 5659185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 5660185029Spjd 5661185029Spjd cp = strchr(tmpname, '/'); 5662185029Spjd if (cp == NULL) { 5663185029Spjd (void) strlcpy(dsname, tmpname, 5664185029Spjd MAXPATHLEN); 5665185029Spjd } else { 5666185029Spjd (void) snprintf(dsname, MAXPATHLEN, 5667185029Spjd "%s/%s", poolname, ++cp); 5668185029Spjd } 5669185029Spjd VERIFY(nvlist_add_string(config, 5670185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 5671185029Spjd kmem_free(dsname, MAXPATHLEN); 5672185029Spjd } 5673185029Spjd kmem_free(tmpname, MAXPATHLEN); 5674185029Spjd } 5675185029Spjd 5676185029Spjd /* 5677185029Spjd * Add the list of hot spares and level 2 cache devices. 5678185029Spjd */ 5679209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5680168404Spjd spa_add_spares(spa, config); 5681185029Spjd spa_add_l2cache(spa, config); 5682209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 5683168404Spjd } 5684168404Spjd 5685168404Spjd spa_unload(spa); 5686168404Spjd spa_deactivate(spa); 5687168404Spjd spa_remove(spa); 5688168404Spjd mutex_exit(&spa_namespace_lock); 5689168404Spjd 5690168404Spjd return (config); 5691168404Spjd} 5692168404Spjd 5693168404Spjd/* 5694168404Spjd * Pool export/destroy 5695168404Spjd * 5696168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 5697168404Spjd * is no more pending I/O and any references to the pool are gone. 
Then, we 5698168404Spjd * update the pool state and sync all the labels to disk, removing the 5699207670Smm * configuration from the cache afterwards. If the 'hardforce' flag is set, then 5700207670Smm * we don't sync the labels or remove the configuration cache. 5701168404Spjd */ 5702168404Spjdstatic int 5703185029Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 5704207670Smm boolean_t force, boolean_t hardforce) 5705168404Spjd{ 5706168404Spjd spa_t *spa; 5707168404Spjd 5708168404Spjd if (oldconfig) 5709168404Spjd *oldconfig = NULL; 5710168404Spjd 5711209962Smm if (!(spa_mode_global & FWRITE)) 5712249195Smm return (SET_ERROR(EROFS)); 5713168404Spjd 5714168404Spjd mutex_enter(&spa_namespace_lock); 5715168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 5716168404Spjd mutex_exit(&spa_namespace_lock); 5717249195Smm return (SET_ERROR(ENOENT)); 5718168404Spjd } 5719168404Spjd 5720168404Spjd /* 5721168404Spjd * Put a hold on the pool, drop the namespace lock, stop async tasks, 5722168404Spjd * reacquire the namespace lock, and see if we can export. 5723168404Spjd */ 5724168404Spjd spa_open_ref(spa, FTAG); 5725168404Spjd mutex_exit(&spa_namespace_lock); 5726168404Spjd spa_async_suspend(spa); 5727168404Spjd mutex_enter(&spa_namespace_lock); 5728168404Spjd spa_close(spa, FTAG); 5729168404Spjd 5730168404Spjd /* 5731168404Spjd * The pool will be in core if it's openable, 5732168404Spjd * in which case we can modify its state. 5733168404Spjd */ 5734168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 5735339111Smav 5736168404Spjd /* 5737168404Spjd * Objsets may be open only because they're dirty, so we 5738168404Spjd * have to force it to sync before checking spa_refcnt. 5739168404Spjd */ 5740168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 5741286575Smav spa_evicting_os_wait(spa); 5742168404Spjd 5743168404Spjd /* 5744168404Spjd * A pool cannot be exported or destroyed if there are active 5745168404Spjd * references. 
If we are resetting a pool, allow references by 5746168404Spjd * fault injection handlers. 5747168404Spjd */ 5748168404Spjd if (!spa_refcount_zero(spa) || 5749168404Spjd (spa->spa_inject_ref != 0 && 5750168404Spjd new_state != POOL_STATE_UNINITIALIZED)) { 5751168404Spjd spa_async_resume(spa); 5752168404Spjd mutex_exit(&spa_namespace_lock); 5753249195Smm return (SET_ERROR(EBUSY)); 5754168404Spjd } 5755168404Spjd 5756185029Spjd /* 5757185029Spjd * A pool cannot be exported if it has an active shared spare. 5758185029Spjd * This is to prevent other pools stealing the active spare 5759185029Spjd * from an exported pool. At user's own will, such pool can 5760185029Spjd * be forcedly exported. 5761185029Spjd */ 5762185029Spjd if (!force && new_state == POOL_STATE_EXPORTED && 5763185029Spjd spa_has_active_shared_spare(spa)) { 5764185029Spjd spa_async_resume(spa); 5765185029Spjd mutex_exit(&spa_namespace_lock); 5766249195Smm return (SET_ERROR(EXDEV)); 5767185029Spjd } 5768168404Spjd 5769168404Spjd /* 5770339111Smav * We're about to export or destroy this pool. Make sure 5771339111Smav * we stop all initializtion activity here before we 5772339111Smav * set the spa_final_txg. This will ensure that all 5773339111Smav * dirty data resulting from the initialization is 5774339111Smav * committed to disk before we unload the pool. 5775339111Smav */ 5776339111Smav if (spa->spa_root_vdev != NULL) { 5777339111Smav vdev_initialize_stop_all(spa->spa_root_vdev, 5778339111Smav VDEV_INITIALIZE_ACTIVE); 5779339111Smav } 5780339111Smav 5781339111Smav /* 5782168404Spjd * We want this to be reflected on every label, 5783168404Spjd * so mark them all dirty. spa_unload() will do the 5784168404Spjd * final sync that pushes these changes out. 
5785168404Spjd */ 5786207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 5787185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5788168404Spjd spa->spa_state = new_state; 5789219089Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 5790219089Spjd TXG_DEFER_SIZE + 1; 5791168404Spjd vdev_config_dirty(spa->spa_root_vdev); 5792185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 5793168404Spjd } 5794168404Spjd } 5795168404Spjd 5796331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 5797185029Spjd 5798168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 5799168404Spjd spa_unload(spa); 5800168404Spjd spa_deactivate(spa); 5801168404Spjd } 5802168404Spjd 5803168404Spjd if (oldconfig && spa->spa_config) 5804168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 5805168404Spjd 5806168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 5807207670Smm if (!hardforce) 5808332525Smav spa_write_cachefile(spa, B_TRUE, B_TRUE); 5809168404Spjd spa_remove(spa); 5810168404Spjd } 5811168404Spjd mutex_exit(&spa_namespace_lock); 5812168404Spjd 5813168404Spjd return (0); 5814168404Spjd} 5815168404Spjd 5816168404Spjd/* 5817168404Spjd * Destroy a storage pool. 5818168404Spjd */ 5819168404Spjdint 5820168404Spjdspa_destroy(char *pool) 5821168404Spjd{ 5822207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 5823207670Smm B_FALSE, B_FALSE)); 5824168404Spjd} 5825168404Spjd 5826168404Spjd/* 5827168404Spjd * Export a storage pool. 5828168404Spjd */ 5829168404Spjdint 5830207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 5831207670Smm boolean_t hardforce) 5832168404Spjd{ 5833207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 5834207670Smm force, hardforce)); 5835168404Spjd} 5836168404Spjd 5837168404Spjd/* 5838168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 5839168404Spjd * from the namespace in any way. 
5840168404Spjd */ 5841168404Spjdint 5842168404Spjdspa_reset(char *pool) 5843168404Spjd{ 5844185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 5845207670Smm B_FALSE, B_FALSE)); 5846168404Spjd} 5847168404Spjd 5848168404Spjd/* 5849168404Spjd * ========================================================================== 5850168404Spjd * Device manipulation 5851168404Spjd * ========================================================================== 5852168404Spjd */ 5853168404Spjd 5854168404Spjd/* 5855185029Spjd * Add a device to a storage pool. 5856168404Spjd */ 5857168404Spjdint 5858168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 5859168404Spjd{ 5860219089Spjd uint64_t txg, id; 5861209962Smm int error; 5862168404Spjd vdev_t *rvd = spa->spa_root_vdev; 5863168404Spjd vdev_t *vd, *tvd; 5864185029Spjd nvlist_t **spares, **l2cache; 5865185029Spjd uint_t nspares, nl2cache; 5866168404Spjd 5867219089Spjd ASSERT(spa_writeable(spa)); 5868219089Spjd 5869168404Spjd txg = spa_vdev_enter(spa); 5870168404Spjd 5871168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 5872168404Spjd VDEV_ALLOC_ADD)) != 0) 5873168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5874168404Spjd 5875185029Spjd spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 5876168404Spjd 5877185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 5878185029Spjd &nspares) != 0) 5879168404Spjd nspares = 0; 5880168404Spjd 5881185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 5882185029Spjd &nl2cache) != 0) 5883185029Spjd nl2cache = 0; 5884185029Spjd 5885185029Spjd if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 5886168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5887168404Spjd 5888185029Spjd if (vd->vdev_children != 0 && 5889185029Spjd (error = vdev_create(vd, txg, B_FALSE)) != 0) 5890185029Spjd return (spa_vdev_exit(spa, vd, txg, error)); 5891168404Spjd 5892168404Spjd /* 
5893185029Spjd * We must validate the spares and l2cache devices after checking the 5894185029Spjd * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 5895168404Spjd */ 5896185029Spjd if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 5897168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 5898168404Spjd 5899168404Spjd /* 5900332525Smav * If we are in the middle of a device removal, we can only add 5901332525Smav * devices which match the existing devices in the pool. 5902332525Smav * If we are in the middle of a removal, or have some indirect 5903332525Smav * vdevs, we can not add raidz toplevels. 5904168404Spjd */ 5905332525Smav if (spa->spa_vdev_removal != NULL || 5906332525Smav spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 5907332525Smav for (int c = 0; c < vd->vdev_children; c++) { 5908332525Smav tvd = vd->vdev_child[c]; 5909332525Smav if (spa->spa_vdev_removal != NULL && 5910339106Smav tvd->vdev_ashift != spa->spa_max_ashift) { 5911332525Smav return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5912332525Smav } 5913332525Smav /* Fail if top level vdev is raidz */ 5914332525Smav if (tvd->vdev_ops == &vdev_raidz_ops) { 5915332525Smav return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5916332525Smav } 5917332525Smav /* 5918332525Smav * Need the top level mirror to be 5919332525Smav * a mirror of leaf vdevs only 5920332525Smav */ 5921332525Smav if (tvd->vdev_ops == &vdev_mirror_ops) { 5922332525Smav for (uint64_t cid = 0; 5923332525Smav cid < tvd->vdev_children; cid++) { 5924332525Smav vdev_t *cvd = tvd->vdev_child[cid]; 5925332525Smav if (!cvd->vdev_ops->vdev_op_leaf) { 5926332525Smav return (spa_vdev_exit(spa, vd, 5927332525Smav txg, EINVAL)); 5928332525Smav } 5929332525Smav } 5930332525Smav } 5931332525Smav } 5932332525Smav } 5933332525Smav 5934209962Smm for (int c = 0; c < vd->vdev_children; c++) { 5935219089Spjd 5936219089Spjd /* 5937219089Spjd * Set the vdev id to the first hole, if one exists. 
5938219089Spjd */ 5939219089Spjd for (id = 0; id < rvd->vdev_children; id++) { 5940219089Spjd if (rvd->vdev_child[id]->vdev_ishole) { 5941219089Spjd vdev_free(rvd->vdev_child[id]); 5942219089Spjd break; 5943219089Spjd } 5944219089Spjd } 5945168404Spjd tvd = vd->vdev_child[c]; 5946168404Spjd vdev_remove_child(vd, tvd); 5947219089Spjd tvd->vdev_id = id; 5948168404Spjd vdev_add_child(rvd, tvd); 5949168404Spjd vdev_config_dirty(tvd); 5950168404Spjd } 5951168404Spjd 5952168404Spjd if (nspares != 0) { 5953185029Spjd spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 5954185029Spjd ZPOOL_CONFIG_SPARES); 5955168404Spjd spa_load_spares(spa); 5956185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5957168404Spjd } 5958168404Spjd 5959185029Spjd if (nl2cache != 0) { 5960185029Spjd spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 5961185029Spjd ZPOOL_CONFIG_L2CACHE); 5962185029Spjd spa_load_l2cache(spa); 5963185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5964185029Spjd } 5965185029Spjd 5966168404Spjd /* 5967168404Spjd * We have to be careful when adding new vdevs to an existing pool. 5968168404Spjd * If other threads start allocating from these vdevs before we 5969168404Spjd * sync the config cache, and we lose power, then upon reboot we may 5970168404Spjd * fail to open the pool because there are DVAs that the config cache 5971168404Spjd * can't translate. Therefore, we first add the vdevs without 5972168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 5973168404Spjd * and then let spa_config_update() initialize the new metaslabs. 5974168404Spjd * 5975168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 5976168404Spjd * if we lose power at any point in this sequence, the remaining 5977168404Spjd * steps will be completed the next time we load the pool. 
5978168404Spjd */ 5979168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 5980168404Spjd 5981168404Spjd mutex_enter(&spa_namespace_lock); 5982168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5983331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 5984168404Spjd mutex_exit(&spa_namespace_lock); 5985168404Spjd 5986168404Spjd return (0); 5987168404Spjd} 5988168404Spjd 5989168404Spjd/* 5990168404Spjd * Attach a device to a mirror. The arguments are the path to any device 5991168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 5992168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 5993168404Spjd * 5994168404Spjd * If 'replacing' is specified, the new device is intended to replace the 5995168404Spjd * existing device; in this case the two devices are made into their own 5996185029Spjd * mirror using the 'replacing' vdev, which is functionally identical to 5997168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 5998168404Spjd * extra rules: you can't attach to it after it's been created, and upon 5999168404Spjd * completion of resilvering, the first disk (the one being replaced) 6000168404Spjd * is automatically detached. 
6001168404Spjd */ 6002168404Spjdint 6003168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 6004168404Spjd{ 6005219089Spjd uint64_t txg, dtl_max_txg; 6006168404Spjd vdev_t *rvd = spa->spa_root_vdev; 6007168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 6008168404Spjd vdev_ops_t *pvops; 6009185029Spjd char *oldvdpath, *newvdpath; 6010185029Spjd int newvd_isspare; 6011185029Spjd int error; 6012168404Spjd 6013219089Spjd ASSERT(spa_writeable(spa)); 6014219089Spjd 6015168404Spjd txg = spa_vdev_enter(spa); 6016168404Spjd 6017185029Spjd oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 6018168404Spjd 6019332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6020332547Smav if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6021332547Smav error = (spa_has_checkpoint(spa)) ? 6022332547Smav ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6023332547Smav return (spa_vdev_exit(spa, NULL, txg, error)); 6024332547Smav } 6025332547Smav 6026339106Smav if (spa->spa_vdev_removal != NULL) 6027332525Smav return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 6028332525Smav 6029168404Spjd if (oldvd == NULL) 6030168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 6031168404Spjd 6032168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 6033168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6034168404Spjd 6035168404Spjd pvd = oldvd->vdev_parent; 6036168404Spjd 6037168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 6038230514Smm VDEV_ALLOC_ATTACH)) != 0) 6039185029Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6040185029Spjd 6041185029Spjd if (newrootvd->vdev_children != 1) 6042168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6043168404Spjd 6044168404Spjd newvd = newrootvd->vdev_child[0]; 6045168404Spjd 6046168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 6047168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6048168404Spjd 6049168404Spjd if ((error = vdev_create(newrootvd, 
txg, replacing)) != 0) 6050168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 6051168404Spjd 6052185029Spjd /* 6053185029Spjd * Spares can't replace logs 6054185029Spjd */ 6055185029Spjd if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 6056185029Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6057185029Spjd 6058168404Spjd if (!replacing) { 6059168404Spjd /* 6060168404Spjd * For attach, the only allowable parent is a mirror or the root 6061168404Spjd * vdev. 6062168404Spjd */ 6063168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 6064168404Spjd pvd->vdev_ops != &vdev_root_ops) 6065168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6066168404Spjd 6067168404Spjd pvops = &vdev_mirror_ops; 6068168404Spjd } else { 6069168404Spjd /* 6070168404Spjd * Active hot spares can only be replaced by inactive hot 6071168404Spjd * spares. 6072168404Spjd */ 6073168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 6074219089Spjd oldvd->vdev_isspare && 6075168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 6076168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6077168404Spjd 6078168404Spjd /* 6079168404Spjd * If the source is a hot spare, and the parent isn't already a 6080168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 6081168404Spjd * want to create a replacing vdev. The user is not allowed to 6082168404Spjd * attach to a spared vdev child unless the 'isspare' state is 6083168404Spjd * the same (spare replaces spare, non-spare replaces 6084168404Spjd * non-spare). 
6085168404Spjd */ 6086219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && 6087219089Spjd spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 6088168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6089219089Spjd } else if (pvd->vdev_ops == &vdev_spare_ops && 6090219089Spjd newvd->vdev_isspare != oldvd->vdev_isspare) { 6091168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6092219089Spjd } 6093219089Spjd 6094219089Spjd if (newvd->vdev_isspare) 6095168404Spjd pvops = &vdev_spare_ops; 6096168404Spjd else 6097168404Spjd pvops = &vdev_replacing_ops; 6098168404Spjd } 6099168404Spjd 6100168404Spjd /* 6101219089Spjd * Make sure the new device is big enough. 6102168404Spjd */ 6103219089Spjd if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 6104168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 6105168404Spjd 6106168404Spjd /* 6107168404Spjd * The new device cannot have a higher alignment requirement 6108168404Spjd * than the top-level vdev. 6109168404Spjd */ 6110168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 6111168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 6112168404Spjd 6113168404Spjd /* 6114168404Spjd * If this is an in-place replacement, update oldvd's path and devid 6115168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 
6116168404Spjd */ 6117168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 6118168404Spjd spa_strfree(oldvd->vdev_path); 6119168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 6120168404Spjd KM_SLEEP); 6121168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 6122168404Spjd newvd->vdev_path, "old"); 6123168404Spjd if (oldvd->vdev_devid != NULL) { 6124168404Spjd spa_strfree(oldvd->vdev_devid); 6125168404Spjd oldvd->vdev_devid = NULL; 6126168404Spjd } 6127168404Spjd } 6128168404Spjd 6129219089Spjd /* mark the device being resilvered */ 6130254112Sdelphij newvd->vdev_resilver_txg = txg; 6131219089Spjd 6132168404Spjd /* 6133168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 6134168404Spjd * mirror/replacing/spare vdev above oldvd. 6135168404Spjd */ 6136168404Spjd if (pvd->vdev_ops != pvops) 6137168404Spjd pvd = vdev_add_parent(oldvd, pvops); 6138168404Spjd 6139168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 6140168404Spjd ASSERT(pvd->vdev_ops == pvops); 6141168404Spjd ASSERT(oldvd->vdev_parent == pvd); 6142168404Spjd 6143168404Spjd /* 6144168404Spjd * Extract the new device from its root and add it to pvd. 6145168404Spjd */ 6146168404Spjd vdev_remove_child(newrootvd, newvd); 6147168404Spjd newvd->vdev_id = pvd->vdev_children; 6148219089Spjd newvd->vdev_crtxg = oldvd->vdev_crtxg; 6149168404Spjd vdev_add_child(pvd, newvd); 6150168404Spjd 6151168404Spjd tvd = newvd->vdev_top; 6152168404Spjd ASSERT(pvd->vdev_top == tvd); 6153168404Spjd ASSERT(tvd->vdev_parent == rvd); 6154168404Spjd 6155168404Spjd vdev_config_dirty(tvd); 6156168404Spjd 6157168404Spjd /* 6158219089Spjd * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 6159219089Spjd * for any dmu_sync-ed blocks. It will propagate upward when 6160219089Spjd * spa_vdev_exit() calls vdev_dtl_reassess(). 
6161168404Spjd */ 6162219089Spjd dtl_max_txg = txg + TXG_CONCURRENT_STATES; 6163168404Spjd 6164219089Spjd vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 6165219089Spjd dtl_max_txg - TXG_INITIAL); 6166168404Spjd 6167209962Smm if (newvd->vdev_isspare) { 6168168404Spjd spa_spare_activate(newvd); 6169331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 6170209962Smm } 6171209962Smm 6172185029Spjd oldvdpath = spa_strdup(oldvd->vdev_path); 6173185029Spjd newvdpath = spa_strdup(newvd->vdev_path); 6174185029Spjd newvd_isspare = newvd->vdev_isspare; 6175168404Spjd 6176168404Spjd /* 6177168404Spjd * Mark newvd's DTL dirty in this txg. 6178168404Spjd */ 6179168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 6180168404Spjd 6181219089Spjd /* 6182258717Savg * Schedule the resilver to restart in the future. We do this to 6183258717Savg * ensure that dmu_sync-ed blocks have been stitched into the 6184258717Savg * respective datasets. 6185219089Spjd */ 6186219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 6187168404Spjd 6188287745Sdelphij if (spa->spa_bootfs) 6189331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 6190287745Sdelphij 6191331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 6192287745Sdelphij 6193219089Spjd /* 6194219089Spjd * Commit the config 6195219089Spjd */ 6196219089Spjd (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 6197185029Spjd 6198248571Smm spa_history_log_internal(spa, "vdev attach", NULL, 6199219089Spjd "%s vdev=%s %s vdev=%s", 6200219089Spjd replacing && newvd_isspare ? "spare in" : 6201219089Spjd replacing ? "replace" : "attach", newvdpath, 6202219089Spjd replacing ? "for" : "to", oldvdpath); 6203219089Spjd 6204185029Spjd spa_strfree(oldvdpath); 6205185029Spjd spa_strfree(newvdpath); 6206185029Spjd 6207168404Spjd return (0); 6208168404Spjd} 6209168404Spjd 6210168404Spjd/* 6211168404Spjd * Detach a device from a mirror or replacing vdev. 
6212251631Sdelphij * 6213168404Spjd * If 'replace_done' is specified, only detach if the parent 6214168404Spjd * is a replacing vdev. 6215168404Spjd */ 6216168404Spjdint 6217209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 6218168404Spjd{ 6219168404Spjd uint64_t txg; 6220209962Smm int error; 6221168404Spjd vdev_t *rvd = spa->spa_root_vdev; 6222168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 6223168404Spjd boolean_t unspare = B_FALSE; 6224247187Smm uint64_t unspare_guid = 0; 6225219089Spjd char *vdpath; 6226168404Spjd 6227219089Spjd ASSERT(spa_writeable(spa)); 6228219089Spjd 6229168404Spjd txg = spa_vdev_enter(spa); 6230168404Spjd 6231185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 6232168404Spjd 6233332547Smav /* 6234332547Smav * Besides being called directly from the userland through the 6235332547Smav * ioctl interface, spa_vdev_detach() can be potentially called 6236332547Smav * at the end of spa_vdev_resilver_done(). 6237332547Smav * 6238332547Smav * In the regular case, when we have a checkpoint this shouldn't 6239332547Smav * happen as we never empty the DTLs of a vdev during the scrub 6240332547Smav * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() 6241332547Smav * should never get here when we have a checkpoint. 6242332547Smav * 6243332547Smav * That said, even in a case when we checkpoint the pool exactly 6244332547Smav * as spa_vdev_resilver_done() calls this function everything 6245332547Smav * should be fine as the resilver will return right away. 6246332547Smav */ 6247332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6248332547Smav if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6249332547Smav error = (spa_has_checkpoint(spa)) ? 
6250332547Smav ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6251332547Smav return (spa_vdev_exit(spa, NULL, txg, error)); 6252332547Smav } 6253332547Smav 6254168404Spjd if (vd == NULL) 6255168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 6256168404Spjd 6257168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 6258168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6259168404Spjd 6260168404Spjd pvd = vd->vdev_parent; 6261168404Spjd 6262168404Spjd /* 6263209962Smm * If the parent/child relationship is not as expected, don't do it. 6264209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 6265209962Smm * vdev that's replacing B with C. The user's intent in replacing 6266209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 6267209962Smm * the replace by detaching C, the expected behavior is to end up 6268209962Smm * M(A,B). But suppose that right after deciding to detach C, 6269209962Smm * the replacement of B completes. We would have M(A,C), and then 6270209962Smm * ask to detach C, which would leave us with just A -- not what 6271209962Smm * the user wanted. To prevent this, we make sure that the 6272209962Smm * parent/child relationship hasn't changed -- in this example, 6273209962Smm * that C's parent is still the replacing vdev R. 6274209962Smm */ 6275209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 6276209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 6277209962Smm 6278209962Smm /* 6279219089Spjd * Only 'replacing' or 'spare' vdevs can be replaced. 6280168404Spjd */ 6281219089Spjd if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 6282219089Spjd pvd->vdev_ops != &vdev_spare_ops) 6283219089Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6284168404Spjd 6285168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 6286185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 6287168404Spjd 6288168404Spjd /* 6289168404Spjd * Only mirror, replacing, and spare vdevs support detach. 
6290168404Spjd */ 6291168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 6292168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 6293168404Spjd pvd->vdev_ops != &vdev_spare_ops) 6294168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6295168404Spjd 6296168404Spjd /* 6297209962Smm * If this device has the only valid copy of some data, 6298209962Smm * we cannot safely detach it. 6299168404Spjd */ 6300209962Smm if (vdev_dtl_required(vd)) 6301168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 6302168404Spjd 6303209962Smm ASSERT(pvd->vdev_children >= 2); 6304168404Spjd 6305168404Spjd /* 6306185029Spjd * If we are detaching the second disk from a replacing vdev, then 6307185029Spjd * check to see if we changed the original vdev's path to have "/old" 6308185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 6309168404Spjd */ 6310219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 6311219089Spjd vd->vdev_path != NULL) { 6312219089Spjd size_t len = strlen(vd->vdev_path); 6313219089Spjd 6314219089Spjd for (int c = 0; c < pvd->vdev_children; c++) { 6315219089Spjd cvd = pvd->vdev_child[c]; 6316219089Spjd 6317219089Spjd if (cvd == vd || cvd->vdev_path == NULL) 6318219089Spjd continue; 6319219089Spjd 6320219089Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 6321219089Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 6322219089Spjd spa_strfree(cvd->vdev_path); 6323219089Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 6324219089Spjd break; 6325219089Spjd } 6326185029Spjd } 6327185029Spjd } 6328168404Spjd 6329168404Spjd /* 6330168404Spjd * If we are detaching the original disk from a spare, then it implies 6331168404Spjd * that the spare should become a real disk, and be removed from the 6332168404Spjd * active spare list for the pool. 
6333168404Spjd */ 6334168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 6335219089Spjd vd->vdev_id == 0 && 6336219089Spjd pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 6337168404Spjd unspare = B_TRUE; 6338168404Spjd 6339168404Spjd /* 6340168404Spjd * Erase the disk labels so the disk can be used for other things. 6341168404Spjd * This must be done after all other error cases are handled, 6342168404Spjd * but before we disembowel vd (so we can still do I/O to it). 6343168404Spjd * But if we can't do it, don't treat the error as fatal -- 6344168404Spjd * it may be that the unwritability of the disk is the reason 6345168404Spjd * it's being detached! 6346168404Spjd */ 6347168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 6348168404Spjd 6349168404Spjd /* 6350168404Spjd * Remove vd from its parent and compact the parent's children. 6351168404Spjd */ 6352168404Spjd vdev_remove_child(pvd, vd); 6353168404Spjd vdev_compact_children(pvd); 6354168404Spjd 6355168404Spjd /* 6356168404Spjd * Remember one of the remaining children so we can get tvd below. 6357168404Spjd */ 6358219089Spjd cvd = pvd->vdev_child[pvd->vdev_children - 1]; 6359168404Spjd 6360168404Spjd /* 6361168404Spjd * If we need to remove the remaining child from the list of hot spares, 6362209962Smm * do it now, marking the vdev as no longer a spare in the process. 6363209962Smm * We must do this before vdev_remove_parent(), because that can 6364209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 6365209962Smm * reason, we must remove the spare now, in the same txg as the detach; 6366209962Smm * otherwise someone could attach a new sibling, change the GUID, and 6367209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 
6368168404Spjd */ 6369168404Spjd if (unspare) { 6370168404Spjd ASSERT(cvd->vdev_isspare); 6371168404Spjd spa_spare_remove(cvd); 6372168404Spjd unspare_guid = cvd->vdev_guid; 6373209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 6374219089Spjd cvd->vdev_unspare = B_TRUE; 6375168404Spjd } 6376168404Spjd 6377168404Spjd /* 6378168404Spjd * If the parent mirror/replacing vdev only has one child, 6379168404Spjd * the parent is no longer needed. Remove it from the tree. 6380168404Spjd */ 6381219089Spjd if (pvd->vdev_children == 1) { 6382219089Spjd if (pvd->vdev_ops == &vdev_spare_ops) 6383219089Spjd cvd->vdev_unspare = B_FALSE; 6384168404Spjd vdev_remove_parent(cvd); 6385219089Spjd } 6386168404Spjd 6387219089Spjd 6388168404Spjd /* 6389168404Spjd * We don't set tvd until now because the parent we just removed 6390168404Spjd * may have been the previous top-level vdev. 6391168404Spjd */ 6392168404Spjd tvd = cvd->vdev_top; 6393168404Spjd ASSERT(tvd->vdev_parent == rvd); 6394168404Spjd 6395168404Spjd /* 6396168404Spjd * Reevaluate the parent vdev state. 6397168404Spjd */ 6398185029Spjd vdev_propagate_state(cvd); 6399168404Spjd 6400168404Spjd /* 6401219089Spjd * If the 'autoexpand' property is set on the pool then automatically 6402219089Spjd * try to expand the size of the pool. For example if the device we 6403219089Spjd * just detached was smaller than the others, it may be possible to 6404219089Spjd * add metaslabs (i.e. grow the pool). We need to reopen the vdev 6405219089Spjd * first so that we can obtain the updated sizes of the leaf vdevs. 6406168404Spjd */ 6407219089Spjd if (spa->spa_autoexpand) { 6408219089Spjd vdev_reopen(tvd); 6409219089Spjd vdev_expand(tvd, txg); 6410219089Spjd } 6411168404Spjd 6412168404Spjd vdev_config_dirty(tvd); 6413168404Spjd 6414168404Spjd /* 6415168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 6416168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 
6417168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 6418168404Spjd * prevent vd from being accessed after it's freed. 6419168404Spjd */ 6420219089Spjd vdpath = spa_strdup(vd->vdev_path); 6421209962Smm for (int t = 0; t < TXG_SIZE; t++) 6422168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 6423168404Spjd vd->vdev_detached = B_TRUE; 6424168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 6425168404Spjd 6426331397Smav spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 6427185029Spjd 6428219089Spjd /* hang on to the spa before we release the lock */ 6429219089Spjd spa_open_ref(spa, FTAG); 6430219089Spjd 6431168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 6432168404Spjd 6433248571Smm spa_history_log_internal(spa, "detach", NULL, 6434219089Spjd "vdev=%s", vdpath); 6435219089Spjd spa_strfree(vdpath); 6436219089Spjd 6437168404Spjd /* 6438168404Spjd * If this was the removal of the original device in a hot spare vdev, 6439168404Spjd * then we want to go through and remove the device from the hot spare 6440168404Spjd * list of every other pool. 
6441168404Spjd */ 6442168404Spjd if (unspare) { 6443219089Spjd spa_t *altspa = NULL; 6444219089Spjd 6445168404Spjd mutex_enter(&spa_namespace_lock); 6446219089Spjd while ((altspa = spa_next(altspa)) != NULL) { 6447219089Spjd if (altspa->spa_state != POOL_STATE_ACTIVE || 6448219089Spjd altspa == spa) 6449168404Spjd continue; 6450219089Spjd 6451219089Spjd spa_open_ref(altspa, FTAG); 6452185029Spjd mutex_exit(&spa_namespace_lock); 6453219089Spjd (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 6454185029Spjd mutex_enter(&spa_namespace_lock); 6455219089Spjd spa_close(altspa, FTAG); 6456168404Spjd } 6457168404Spjd mutex_exit(&spa_namespace_lock); 6458219089Spjd 6459219089Spjd /* search the rest of the vdevs for spares to remove */ 6460219089Spjd spa_vdev_resilver_done(spa); 6461168404Spjd } 6462168404Spjd 6463219089Spjd /* all done with the spa; OK to release */ 6464219089Spjd mutex_enter(&spa_namespace_lock); 6465219089Spjd spa_close(spa, FTAG); 6466219089Spjd mutex_exit(&spa_namespace_lock); 6467219089Spjd 6468168404Spjd return (error); 6469168404Spjd} 6470168404Spjd 6471339111Smavint 6472339111Smavspa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type) 6473339111Smav{ 6474339111Smav /* 6475339111Smav * We hold the namespace lock through the whole function 6476339111Smav * to prevent any changes to the pool while we're starting or 6477339111Smav * stopping initialization. The config and state locks are held so that 6478339111Smav * we can properly assess the vdev state before we commit to 6479339111Smav * the initializing operation. 6480339111Smav */ 6481339111Smav mutex_enter(&spa_namespace_lock); 6482339111Smav spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6483339111Smav 6484339111Smav /* Look up vdev and ensure it's a leaf. 
*/ 6485339111Smav vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 6486339111Smav if (vd == NULL || vd->vdev_detached) { 6487339111Smav spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6488339111Smav mutex_exit(&spa_namespace_lock); 6489339111Smav return (SET_ERROR(ENODEV)); 6490339111Smav } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 6491339111Smav spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6492339111Smav mutex_exit(&spa_namespace_lock); 6493339111Smav return (SET_ERROR(EINVAL)); 6494339111Smav } else if (!vdev_writeable(vd)) { 6495339111Smav spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6496339111Smav mutex_exit(&spa_namespace_lock); 6497339111Smav return (SET_ERROR(EROFS)); 6498339111Smav } 6499339111Smav mutex_enter(&vd->vdev_initialize_lock); 6500339111Smav spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6501339111Smav 6502339111Smav /* 6503339111Smav * When we activate an initialize action we check to see 6504339111Smav * if the vdev_initialize_thread is NULL. We do this instead 6505339111Smav * of using the vdev_initialize_state since there might be 6506339111Smav * a previous initialization process which has completed but 6507339111Smav * the thread is not exited. 
6508339111Smav */ 6509339111Smav if (cmd_type == POOL_INITIALIZE_DO && 6510339111Smav (vd->vdev_initialize_thread != NULL || 6511339111Smav vd->vdev_top->vdev_removing)) { 6512339111Smav mutex_exit(&vd->vdev_initialize_lock); 6513339111Smav mutex_exit(&spa_namespace_lock); 6514339111Smav return (SET_ERROR(EBUSY)); 6515339111Smav } else if (cmd_type == POOL_INITIALIZE_CANCEL && 6516339111Smav (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 6517339111Smav vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 6518339111Smav mutex_exit(&vd->vdev_initialize_lock); 6519339111Smav mutex_exit(&spa_namespace_lock); 6520339111Smav return (SET_ERROR(ESRCH)); 6521339111Smav } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 6522339111Smav vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 6523339111Smav mutex_exit(&vd->vdev_initialize_lock); 6524339111Smav mutex_exit(&spa_namespace_lock); 6525339111Smav return (SET_ERROR(ESRCH)); 6526339111Smav } 6527339111Smav 6528339111Smav switch (cmd_type) { 6529339111Smav case POOL_INITIALIZE_DO: 6530339111Smav vdev_initialize(vd); 6531339111Smav break; 6532339111Smav case POOL_INITIALIZE_CANCEL: 6533339111Smav vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); 6534339111Smav break; 6535339111Smav case POOL_INITIALIZE_SUSPEND: 6536339111Smav vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED); 6537339111Smav break; 6538339111Smav default: 6539339111Smav panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 6540339111Smav } 6541339111Smav mutex_exit(&vd->vdev_initialize_lock); 6542339111Smav 6543339111Smav /* Sync out the initializing state */ 6544339111Smav txg_wait_synced(spa->spa_dsl_pool, 0); 6545339111Smav mutex_exit(&spa_namespace_lock); 6546339111Smav 6547339111Smav return (0); 6548339111Smav} 6549339111Smav 6550339111Smav 6551219089Spjd/* 6552219089Spjd * Split a set of devices from their mirrors, and create a new pool from them. 
6553219089Spjd */ 6554219089Spjdint 6555219089Spjdspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 6556219089Spjd nvlist_t *props, boolean_t exp) 6557219089Spjd{ 6558219089Spjd int error = 0; 6559219089Spjd uint64_t txg, *glist; 6560219089Spjd spa_t *newspa; 6561219089Spjd uint_t c, children, lastlog; 6562219089Spjd nvlist_t **child, *nvl, *tmp; 6563219089Spjd dmu_tx_t *tx; 6564219089Spjd char *altroot = NULL; 6565219089Spjd vdev_t *rvd, **vml = NULL; /* vdev modify list */ 6566219089Spjd boolean_t activate_slog; 6567219089Spjd 6568219089Spjd ASSERT(spa_writeable(spa)); 6569219089Spjd 6570219089Spjd txg = spa_vdev_enter(spa); 6571219089Spjd 6572332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6573332547Smav if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6574332547Smav error = (spa_has_checkpoint(spa)) ? 6575332547Smav ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6576332547Smav return (spa_vdev_exit(spa, NULL, txg, error)); 6577332547Smav } 6578332547Smav 6579219089Spjd /* clear the log and flush everything up to now */ 6580219089Spjd activate_slog = spa_passivate_log(spa); 6581219089Spjd (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 6582332525Smav error = spa_reset_logs(spa); 6583219089Spjd txg = spa_vdev_config_enter(spa); 6584219089Spjd 6585219089Spjd if (activate_slog) 6586219089Spjd spa_activate_log(spa); 6587219089Spjd 6588219089Spjd if (error != 0) 6589219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 6590219089Spjd 6591219089Spjd /* check new spa name before going any further */ 6592219089Spjd if (spa_lookup(newname) != NULL) 6593219089Spjd return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 6594219089Spjd 6595219089Spjd /* 6596219089Spjd * scan through all the children to ensure they're all mirrors 6597219089Spjd */ 6598219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 6599219089Spjd nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 6600219089Spjd 
&children) != 0) 6601219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6602219089Spjd 6603219089Spjd /* first, check to ensure we've got the right child count */ 6604219089Spjd rvd = spa->spa_root_vdev; 6605219089Spjd lastlog = 0; 6606219089Spjd for (c = 0; c < rvd->vdev_children; c++) { 6607219089Spjd vdev_t *vd = rvd->vdev_child[c]; 6608219089Spjd 6609219089Spjd /* don't count the holes & logs as children */ 6610332525Smav if (vd->vdev_islog || !vdev_is_concrete(vd)) { 6611219089Spjd if (lastlog == 0) 6612219089Spjd lastlog = c; 6613219089Spjd continue; 6614219089Spjd } 6615219089Spjd 6616219089Spjd lastlog = 0; 6617219089Spjd } 6618219089Spjd if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 6619219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6620219089Spjd 6621219089Spjd /* next, ensure no spare or cache devices are part of the split */ 6622219089Spjd if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 6623219089Spjd nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 6624219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6625219089Spjd 6626219089Spjd vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 6627219089Spjd glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 6628219089Spjd 6629219089Spjd /* then, loop over each vdev and validate it */ 6630219089Spjd for (c = 0; c < children; c++) { 6631219089Spjd uint64_t is_hole = 0; 6632219089Spjd 6633219089Spjd (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 6634219089Spjd &is_hole); 6635219089Spjd 6636219089Spjd if (is_hole != 0) { 6637219089Spjd if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 6638219089Spjd spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 6639219089Spjd continue; 6640219089Spjd } else { 6641249195Smm error = SET_ERROR(EINVAL); 6642219089Spjd break; 6643219089Spjd } 6644219089Spjd } 6645219089Spjd 6646219089Spjd /* which disk is going to be split? 
*/ 6647219089Spjd if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 6648219089Spjd &glist[c]) != 0) { 6649249195Smm error = SET_ERROR(EINVAL); 6650219089Spjd break; 6651219089Spjd } 6652219089Spjd 6653219089Spjd /* look it up in the spa */ 6654219089Spjd vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 6655219089Spjd if (vml[c] == NULL) { 6656249195Smm error = SET_ERROR(ENODEV); 6657219089Spjd break; 6658219089Spjd } 6659219089Spjd 6660219089Spjd /* make sure there's nothing stopping the split */ 6661219089Spjd if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 6662219089Spjd vml[c]->vdev_islog || 6663332525Smav !vdev_is_concrete(vml[c]) || 6664219089Spjd vml[c]->vdev_isspare || 6665219089Spjd vml[c]->vdev_isl2cache || 6666219089Spjd !vdev_writeable(vml[c]) || 6667219089Spjd vml[c]->vdev_children != 0 || 6668219089Spjd vml[c]->vdev_state != VDEV_STATE_HEALTHY || 6669219089Spjd c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 6670249195Smm error = SET_ERROR(EINVAL); 6671219089Spjd break; 6672219089Spjd } 6673219089Spjd 6674219089Spjd if (vdev_dtl_required(vml[c])) { 6675249195Smm error = SET_ERROR(EBUSY); 6676219089Spjd break; 6677219089Spjd } 6678219089Spjd 6679219089Spjd /* we need certain info from the top level */ 6680219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 6681219089Spjd vml[c]->vdev_top->vdev_ms_array) == 0); 6682219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 6683219089Spjd vml[c]->vdev_top->vdev_ms_shift) == 0); 6684219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 6685219089Spjd vml[c]->vdev_top->vdev_asize) == 0); 6686219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 6687219089Spjd vml[c]->vdev_top->vdev_ashift) == 0); 6688299441Smav 6689299441Smav /* transfer per-vdev ZAPs */ 6690299441Smav ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 6691299441Smav VERIFY0(nvlist_add_uint64(child[c], 6692299441Smav ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 
6693299441Smav 6694299441Smav ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 6695299441Smav VERIFY0(nvlist_add_uint64(child[c], 6696299441Smav ZPOOL_CONFIG_VDEV_TOP_ZAP, 6697299441Smav vml[c]->vdev_parent->vdev_top_zap)); 6698219089Spjd } 6699219089Spjd 6700219089Spjd if (error != 0) { 6701219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 6702219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 6703219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 6704219089Spjd } 6705219089Spjd 6706219089Spjd /* stop writers from using the disks */ 6707219089Spjd for (c = 0; c < children; c++) { 6708219089Spjd if (vml[c] != NULL) 6709219089Spjd vml[c]->vdev_offline = B_TRUE; 6710219089Spjd } 6711219089Spjd vdev_reopen(spa->spa_root_vdev); 6712219089Spjd 6713219089Spjd /* 6714219089Spjd * Temporarily record the splitting vdevs in the spa config. This 6715219089Spjd * will disappear once the config is regenerated. 6716219089Spjd */ 6717219089Spjd VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6718219089Spjd VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 6719219089Spjd glist, children) == 0); 6720219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 6721219089Spjd 6722219089Spjd mutex_enter(&spa->spa_props_lock); 6723219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 6724219089Spjd nvl) == 0); 6725219089Spjd mutex_exit(&spa->spa_props_lock); 6726219089Spjd spa->spa_config_splitting = nvl; 6727219089Spjd vdev_config_dirty(spa->spa_root_vdev); 6728219089Spjd 6729219089Spjd /* configure and create the new pool */ 6730219089Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 6731219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 6732219089Spjd exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 6733219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6734219089Spjd spa_version(spa)) == 0); 6735219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 6736219089Spjd spa->spa_config_txg) == 0); 6737219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 6738219089Spjd spa_generate_guid(NULL)) == 0); 6739299441Smav VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 6740219089Spjd (void) nvlist_lookup_string(props, 6741219089Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6742219089Spjd 6743219089Spjd /* add the new pool to the namespace */ 6744219089Spjd newspa = spa_add(newname, config, altroot); 6745299441Smav newspa->spa_avz_action = AVZ_ACTION_REBUILD; 6746219089Spjd newspa->spa_config_txg = spa->spa_config_txg; 6747219089Spjd spa_set_log_state(newspa, SPA_LOG_CLEAR); 6748219089Spjd 6749219089Spjd /* release the spa config lock, retaining the namespace lock */ 6750219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 6751219089Spjd 6752219089Spjd if (zio_injection_enabled) 6753219089Spjd zio_handle_panic_injection(spa, FTAG, 1); 6754219089Spjd 6755219089Spjd spa_activate(newspa, spa_mode_global); 6756219089Spjd spa_async_suspend(newspa); 6757219089Spjd 6758339111Smav for (c = 0; c < children; c++) { 6759339111Smav if (vml[c] != NULL) { 6760339111Smav /* 6761339111Smav * Temporarily stop the initializing activity. We set 6762339111Smav * the state to ACTIVE so that we know to resume 6763339111Smav * the initializing once the split has completed. 
6764339111Smav */ 6765339111Smav mutex_enter(&vml[c]->vdev_initialize_lock); 6766339111Smav vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE); 6767339111Smav mutex_exit(&vml[c]->vdev_initialize_lock); 6768339111Smav } 6769339111Smav } 6770339111Smav 6771277300Ssmh#ifndef illumos 6772219089Spjd /* mark that we are creating new spa by splitting */ 6773219089Spjd newspa->spa_splitting_newspa = B_TRUE; 6774219089Spjd#endif 6775332536Smav newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 6776332536Smav 6777219089Spjd /* create the new pool from the disks of the original pool */ 6778332536Smav error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 6779277300Ssmh#ifndef illumos 6780219089Spjd newspa->spa_splitting_newspa = B_FALSE; 6781219089Spjd#endif 6782219089Spjd if (error) 6783219089Spjd goto out; 6784219089Spjd 6785219089Spjd /* if that worked, generate a real config for the new pool */ 6786219089Spjd if (newspa->spa_root_vdev != NULL) { 6787219089Spjd VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 6788219089Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 6789219089Spjd VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 6790219089Spjd ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 6791219089Spjd spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 6792219089Spjd B_TRUE)); 6793219089Spjd } 6794219089Spjd 6795219089Spjd /* set the props */ 6796219089Spjd if (props != NULL) { 6797219089Spjd spa_configfile_set(newspa, props, B_FALSE); 6798219089Spjd error = spa_prop_set(newspa, props); 6799219089Spjd if (error) 6800219089Spjd goto out; 6801219089Spjd } 6802219089Spjd 6803219089Spjd /* flush everything */ 6804219089Spjd txg = spa_vdev_config_enter(newspa); 6805219089Spjd vdev_config_dirty(newspa->spa_root_vdev); 6806219089Spjd (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 6807219089Spjd 6808219089Spjd if (zio_injection_enabled) 6809219089Spjd zio_handle_panic_injection(spa, FTAG, 2); 6810219089Spjd 6811219089Spjd spa_async_resume(newspa); 
6812219089Spjd 6813219089Spjd /* finally, update the original pool's config */ 6814219089Spjd txg = spa_vdev_config_enter(spa); 6815219089Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 6816219089Spjd error = dmu_tx_assign(tx, TXG_WAIT); 6817219089Spjd if (error != 0) 6818219089Spjd dmu_tx_abort(tx); 6819219089Spjd for (c = 0; c < children; c++) { 6820219089Spjd if (vml[c] != NULL) { 6821219089Spjd vdev_split(vml[c]); 6822219089Spjd if (error == 0) 6823248571Smm spa_history_log_internal(spa, "detach", tx, 6824248571Smm "vdev=%s", vml[c]->vdev_path); 6825299441Smav 6826219089Spjd vdev_free(vml[c]); 6827219089Spjd } 6828219089Spjd } 6829299441Smav spa->spa_avz_action = AVZ_ACTION_REBUILD; 6830219089Spjd vdev_config_dirty(spa->spa_root_vdev); 6831219089Spjd spa->spa_config_splitting = NULL; 6832219089Spjd nvlist_free(nvl); 6833219089Spjd if (error == 0) 6834219089Spjd dmu_tx_commit(tx); 6835219089Spjd (void) spa_vdev_exit(spa, NULL, txg, 0); 6836219089Spjd 6837219089Spjd if (zio_injection_enabled) 6838219089Spjd zio_handle_panic_injection(spa, FTAG, 3); 6839219089Spjd 6840219089Spjd /* split is complete; log a history record */ 6841248571Smm spa_history_log_internal(newspa, "split", NULL, 6842248571Smm "from pool %s", spa_name(spa)); 6843219089Spjd 6844219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 6845219089Spjd 6846219089Spjd /* if we're not going to mount the filesystems in userland, export */ 6847219089Spjd if (exp) 6848219089Spjd error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 6849219089Spjd B_FALSE, B_FALSE); 6850219089Spjd 6851219089Spjd return (error); 6852219089Spjd 6853219089Spjdout: 6854219089Spjd spa_unload(newspa); 6855219089Spjd spa_deactivate(newspa); 6856219089Spjd spa_remove(newspa); 6857219089Spjd 6858219089Spjd txg = spa_vdev_config_enter(spa); 6859219089Spjd 6860219089Spjd /* re-online all offlined disks */ 6861219089Spjd for (c = 0; c < children; c++) { 6862219089Spjd if (vml[c] != NULL) 6863219089Spjd 
vml[c]->vdev_offline = B_FALSE; 6864219089Spjd } 6865339111Smav 6866339111Smav /* restart initializing disks as necessary */ 6867339111Smav spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 6868339111Smav 6869219089Spjd vdev_reopen(spa->spa_root_vdev); 6870219089Spjd 6871219089Spjd nvlist_free(spa->spa_config_splitting); 6872219089Spjd spa->spa_config_splitting = NULL; 6873219089Spjd (void) spa_vdev_exit(spa, NULL, txg, error); 6874219089Spjd 6875219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 6876219089Spjd return (error); 6877219089Spjd} 6878219089Spjd 6879168404Spjd/* 6880185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 6881251631Sdelphij * currently spared, so we can detach it. 6882168404Spjd */ 6883168404Spjdstatic vdev_t * 6884185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 6885168404Spjd{ 6886168404Spjd vdev_t *newvd, *oldvd; 6887168404Spjd 6888219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 6889185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 6890168404Spjd if (oldvd != NULL) 6891168404Spjd return (oldvd); 6892168404Spjd } 6893168404Spjd 6894185029Spjd /* 6895219089Spjd * Check for a completed replacement. We always consider the first 6896219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 6897219089Spjd * the newest (see spa_vdev_attach() for how that works). In 6898219089Spjd * the case where the newest vdev is faulted, we will not automatically 6899219089Spjd * remove it after a resilver completes. This is OK as it will require 6900219089Spjd * user intervention to determine which disk the admin wishes to keep. 
6901185029Spjd */ 6902219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 6903219089Spjd ASSERT(vd->vdev_children > 1); 6904219089Spjd 6905219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 6906168404Spjd oldvd = vd->vdev_child[0]; 6907168404Spjd 6908209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 6909219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 6910209962Smm !vdev_dtl_required(oldvd)) 6911168404Spjd return (oldvd); 6912168404Spjd } 6913168404Spjd 6914185029Spjd /* 6915185029Spjd * Check for a completed resilver with the 'unspare' flag set. 6916339153Smav * Also potentially update faulted state. 6917185029Spjd */ 6918219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 6919219089Spjd vdev_t *first = vd->vdev_child[0]; 6920219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 6921185029Spjd 6922219089Spjd if (last->vdev_unspare) { 6923219089Spjd oldvd = first; 6924219089Spjd newvd = last; 6925219089Spjd } else if (first->vdev_unspare) { 6926219089Spjd oldvd = last; 6927219089Spjd newvd = first; 6928219089Spjd } else { 6929219089Spjd oldvd = NULL; 6930219089Spjd } 6931219089Spjd 6932219089Spjd if (oldvd != NULL && 6933209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 6934219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 6935219089Spjd !vdev_dtl_required(oldvd)) 6936185029Spjd return (oldvd); 6937219089Spjd 6938339153Smav vdev_propagate_state(vd); 6939339153Smav 6940219089Spjd /* 6941219089Spjd * If there are more than two spares attached to a disk, 6942219089Spjd * and those spares are not required, then we want to 6943219089Spjd * attempt to free them up now so that they can be used 6944219089Spjd * by other pools. Once we're back down to a single 6945219089Spjd * disk+spare, we stop removing them. 
6946219089Spjd */ 6947219089Spjd if (vd->vdev_children > 2) { 6948219089Spjd newvd = vd->vdev_child[1]; 6949219089Spjd 6950219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 6951219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 6952219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 6953219089Spjd !vdev_dtl_required(newvd)) 6954219089Spjd return (newvd); 6955185029Spjd } 6956185029Spjd } 6957185029Spjd 6958168404Spjd return (NULL); 6959168404Spjd} 6960168404Spjd 6961168404Spjdstatic void 6962185029Spjdspa_vdev_resilver_done(spa_t *spa) 6963168404Spjd{ 6964209962Smm vdev_t *vd, *pvd, *ppvd; 6965209962Smm uint64_t guid, sguid, pguid, ppguid; 6966168404Spjd 6967209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6968168404Spjd 6969185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 6970209962Smm pvd = vd->vdev_parent; 6971209962Smm ppvd = pvd->vdev_parent; 6972168404Spjd guid = vd->vdev_guid; 6973209962Smm pguid = pvd->vdev_guid; 6974209962Smm ppguid = ppvd->vdev_guid; 6975209962Smm sguid = 0; 6976168404Spjd /* 6977168404Spjd * If we have just finished replacing a hot spared device, then 6978168404Spjd * we need to detach the parent's first child (the original hot 6979168404Spjd * spare) as well. 
6980168404Spjd */ 6981219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 6982219089Spjd ppvd->vdev_children == 2) { 6983168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 6984209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 6985168404Spjd } 6986254112Sdelphij ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 6987254112Sdelphij 6988209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 6989209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 6990168404Spjd return; 6991209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 6992168404Spjd return; 6993209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6994168404Spjd } 6995168404Spjd 6996209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 6997168404Spjd} 6998168404Spjd 6999168404Spjd/* 7000219089Spjd * Update the stored path or FRU for this vdev. 7001168404Spjd */ 7002168404Spjdint 7003209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 7004209962Smm boolean_t ispath) 7005168404Spjd{ 7006185029Spjd vdev_t *vd; 7007219089Spjd boolean_t sync = B_FALSE; 7008168404Spjd 7009219089Spjd ASSERT(spa_writeable(spa)); 7010168404Spjd 7011219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 7012219089Spjd 7013209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 7014219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 7015168404Spjd 7016168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 7017219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 7018168404Spjd 7019209962Smm if (ispath) { 7020219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 7021219089Spjd spa_strfree(vd->vdev_path); 7022219089Spjd vd->vdev_path = spa_strdup(value); 7023219089Spjd sync = B_TRUE; 7024219089Spjd } 7025209962Smm } else { 7026219089Spjd if (vd->vdev_fru == NULL) { 7027219089Spjd vd->vdev_fru = spa_strdup(value); 7028219089Spjd sync = B_TRUE; 7029219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 7030209962Smm spa_strfree(vd->vdev_fru); 
7031219089Spjd vd->vdev_fru = spa_strdup(value); 7032219089Spjd sync = B_TRUE; 7033219089Spjd } 7034209962Smm } 7035168404Spjd 7036219089Spjd return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 7037168404Spjd} 7038168404Spjd 7039209962Smmint 7040209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 7041209962Smm{ 7042209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 7043209962Smm} 7044209962Smm 7045209962Smmint 7046209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 7047209962Smm{ 7048209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 7049209962Smm} 7050209962Smm 7051168404Spjd/* 7052168404Spjd * ========================================================================== 7053219089Spjd * SPA Scanning 7054168404Spjd * ========================================================================== 7055168404Spjd */ 7056324010Savgint 7057324010Savgspa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 7058324010Savg{ 7059324010Savg ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 7060168404Spjd 7061324010Savg if (dsl_scan_resilvering(spa->spa_dsl_pool)) 7062324010Savg return (SET_ERROR(EBUSY)); 7063324010Savg 7064324010Savg return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 7065324010Savg} 7066324010Savg 7067168404Spjdint 7068219089Spjdspa_scan_stop(spa_t *spa) 7069168404Spjd{ 7070185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 7071219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 7072249195Smm return (SET_ERROR(EBUSY)); 7073219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 7074219089Spjd} 7075168404Spjd 7076219089Spjdint 7077219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 7078219089Spjd{ 7079219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 7080219089Spjd 7081219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 7082249195Smm return (SET_ERROR(ENOTSUP)); 7083168404Spjd 7084168404Spjd /* 7085185029Spjd * If a 
resilver was requested, but there is no DTL on a 7086185029Spjd * writeable leaf device, we have nothing to do. 7087168404Spjd */ 7088219089Spjd if (func == POOL_SCAN_RESILVER && 7089185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 7090185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 7091168404Spjd return (0); 7092168404Spjd } 7093168404Spjd 7094219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 7095168404Spjd} 7096168404Spjd 7097168404Spjd/* 7098168404Spjd * ========================================================================== 7099168404Spjd * SPA async task processing 7100168404Spjd * ========================================================================== 7101168404Spjd */ 7102168404Spjd 7103168404Spjdstatic void 7104185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 7105168404Spjd{ 7106185029Spjd if (vd->vdev_remove_wanted) { 7107219089Spjd vd->vdev_remove_wanted = B_FALSE; 7108219089Spjd vd->vdev_delayed_close = B_FALSE; 7109185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 7110209962Smm 7111209962Smm /* 7112209962Smm * We want to clear the stats, but we don't want to do a full 7113209962Smm * vdev_clear() as that will cause us to throw away 7114209962Smm * degraded/faulted state as well as attempt to reopen the 7115209962Smm * device, all of which is a waste. 7116209962Smm */ 7117209962Smm vd->vdev_stat.vs_read_errors = 0; 7118209962Smm vd->vdev_stat.vs_write_errors = 0; 7119209962Smm vd->vdev_stat.vs_checksum_errors = 0; 7120209962Smm 7121185029Spjd vdev_state_dirty(vd->vdev_top); 7122294027Sasomers /* Tell userspace that the vdev is gone. 
*/ 7123294027Sasomers zfs_post_remove(spa, vd); 7124185029Spjd } 7125168404Spjd 7126185029Spjd for (int c = 0; c < vd->vdev_children; c++) 7127185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 7128185029Spjd} 7129168404Spjd 7130185029Spjdstatic void 7131185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 7132185029Spjd{ 7133185029Spjd if (vd->vdev_probe_wanted) { 7134219089Spjd vd->vdev_probe_wanted = B_FALSE; 7135185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 7136168404Spjd } 7137168404Spjd 7138185029Spjd for (int c = 0; c < vd->vdev_children; c++) 7139185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 7140168404Spjd} 7141168404Spjd 7142168404Spjdstatic void 7143219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 7144219089Spjd{ 7145219089Spjd sysevent_id_t eid; 7146219089Spjd nvlist_t *attr; 7147219089Spjd char *physpath; 7148219089Spjd 7149219089Spjd if (!spa->spa_autoexpand) 7150219089Spjd return; 7151219089Spjd 7152219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 7153219089Spjd vdev_t *cvd = vd->vdev_child[c]; 7154219089Spjd spa_async_autoexpand(spa, cvd); 7155219089Spjd } 7156219089Spjd 7157219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 7158219089Spjd return; 7159219089Spjd 7160219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 7161219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 7162219089Spjd 7163219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 7164219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 7165219089Spjd 7166219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 7167219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 7168219089Spjd 7169219089Spjd nvlist_free(attr); 7170219089Spjd kmem_free(physpath, MAXPATHLEN); 7171219089Spjd} 7172219089Spjd 7173219089Spjdstatic void 7174168404Spjdspa_async_thread(void *arg) 7175168404Spjd{ 7176331399Smav spa_t *spa = (spa_t *)arg; 7177168404Spjd 
int tasks; 7178168404Spjd 7179168404Spjd ASSERT(spa->spa_sync_on); 7180168404Spjd 7181168404Spjd mutex_enter(&spa->spa_async_lock); 7182168404Spjd tasks = spa->spa_async_tasks; 7183253990Smav spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 7184168404Spjd mutex_exit(&spa->spa_async_lock); 7185168404Spjd 7186168404Spjd /* 7187168404Spjd * See if the config needs to be updated. 7188168404Spjd */ 7189168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 7190219089Spjd uint64_t old_space, new_space; 7191219089Spjd 7192168404Spjd mutex_enter(&spa_namespace_lock); 7193219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 7194168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 7195219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 7196168404Spjd mutex_exit(&spa_namespace_lock); 7197219089Spjd 7198219089Spjd /* 7199219089Spjd * If the pool grew as a result of the config update, 7200219089Spjd * then log an internal history event. 7201219089Spjd */ 7202219089Spjd if (new_space != old_space) { 7203248571Smm spa_history_log_internal(spa, "vdev online", NULL, 7204219089Spjd "pool '%s' size: %llu(+%llu)", 7205219089Spjd spa_name(spa), new_space, new_space - old_space); 7206219089Spjd } 7207168404Spjd } 7208168404Spjd 7209219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 7210219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 7211219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 7212219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 7213219089Spjd } 7214219089Spjd 7215168404Spjd /* 7216185029Spjd * See if any devices need to be probed. 7217168404Spjd */ 7218185029Spjd if (tasks & SPA_ASYNC_PROBE) { 7219219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 7220185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 7221185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 7222185029Spjd } 7223168404Spjd 7224168404Spjd /* 7225185029Spjd * If any devices are done replacing, detach them. 
7226168404Spjd */ 7227185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 7228185029Spjd spa_vdev_resilver_done(spa); 7229168404Spjd 7230168404Spjd /* 7231168404Spjd * Kick off a resilver. 7232168404Spjd */ 7233168404Spjd if (tasks & SPA_ASYNC_RESILVER) 7234219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 7235168404Spjd 7236339111Smav if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 7237339111Smav mutex_enter(&spa_namespace_lock); 7238339111Smav spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 7239339111Smav vdev_initialize_restart(spa->spa_root_vdev); 7240339111Smav spa_config_exit(spa, SCL_CONFIG, FTAG); 7241339111Smav mutex_exit(&spa_namespace_lock); 7242339111Smav } 7243339111Smav 7244168404Spjd /* 7245168404Spjd * Let the world know that we're done. 7246168404Spjd */ 7247168404Spjd mutex_enter(&spa->spa_async_lock); 7248168404Spjd spa->spa_async_thread = NULL; 7249168404Spjd cv_broadcast(&spa->spa_async_cv); 7250168404Spjd mutex_exit(&spa->spa_async_lock); 7251168404Spjd thread_exit(); 7252168404Spjd} 7253168404Spjd 7254253990Smavstatic void 7255253990Smavspa_async_thread_vd(void *arg) 7256253990Smav{ 7257253990Smav spa_t *spa = arg; 7258253990Smav int tasks; 7259253990Smav 7260253990Smav mutex_enter(&spa->spa_async_lock); 7261253990Smav tasks = spa->spa_async_tasks; 7262253990Smavretry: 7263253990Smav spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 7264253990Smav mutex_exit(&spa->spa_async_lock); 7265253990Smav 7266253990Smav /* 7267253990Smav * See if any devices need to be marked REMOVED. 
7268253990Smav */ 7269253990Smav if (tasks & SPA_ASYNC_REMOVE) { 7270253990Smav spa_vdev_state_enter(spa, SCL_NONE); 7271253990Smav spa_async_remove(spa, spa->spa_root_vdev); 7272253990Smav for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 7273253990Smav spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 7274253990Smav for (int i = 0; i < spa->spa_spares.sav_count; i++) 7275253990Smav spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 7276253990Smav (void) spa_vdev_state_exit(spa, NULL, 0); 7277253990Smav } 7278253990Smav 7279253990Smav /* 7280253990Smav * Let the world know that we're done. 7281253990Smav */ 7282253990Smav mutex_enter(&spa->spa_async_lock); 7283253990Smav tasks = spa->spa_async_tasks; 7284253990Smav if ((tasks & SPA_ASYNC_REMOVE) != 0) 7285253990Smav goto retry; 7286253990Smav spa->spa_async_thread_vd = NULL; 7287253990Smav cv_broadcast(&spa->spa_async_cv); 7288253990Smav mutex_exit(&spa->spa_async_lock); 7289253990Smav thread_exit(); 7290253990Smav} 7291253990Smav 7292168404Spjdvoid 7293168404Spjdspa_async_suspend(spa_t *spa) 7294168404Spjd{ 7295168404Spjd mutex_enter(&spa->spa_async_lock); 7296168404Spjd spa->spa_async_suspended++; 7297332525Smav while (spa->spa_async_thread != NULL || 7298332537Smav spa->spa_async_thread_vd != NULL) 7299168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 7300168404Spjd mutex_exit(&spa->spa_async_lock); 7301332525Smav 7302332525Smav spa_vdev_remove_suspend(spa); 7303332537Smav 7304332537Smav zthr_t *condense_thread = spa->spa_condense_zthr; 7305332537Smav if (condense_thread != NULL && zthr_isrunning(condense_thread)) 7306332537Smav VERIFY0(zthr_cancel(condense_thread)); 7307332547Smav 7308332547Smav zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 7309332547Smav if (discard_thread != NULL && zthr_isrunning(discard_thread)) 7310332547Smav VERIFY0(zthr_cancel(discard_thread)); 7311168404Spjd} 7312168404Spjd 7313168404Spjdvoid 7314168404Spjdspa_async_resume(spa_t *spa) 7315168404Spjd{ 
7316168404Spjd mutex_enter(&spa->spa_async_lock); 7317168404Spjd ASSERT(spa->spa_async_suspended != 0); 7318168404Spjd spa->spa_async_suspended--; 7319168404Spjd mutex_exit(&spa->spa_async_lock); 7320332525Smav spa_restart_removal(spa); 7321332537Smav 7322332537Smav zthr_t *condense_thread = spa->spa_condense_zthr; 7323332537Smav if (condense_thread != NULL && !zthr_isrunning(condense_thread)) 7324332537Smav zthr_resume(condense_thread); 7325332547Smav 7326332547Smav zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 7327332547Smav if (discard_thread != NULL && !zthr_isrunning(discard_thread)) 7328332547Smav zthr_resume(discard_thread); 7329168404Spjd} 7330168404Spjd 7331251636Sdelphijstatic boolean_t 7332251636Sdelphijspa_async_tasks_pending(spa_t *spa) 7333251636Sdelphij{ 7334251636Sdelphij uint_t non_config_tasks; 7335251636Sdelphij uint_t config_task; 7336251636Sdelphij boolean_t config_task_suspended; 7337251636Sdelphij 7338253990Smav non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 7339253990Smav SPA_ASYNC_REMOVE); 7340251636Sdelphij config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 7341251636Sdelphij if (spa->spa_ccw_fail_time == 0) { 7342251636Sdelphij config_task_suspended = B_FALSE; 7343251636Sdelphij } else { 7344251636Sdelphij config_task_suspended = 7345251636Sdelphij (gethrtime() - spa->spa_ccw_fail_time) < 7346251636Sdelphij (zfs_ccw_retry_interval * NANOSEC); 7347251636Sdelphij } 7348251636Sdelphij 7349251636Sdelphij return (non_config_tasks || (config_task && !config_task_suspended)); 7350251636Sdelphij} 7351251636Sdelphij 7352168404Spjdstatic void 7353168404Spjdspa_async_dispatch(spa_t *spa) 7354168404Spjd{ 7355168404Spjd mutex_enter(&spa->spa_async_lock); 7356251636Sdelphij if (spa_async_tasks_pending(spa) && 7357251636Sdelphij !spa->spa_async_suspended && 7358168404Spjd spa->spa_async_thread == NULL && 7359251636Sdelphij rootdir != NULL) 7360168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 
7361168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 7362168404Spjd mutex_exit(&spa->spa_async_lock); 7363168404Spjd} 7364168404Spjd 7365253990Smavstatic void 7366253990Smavspa_async_dispatch_vd(spa_t *spa) 7367253990Smav{ 7368253990Smav mutex_enter(&spa->spa_async_lock); 7369253990Smav if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 7370253990Smav !spa->spa_async_suspended && 7371253990Smav spa->spa_async_thread_vd == NULL && 7372253990Smav rootdir != NULL) 7373253990Smav spa->spa_async_thread_vd = thread_create(NULL, 0, 7374253990Smav spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); 7375253990Smav mutex_exit(&spa->spa_async_lock); 7376253990Smav} 7377253990Smav 7378168404Spjdvoid 7379168404Spjdspa_async_request(spa_t *spa, int task) 7380168404Spjd{ 7381219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 7382168404Spjd mutex_enter(&spa->spa_async_lock); 7383168404Spjd spa->spa_async_tasks |= task; 7384168404Spjd mutex_exit(&spa->spa_async_lock); 7385253990Smav spa_async_dispatch_vd(spa); 7386168404Spjd} 7387168404Spjd 7388168404Spjd/* 7389168404Spjd * ========================================================================== 7390168404Spjd * SPA syncing routines 7391168404Spjd * ========================================================================== 7392168404Spjd */ 7393168404Spjd 7394219089Spjdstatic int 7395219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 7396168404Spjd{ 7397219089Spjd bpobj_t *bpo = arg; 7398219089Spjd bpobj_enqueue(bpo, bp, tx); 7399219089Spjd return (0); 7400219089Spjd} 7401168404Spjd 7402219089Spjdstatic int 7403219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 7404219089Spjd{ 7405219089Spjd zio_t *zio = arg; 7406168404Spjd 7407219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 7408240868Spjd BP_GET_PSIZE(bp), zio->io_flags)); 7409219089Spjd return (0); 7410168404Spjd} 7411168404Spjd 7412258632Savg/* 7413258632Savg 
* Note: this simple function is not inlined to make it easier to dtrace the 7414258632Savg * amount of time spent syncing frees. 7415258632Savg */ 7416168404Spjdstatic void 7417258632Savgspa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 7418258632Savg{ 7419258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 7420258632Savg bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 7421258632Savg VERIFY(zio_wait(zio) == 0); 7422258632Savg} 7423258632Savg 7424258632Savg/* 7425258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 7426258632Savg * amount of time spent syncing deferred frees. 7427258632Savg */ 7428258632Savgstatic void 7429258632Savgspa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 7430258632Savg{ 7431258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 7432258632Savg VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 7433258632Savg spa_free_sync_cb, zio, tx), ==, 0); 7434258632Savg VERIFY0(zio_wait(zio)); 7435258632Savg} 7436258632Savg 7437258632Savg 7438258632Savgstatic void 7439168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 7440168404Spjd{ 7441168404Spjd char *packed = NULL; 7442185029Spjd size_t bufsize; 7443168404Spjd size_t nvsize = 0; 7444168404Spjd dmu_buf_t *db; 7445168404Spjd 7446168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 7447168404Spjd 7448185029Spjd /* 7449185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 7450260150Sdelphij * information. This avoids the dmu_buf_will_dirty() path and 7451185029Spjd * saves us a pre-read to get data we don't actually care about. 
7452185029Spjd */ 7453236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 7454185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 7455168404Spjd 7456168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 7457168404Spjd KM_SLEEP) == 0); 7458185029Spjd bzero(packed + nvsize, bufsize - nvsize); 7459168404Spjd 7460185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 7461168404Spjd 7462185029Spjd kmem_free(packed, bufsize); 7463168404Spjd 7464168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 7465168404Spjd dmu_buf_will_dirty(db, tx); 7466168404Spjd *(uint64_t *)db->db_data = nvsize; 7467168404Spjd dmu_buf_rele(db, FTAG); 7468168404Spjd} 7469168404Spjd 7470168404Spjdstatic void 7471185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 7472185029Spjd const char *config, const char *entry) 7473168404Spjd{ 7474168404Spjd nvlist_t *nvroot; 7475185029Spjd nvlist_t **list; 7476168404Spjd int i; 7477168404Spjd 7478185029Spjd if (!sav->sav_sync) 7479168404Spjd return; 7480168404Spjd 7481168404Spjd /* 7482185029Spjd * Update the MOS nvlist describing the list of available devices. 7483185029Spjd * spa_validate_aux() will have already made sure this nvlist is 7484185029Spjd * valid and the vdevs are labeled appropriately. 
7485168404Spjd */ 7486185029Spjd if (sav->sav_object == 0) { 7487185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 7488185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 7489185029Spjd sizeof (uint64_t), tx); 7490168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 7491185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 7492185029Spjd &sav->sav_object, tx) == 0); 7493168404Spjd } 7494168404Spjd 7495168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 7496185029Spjd if (sav->sav_count == 0) { 7497185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 7498168404Spjd } else { 7499185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 7500185029Spjd for (i = 0; i < sav->sav_count; i++) 7501185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 7502219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 7503185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 7504185029Spjd sav->sav_count) == 0); 7505185029Spjd for (i = 0; i < sav->sav_count; i++) 7506185029Spjd nvlist_free(list[i]); 7507185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 7508168404Spjd } 7509168404Spjd 7510185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 7511168404Spjd nvlist_free(nvroot); 7512168404Spjd 7513185029Spjd sav->sav_sync = B_FALSE; 7514168404Spjd} 7515168404Spjd 7516299441Smav/* 7517299441Smav * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 7518299441Smav * The all-vdev ZAP must be empty. 
7519299441Smav */ 7520168404Spjdstatic void 7521299441Smavspa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 7522299441Smav{ 7523299441Smav spa_t *spa = vd->vdev_spa; 7524299441Smav if (vd->vdev_top_zap != 0) { 7525299441Smav VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 7526299441Smav vd->vdev_top_zap, tx)); 7527299441Smav } 7528299441Smav if (vd->vdev_leaf_zap != 0) { 7529299441Smav VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 7530299441Smav vd->vdev_leaf_zap, tx)); 7531299441Smav } 7532299441Smav for (uint64_t i = 0; i < vd->vdev_children; i++) { 7533299441Smav spa_avz_build(vd->vdev_child[i], avz, tx); 7534299441Smav } 7535299441Smav} 7536299441Smav 7537299441Smavstatic void 7538168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 7539168404Spjd{ 7540168404Spjd nvlist_t *config; 7541168404Spjd 7542299441Smav /* 7543299441Smav * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 7544299441Smav * its config may not be dirty but we still need to build per-vdev ZAPs. 7545299441Smav * Similarly, if the pool is being assembled (e.g. after a split), we 7546299441Smav * need to rebuild the AVZ although the config may not be dirty. 
7547299441Smav */ 7548299441Smav if (list_is_empty(&spa->spa_config_dirty_list) && 7549299441Smav spa->spa_avz_action == AVZ_ACTION_NONE) 7550168404Spjd return; 7551168404Spjd 7552185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 7553168404Spjd 7554299441Smav ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 7555321540Smav spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 7556299441Smav spa->spa_all_vdev_zaps != 0); 7557299441Smav 7558299441Smav if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 7559299441Smav /* Make and build the new AVZ */ 7560299441Smav uint64_t new_avz = zap_create(spa->spa_meta_objset, 7561299441Smav DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 7562299441Smav spa_avz_build(spa->spa_root_vdev, new_avz, tx); 7563299441Smav 7564299441Smav /* Diff old AVZ with new one */ 7565299441Smav zap_cursor_t zc; 7566299441Smav zap_attribute_t za; 7567299441Smav 7568299441Smav for (zap_cursor_init(&zc, spa->spa_meta_objset, 7569299441Smav spa->spa_all_vdev_zaps); 7570299441Smav zap_cursor_retrieve(&zc, &za) == 0; 7571299441Smav zap_cursor_advance(&zc)) { 7572299441Smav uint64_t vdzap = za.za_first_integer; 7573299441Smav if (zap_lookup_int(spa->spa_meta_objset, new_avz, 7574299441Smav vdzap) == ENOENT) { 7575299441Smav /* 7576299441Smav * ZAP is listed in old AVZ but not in new one; 7577299441Smav * destroy it 7578299441Smav */ 7579299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 7580299441Smav tx)); 7581299441Smav } 7582299441Smav } 7583299441Smav 7584299441Smav zap_cursor_fini(&zc); 7585299441Smav 7586299441Smav /* Destroy the old AVZ */ 7587299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, 7588299441Smav spa->spa_all_vdev_zaps, tx)); 7589299441Smav 7590299441Smav /* Replace the old AVZ in the dir obj with the new one */ 7591299441Smav VERIFY0(zap_update(spa->spa_meta_objset, 7592299441Smav DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 7593299441Smav sizeof (new_avz), 1, &new_avz, tx)); 7594299441Smav 7595299441Smav 
spa->spa_all_vdev_zaps = new_avz; 7596299441Smav } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 7597299441Smav zap_cursor_t zc; 7598299441Smav zap_attribute_t za; 7599299441Smav 7600299441Smav /* Walk through the AVZ and destroy all listed ZAPs */ 7601299441Smav for (zap_cursor_init(&zc, spa->spa_meta_objset, 7602299441Smav spa->spa_all_vdev_zaps); 7603299441Smav zap_cursor_retrieve(&zc, &za) == 0; 7604299441Smav zap_cursor_advance(&zc)) { 7605299441Smav uint64_t zap = za.za_first_integer; 7606299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 7607299441Smav } 7608299441Smav 7609299441Smav zap_cursor_fini(&zc); 7610299441Smav 7611299441Smav /* Destroy and unlink the AVZ itself */ 7612299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, 7613299441Smav spa->spa_all_vdev_zaps, tx)); 7614299441Smav VERIFY0(zap_remove(spa->spa_meta_objset, 7615299441Smav DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 7616299441Smav spa->spa_all_vdev_zaps = 0; 7617299441Smav } 7618299441Smav 7619299441Smav if (spa->spa_all_vdev_zaps == 0) { 7620299441Smav spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 7621299441Smav DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 7622299441Smav DMU_POOL_VDEV_ZAP_MAP, tx); 7623299441Smav } 7624299441Smav spa->spa_avz_action = AVZ_ACTION_NONE; 7625299441Smav 7626299441Smav /* Create ZAPs for vdevs that don't have them. */ 7627299441Smav vdev_construct_zaps(spa->spa_root_vdev, tx); 7628299441Smav 7629185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 7630185029Spjd dmu_tx_get_txg(tx), B_FALSE); 7631185029Spjd 7632243505Smm /* 7633243505Smm * If we're upgrading the spa version then make sure that 7634243505Smm * the config object gets updated with the correct version. 
7635243505Smm */ 7636243505Smm if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 7637243505Smm fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 7638243505Smm spa->spa_uberblock.ub_version); 7639243505Smm 7640185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 7641185029Spjd 7642296528Smav nvlist_free(spa->spa_config_syncing); 7643168404Spjd spa->spa_config_syncing = config; 7644168404Spjd 7645168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 7646168404Spjd} 7647168404Spjd 7648236884Smmstatic void 7649248571Smmspa_sync_version(void *arg, dmu_tx_t *tx) 7650236884Smm{ 7651248571Smm uint64_t *versionp = arg; 7652248571Smm uint64_t version = *versionp; 7653248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 7654236884Smm 7655236884Smm /* 7656236884Smm * Setting the version is special cased when first creating the pool. 7657236884Smm */ 7658236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 7659236884Smm 7660247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 7661236884Smm ASSERT(version >= spa_version(spa)); 7662236884Smm 7663236884Smm spa->spa_uberblock.ub_version = version; 7664236884Smm vdev_config_dirty(spa->spa_root_vdev); 7665248571Smm spa_history_log_internal(spa, "set", tx, "version=%lld", version); 7666236884Smm} 7667236884Smm 7668185029Spjd/* 7669185029Spjd * Set zpool properties. 
7670185029Spjd */ 7671168404Spjdstatic void 7672248571Smmspa_sync_props(void *arg, dmu_tx_t *tx) 7673168404Spjd{ 7674248571Smm nvlist_t *nvp = arg; 7675248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 7676185029Spjd objset_t *mos = spa->spa_meta_objset; 7677236884Smm nvpair_t *elem = NULL; 7678168404Spjd 7679168404Spjd mutex_enter(&spa->spa_props_lock); 7680168404Spjd 7681185029Spjd while ((elem = nvlist_next_nvpair(nvp, elem))) { 7682236884Smm uint64_t intval; 7683236884Smm char *strval, *fname; 7684236884Smm zpool_prop_t prop; 7685236884Smm const char *propname; 7686236884Smm zprop_type_t proptype; 7687259813Sdelphij spa_feature_t fid; 7688236884Smm 7689185029Spjd switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 7690329493Smav case ZPOOL_PROP_INVAL: 7691236884Smm /* 7692236884Smm * We checked this earlier in spa_prop_validate(). 7693236884Smm */ 7694236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 7695236884Smm 7696236884Smm fname = strchr(nvpair_name(elem), '@') + 1; 7697259813Sdelphij VERIFY0(zfeature_lookup_name(fname, &fid)); 7698236884Smm 7699259813Sdelphij spa_feature_enable(spa, fid, tx); 7700248571Smm spa_history_log_internal(spa, "set", tx, 7701248571Smm "%s=enabled", nvpair_name(elem)); 7702236884Smm break; 7703236884Smm 7704185029Spjd case ZPOOL_PROP_VERSION: 7705258717Savg intval = fnvpair_value_uint64(elem); 7706185029Spjd /* 7707236884Smm * The version is synced seperatly before other 7708236884Smm * properties and should be correct by now. 7709185029Spjd */ 7710236884Smm ASSERT3U(spa_version(spa), >=, intval); 7711185029Spjd break; 7712168404Spjd 7713185029Spjd case ZPOOL_PROP_ALTROOT: 7714185029Spjd /* 7715185029Spjd * 'altroot' is a non-persistent property. It should 7716185029Spjd * have been set temporarily at creation or import time. 
7717185029Spjd */ 7718185029Spjd ASSERT(spa->spa_root != NULL); 7719185029Spjd break; 7720168404Spjd 7721219089Spjd case ZPOOL_PROP_READONLY: 7722185029Spjd case ZPOOL_PROP_CACHEFILE: 7723185029Spjd /* 7724219089Spjd * 'readonly' and 'cachefile' are also non-persisitent 7725219089Spjd * properties. 7726185029Spjd */ 7727168404Spjd break; 7728228103Smm case ZPOOL_PROP_COMMENT: 7729258717Savg strval = fnvpair_value_string(elem); 7730228103Smm if (spa->spa_comment != NULL) 7731228103Smm spa_strfree(spa->spa_comment); 7732228103Smm spa->spa_comment = spa_strdup(strval); 7733228103Smm /* 7734228103Smm * We need to dirty the configuration on all the vdevs 7735228103Smm * so that their labels get updated. It's unnecessary 7736228103Smm * to do this for pool creation since the vdev's 7737228103Smm * configuratoin has already been dirtied. 7738228103Smm */ 7739228103Smm if (tx->tx_txg != TXG_INITIAL) 7740228103Smm vdev_config_dirty(spa->spa_root_vdev); 7741248571Smm spa_history_log_internal(spa, "set", tx, 7742248571Smm "%s=%s", nvpair_name(elem), strval); 7743228103Smm break; 7744185029Spjd default: 7745185029Spjd /* 7746185029Spjd * Set pool property values in the poolprops mos object. 
7747185029Spjd */ 7748185029Spjd if (spa->spa_pool_props_object == 0) { 7749236884Smm spa->spa_pool_props_object = 7750236884Smm zap_create_link(mos, DMU_OT_POOL_PROPS, 7751185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 7752236884Smm tx); 7753185029Spjd } 7754185029Spjd 7755185029Spjd /* normalize the property name */ 7756185029Spjd propname = zpool_prop_to_name(prop); 7757185029Spjd proptype = zpool_prop_get_type(prop); 7758185029Spjd 7759185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 7760185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 7761258717Savg strval = fnvpair_value_string(elem); 7762258717Savg VERIFY0(zap_update(mos, 7763185029Spjd spa->spa_pool_props_object, propname, 7764258717Savg 1, strlen(strval) + 1, strval, tx)); 7765248571Smm spa_history_log_internal(spa, "set", tx, 7766248571Smm "%s=%s", nvpair_name(elem), strval); 7767185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 7768258717Savg intval = fnvpair_value_uint64(elem); 7769185029Spjd 7770185029Spjd if (proptype == PROP_TYPE_INDEX) { 7771185029Spjd const char *unused; 7772258717Savg VERIFY0(zpool_prop_index_to_string( 7773258717Savg prop, intval, &unused)); 7774185029Spjd } 7775258717Savg VERIFY0(zap_update(mos, 7776185029Spjd spa->spa_pool_props_object, propname, 7777258717Savg 8, 1, &intval, tx)); 7778248571Smm spa_history_log_internal(spa, "set", tx, 7779248571Smm "%s=%lld", nvpair_name(elem), intval); 7780185029Spjd } else { 7781185029Spjd ASSERT(0); /* not allowed */ 7782185029Spjd } 7783185029Spjd 7784185029Spjd switch (prop) { 7785185029Spjd case ZPOOL_PROP_DELEGATION: 7786185029Spjd spa->spa_delegation = intval; 7787185029Spjd break; 7788185029Spjd case ZPOOL_PROP_BOOTFS: 7789185029Spjd spa->spa_bootfs = intval; 7790185029Spjd break; 7791185029Spjd case ZPOOL_PROP_FAILUREMODE: 7792185029Spjd spa->spa_failmode = intval; 7793185029Spjd break; 7794219089Spjd case ZPOOL_PROP_AUTOEXPAND: 7795219089Spjd spa->spa_autoexpand = intval; 7796219089Spjd if (tx->tx_txg != 
TXG_INITIAL) 7797219089Spjd spa_async_request(spa, 7798219089Spjd SPA_ASYNC_AUTOEXPAND); 7799219089Spjd break; 7800219089Spjd case ZPOOL_PROP_DEDUPDITTO: 7801219089Spjd spa->spa_dedup_ditto = intval; 7802219089Spjd break; 7803185029Spjd default: 7804185029Spjd break; 7805185029Spjd } 7806168404Spjd } 7807185029Spjd 7808168404Spjd } 7809185029Spjd 7810185029Spjd mutex_exit(&spa->spa_props_lock); 7811168404Spjd} 7812168404Spjd 7813168404Spjd/* 7814219089Spjd * Perform one-time upgrade on-disk changes. spa_version() does not 7815219089Spjd * reflect the new version this txg, so there must be no changes this 7816219089Spjd * txg to anything that the upgrade code depends on after it executes. 7817219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync 7818219089Spjd * tasks. 7819219089Spjd */ 7820219089Spjdstatic void 7821219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 7822219089Spjd{ 7823219089Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 7824219089Spjd 7825219089Spjd ASSERT(spa->spa_sync_pass == 1); 7826219089Spjd 7827248571Smm rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 7828248571Smm 7829219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 7830219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 7831219089Spjd dsl_pool_create_origin(dp, tx); 7832219089Spjd 7833219089Spjd /* Keeping the origin open increases spa_minref */ 7834219089Spjd spa->spa_minref += 3; 7835219089Spjd } 7836219089Spjd 7837219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 7838219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 7839219089Spjd dsl_pool_upgrade_clones(dp, tx); 7840219089Spjd } 7841219089Spjd 7842219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 7843219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 7844219089Spjd dsl_pool_upgrade_dir_clones(dp, tx); 7845219089Spjd 7846219089Spjd /* Keeping the freedir open increases spa_minref */ 7847219089Spjd 
spa->spa_minref += 3; 7848219089Spjd } 7849236884Smm 7850236884Smm if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 7851236884Smm spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 7852236884Smm spa_feature_create_zap_objects(spa, tx); 7853236884Smm } 7854268126Sdelphij 7855268126Sdelphij /* 7856268126Sdelphij * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 7857268126Sdelphij * when possibility to use lz4 compression for metadata was added 7858268126Sdelphij * Old pools that have this feature enabled must be upgraded to have 7859268126Sdelphij * this feature active 7860268126Sdelphij */ 7861268126Sdelphij if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 7862268126Sdelphij boolean_t lz4_en = spa_feature_is_enabled(spa, 7863268126Sdelphij SPA_FEATURE_LZ4_COMPRESS); 7864268126Sdelphij boolean_t lz4_ac = spa_feature_is_active(spa, 7865268126Sdelphij SPA_FEATURE_LZ4_COMPRESS); 7866268126Sdelphij 7867268126Sdelphij if (lz4_en && !lz4_ac) 7868268126Sdelphij spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 7869268126Sdelphij } 7870289422Smav 7871289422Smav /* 7872289422Smav * If we haven't written the salt, do so now. Note that the 7873289422Smav * feature may not be activated yet, but that's fine since 7874289422Smav * the presence of this ZAP entry is backwards compatible. 
7875289422Smav */ 7876289422Smav if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 7877289422Smav DMU_POOL_CHECKSUM_SALT) == ENOENT) { 7878289422Smav VERIFY0(zap_add(spa->spa_meta_objset, 7879289422Smav DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 7880289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes), 7881289422Smav spa->spa_cksum_salt.zcs_bytes, tx)); 7882289422Smav } 7883289422Smav 7884248571Smm rrw_exit(&dp->dp_config_rwlock, FTAG); 7885219089Spjd} 7886219089Spjd 7887332525Smavstatic void 7888332525Smavvdev_indirect_state_sync_verify(vdev_t *vd) 7889332525Smav{ 7890332525Smav vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 7891332525Smav vdev_indirect_births_t *vib = vd->vdev_indirect_births; 7892332525Smav 7893332525Smav if (vd->vdev_ops == &vdev_indirect_ops) { 7894332525Smav ASSERT(vim != NULL); 7895332525Smav ASSERT(vib != NULL); 7896332525Smav } 7897332525Smav 7898332525Smav if (vdev_obsolete_sm_object(vd) != 0) { 7899332525Smav ASSERT(vd->vdev_obsolete_sm != NULL); 7900332525Smav ASSERT(vd->vdev_removing || 7901332525Smav vd->vdev_ops == &vdev_indirect_ops); 7902332525Smav ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 7903332525Smav ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 7904332525Smav 7905332525Smav ASSERT3U(vdev_obsolete_sm_object(vd), ==, 7906332525Smav space_map_object(vd->vdev_obsolete_sm)); 7907332525Smav ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 7908332525Smav space_map_allocated(vd->vdev_obsolete_sm)); 7909332525Smav } 7910332525Smav ASSERT(vd->vdev_obsolete_segments != NULL); 7911332525Smav 7912332525Smav /* 7913332525Smav * Since frees / remaps to an indirect vdev can only 7914332525Smav * happen in syncing context, the obsolete segments 7915332525Smav * tree must be empty when we start syncing. 
7916332525Smav */ 7917332525Smav ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 7918332525Smav} 7919332525Smav 7920219089Spjd/* 7921168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 7922168404Spjd * part of the process, so we iterate until it converges. 7923168404Spjd */ 7924168404Spjdvoid 7925168404Spjdspa_sync(spa_t *spa, uint64_t txg) 7926168404Spjd{ 7927168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 7928168404Spjd objset_t *mos = spa->spa_meta_objset; 7929219089Spjd bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 7930168404Spjd vdev_t *rvd = spa->spa_root_vdev; 7931168404Spjd vdev_t *vd; 7932168404Spjd dmu_tx_t *tx; 7933185029Spjd int error; 7934307277Smav uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 7935307277Smav zfs_vdev_queue_depth_pct / 100; 7936168404Spjd 7937219089Spjd VERIFY(spa_writeable(spa)); 7938219089Spjd 7939168404Spjd /* 7940332525Smav * Wait for i/os issued in open context that need to complete 7941332525Smav * before this txg syncs. 7942332525Smav */ 7943339111Smav (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 7944339111Smav spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 7945339111Smav ZIO_FLAG_CANFAIL); 7946332525Smav 7947332525Smav /* 7948168404Spjd * Lock out configuration changes. 7949168404Spjd */ 7950185029Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 7951168404Spjd 7952168404Spjd spa->spa_syncing_txg = txg; 7953168404Spjd spa->spa_sync_pass = 0; 7954168404Spjd 7955339105Smav for (int i = 0; i < spa->spa_alloc_count; i++) { 7956339105Smav mutex_enter(&spa->spa_alloc_locks[i]); 7957339105Smav VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); 7958339105Smav mutex_exit(&spa->spa_alloc_locks[i]); 7959339105Smav } 7960307277Smav 7961185029Spjd /* 7962185029Spjd * If there are any pending vdev state changes, convert them 7963185029Spjd * into config changes that go out with this transaction group. 
7964185029Spjd */ 7965185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 7966209962Smm while (list_head(&spa->spa_state_dirty_list) != NULL) { 7967209962Smm /* 7968209962Smm * We need the write lock here because, for aux vdevs, 7969209962Smm * calling vdev_config_dirty() modifies sav_config. 7970209962Smm * This is ugly and will become unnecessary when we 7971209962Smm * eliminate the aux vdev wart by integrating all vdevs 7972209962Smm * into the root vdev tree. 7973209962Smm */ 7974209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7975209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 7976209962Smm while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 7977209962Smm vdev_state_clean(vd); 7978209962Smm vdev_config_dirty(vd); 7979209962Smm } 7980209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7981209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7982185029Spjd } 7983185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 7984185029Spjd 7985168404Spjd tx = dmu_tx_create_assigned(dp, txg); 7986168404Spjd 7987247265Smm spa->spa_sync_starttime = gethrtime(); 7988247265Smm#ifdef illumos 7989247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 7990247265Smm spa->spa_sync_starttime + spa->spa_deadman_synctime)); 7991277300Ssmh#else /* !illumos */ 7992247265Smm#ifdef _KERNEL 7993314665Savg callout_schedule(&spa->spa_deadman_cycid, 7994314665Savg hz * spa->spa_deadman_synctime / NANOSEC); 7995247265Smm#endif 7996277300Ssmh#endif /* illumos */ 7997247265Smm 7998168404Spjd /* 7999185029Spjd * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 8000168404Spjd * set spa_deflate if we have no raid-z vdevs. 
8001168404Spjd */ 8002185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 8003185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 8004168404Spjd int i; 8005168404Spjd 8006168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 8007168404Spjd vd = rvd->vdev_child[i]; 8008168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 8009168404Spjd break; 8010168404Spjd } 8011168404Spjd if (i == rvd->vdev_children) { 8012168404Spjd spa->spa_deflate = TRUE; 8013168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 8014168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 8015168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 8016168404Spjd } 8017168404Spjd } 8018168404Spjd 8019168404Spjd /* 8020307277Smav * Set the top-level vdev's max queue depth. Evaluate each 8021307277Smav * top-level's async write queue depth in case it changed. 8022307277Smav * The max queue depth will not change in the middle of syncing 8023307277Smav * out this txg. 8024307277Smav */ 8025339105Smav uint64_t slots_per_allocator = 0; 8026307277Smav for (int c = 0; c < rvd->vdev_children; c++) { 8027307277Smav vdev_t *tvd = rvd->vdev_child[c]; 8028307277Smav metaslab_group_t *mg = tvd->vdev_mg; 8029307277Smav 8030307277Smav if (mg == NULL || mg->mg_class != spa_normal_class(spa) || 8031307277Smav !metaslab_group_initialized(mg)) 8032307277Smav continue; 8033307277Smav 8034307277Smav /* 8035307277Smav * It is safe to do a lock-free check here because only async 8036307277Smav * allocations look at mg_max_alloc_queue_depth, and async 8037307277Smav * allocations all happen from spa_sync(). 
8038307277Smav */ 8039339105Smav for (int i = 0; i < spa->spa_alloc_count; i++) 8040339105Smav ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i]))); 8041307277Smav mg->mg_max_alloc_queue_depth = max_queue_depth; 8042339105Smav 8043339105Smav for (int i = 0; i < spa->spa_alloc_count; i++) { 8044339105Smav mg->mg_cur_max_alloc_queue_depth[i] = 8045339105Smav zfs_vdev_def_queue_depth; 8046339105Smav } 8047339105Smav slots_per_allocator += zfs_vdev_def_queue_depth; 8048307277Smav } 8049307277Smav metaslab_class_t *mc = spa_normal_class(spa); 8050339105Smav for (int i = 0; i < spa->spa_alloc_count; i++) { 8051339105Smav ASSERT0(refcount_count(&mc->mc_alloc_slots[i])); 8052339105Smav mc->mc_alloc_max_slots[i] = slots_per_allocator; 8053339105Smav } 8054307277Smav mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 8055307277Smav 8056332525Smav for (int c = 0; c < rvd->vdev_children; c++) { 8057332525Smav vdev_t *vd = rvd->vdev_child[c]; 8058332525Smav vdev_indirect_state_sync_verify(vd); 8059332525Smav 8060332525Smav if (vdev_indirect_should_condense(vd)) { 8061332525Smav spa_condense_indirect_start_sync(vd, tx); 8062332525Smav break; 8063332525Smav } 8064332525Smav } 8065332525Smav 8066307277Smav /* 8067168404Spjd * Iterate to convergence. 
8068168404Spjd */ 8069168404Spjd do { 8070219089Spjd int pass = ++spa->spa_sync_pass; 8071168404Spjd 8072168404Spjd spa_sync_config_object(spa, tx); 8073185029Spjd spa_sync_aux_dev(spa, &spa->spa_spares, tx, 8074185029Spjd ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 8075185029Spjd spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 8076185029Spjd ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 8077168404Spjd spa_errlog_sync(spa, txg); 8078168404Spjd dsl_pool_sync(dp, txg); 8079168404Spjd 8080243503Smm if (pass < zfs_sync_pass_deferred_free) { 8081258632Savg spa_sync_frees(spa, free_bpl, tx); 8082219089Spjd } else { 8083275781Sdelphij /* 8084275781Sdelphij * We can not defer frees in pass 1, because 8085275781Sdelphij * we sync the deferred frees later in pass 1. 8086275781Sdelphij */ 8087275781Sdelphij ASSERT3U(pass, >, 1); 8088219089Spjd bplist_iterate(free_bpl, bpobj_enqueue_cb, 8089258632Savg &spa->spa_deferred_bpobj, tx); 8090168404Spjd } 8091168404Spjd 8092219089Spjd ddt_sync(spa, txg); 8093219089Spjd dsl_scan_sync(dp, tx); 8094168404Spjd 8095332525Smav if (spa->spa_vdev_removal != NULL) 8096332525Smav svr_sync(spa, tx); 8097332525Smav 8098332525Smav while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 8099332525Smav != NULL) 8100219089Spjd vdev_sync(vd, txg); 8101168404Spjd 8102275781Sdelphij if (pass == 1) { 8103219089Spjd spa_sync_upgrades(spa, tx); 8104275781Sdelphij ASSERT3U(txg, >=, 8105275781Sdelphij spa->spa_uberblock.ub_rootbp.blk_birth); 8106275781Sdelphij /* 8107275781Sdelphij * Note: We need to check if the MOS is dirty 8108275781Sdelphij * because we could have marked the MOS dirty 8109275781Sdelphij * without updating the uberblock (e.g. if we 8110275781Sdelphij * have sync tasks but no dirty user data). 
We 8111275781Sdelphij * need to check the uberblock's rootbp because 8112275781Sdelphij * it is updated if we have synced out dirty 8113275781Sdelphij * data (though in this case the MOS will most 8114275781Sdelphij * likely also be dirty due to second order 8115275781Sdelphij * effects, we don't want to rely on that here). 8116275781Sdelphij */ 8117275781Sdelphij if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 8118275781Sdelphij !dmu_objset_is_dirty(mos, txg)) { 8119275781Sdelphij /* 8120275781Sdelphij * Nothing changed on the first pass, 8121275781Sdelphij * therefore this TXG is a no-op. Avoid 8122275781Sdelphij * syncing deferred frees, so that we 8123275781Sdelphij * can keep this TXG as a no-op. 8124275781Sdelphij */ 8125275781Sdelphij ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 8126275781Sdelphij txg)); 8127275781Sdelphij ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 8128275781Sdelphij ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 8129332547Smav ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, 8130332547Smav txg)); 8131275781Sdelphij break; 8132275781Sdelphij } 8133275781Sdelphij spa_sync_deferred_frees(spa, tx); 8134275781Sdelphij } 8135168404Spjd 8136219089Spjd } while (dmu_objset_is_dirty(mos, txg)); 8137219089Spjd 8138299441Smav if (!list_is_empty(&spa->spa_config_dirty_list)) { 8139299441Smav /* 8140299441Smav * Make sure that the number of ZAPs for all the vdevs matches 8141299441Smav * the number of ZAPs in the per-vdev ZAP list. This only gets 8142299441Smav * called if the config is dirty; otherwise there may be 8143299441Smav * outstanding AVZ operations that weren't completed in 8144299441Smav * spa_sync_config_object. 
8145299441Smav */ 8146299441Smav uint64_t all_vdev_zap_entry_count; 8147299441Smav ASSERT0(zap_count(spa->spa_meta_objset, 8148299441Smav spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 8149299441Smav ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 8150299441Smav all_vdev_zap_entry_count); 8151299441Smav } 8152299441Smav 8153332525Smav if (spa->spa_vdev_removal != NULL) { 8154332525Smav ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 8155332525Smav } 8156332525Smav 8157168404Spjd /* 8158168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 8159168404Spjd * to commit the transaction group. 8160168404Spjd * 8161185029Spjd * If there are no dirty vdevs, we sync the uberblock to a few 8162185029Spjd * random top-level vdevs that are known to be visible in the 8163185029Spjd * config cache (see spa_vdev_add() for a complete description). 8164185029Spjd * If there *are* dirty vdevs, sync the uberblock to all vdevs. 8165168404Spjd */ 8166185029Spjd for (;;) { 8167185029Spjd /* 8168185029Spjd * We hold SCL_STATE to prevent vdev open/close/etc. 8169185029Spjd * while we're attempting to write the vdev labels. 
8170185029Spjd */ 8171185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 8172168404Spjd 8173185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) { 8174332547Smav vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 8175185029Spjd int svdcount = 0; 8176185029Spjd int children = rvd->vdev_children; 8177185029Spjd int c0 = spa_get_random(children); 8178185029Spjd 8179219089Spjd for (int c = 0; c < children; c++) { 8180185029Spjd vd = rvd->vdev_child[(c0 + c) % children]; 8181332547Smav 8182332547Smav /* Stop when revisiting the first vdev */ 8183332547Smav if (c > 0 && svd[0] == vd) 8184332547Smav break; 8185332547Smav 8186332525Smav if (vd->vdev_ms_array == 0 || vd->vdev_islog || 8187332525Smav !vdev_is_concrete(vd)) 8188185029Spjd continue; 8189332547Smav 8190185029Spjd svd[svdcount++] = vd; 8191332536Smav if (svdcount == SPA_SYNC_MIN_VDEVS) 8192185029Spjd break; 8193185029Spjd } 8194294811Smav error = vdev_config_sync(svd, svdcount, txg); 8195185029Spjd } else { 8196185029Spjd error = vdev_config_sync(rvd->vdev_child, 8197294811Smav rvd->vdev_children, txg); 8198168404Spjd } 8199185029Spjd 8200239620Smm if (error == 0) 8201239620Smm spa->spa_last_synced_guid = rvd->vdev_guid; 8202239620Smm 8203185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 8204185029Spjd 8205185029Spjd if (error == 0) 8206185029Spjd break; 8207185029Spjd zio_suspend(spa, NULL); 8208185029Spjd zio_resume_wait(spa); 8209168404Spjd } 8210168404Spjd dmu_tx_commit(tx); 8211168404Spjd 8212247265Smm#ifdef illumos 8213247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 8214277300Ssmh#else /* !illumos */ 8215247265Smm#ifdef _KERNEL 8216247265Smm callout_drain(&spa->spa_deadman_cycid); 8217247265Smm#endif 8218277300Ssmh#endif /* illumos */ 8219247265Smm 8220168404Spjd /* 8221168404Spjd * Clear the dirty config list. 
8222168404Spjd */ 8223185029Spjd while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 8224168404Spjd vdev_config_clean(vd); 8225168404Spjd 8226168404Spjd /* 8227168404Spjd * Now that the new config has synced transactionally, 8228168404Spjd * let it become visible to the config cache. 8229168404Spjd */ 8230168404Spjd if (spa->spa_config_syncing != NULL) { 8231168404Spjd spa_config_set(spa, spa->spa_config_syncing); 8232168404Spjd spa->spa_config_txg = txg; 8233168404Spjd spa->spa_config_syncing = NULL; 8234168404Spjd } 8235168404Spjd 8236219089Spjd dsl_pool_sync_done(dp, txg); 8237168404Spjd 8238339105Smav for (int i = 0; i < spa->spa_alloc_count; i++) { 8239339105Smav mutex_enter(&spa->spa_alloc_locks[i]); 8240339105Smav VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); 8241339105Smav mutex_exit(&spa->spa_alloc_locks[i]); 8242339105Smav } 8243307277Smav 8244168404Spjd /* 8245168404Spjd * Update usable space statistics. 8246168404Spjd */ 8247339111Smav while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 8248339111Smav != NULL) 8249168404Spjd vdev_sync_done(vd, txg); 8250168404Spjd 8251219089Spjd spa_update_dspace(spa); 8252219089Spjd 8253168404Spjd /* 8254168404Spjd * It had better be the case that we didn't dirty anything 8255168404Spjd * since vdev_config_sync(). 8256168404Spjd */ 8257168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 8258168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 8259168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 8260168404Spjd 8261332547Smav while (zfs_pause_spa_sync) 8262332547Smav delay(1); 8263332547Smav 8264219089Spjd spa->spa_sync_pass = 0; 8265219089Spjd 8266310515Savg /* 8267310515Savg * Update the last synced uberblock here. We want to do this at 8268310515Savg * the end of spa_sync() so that consumers of spa_last_synced_txg() 8269310515Savg * will be guaranteed that all the processing associated with 8270310515Savg * that txg has been completed. 
8271310515Savg */ 8272310515Savg spa->spa_ubsync = spa->spa_uberblock; 8273185029Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 8274168404Spjd 8275219089Spjd spa_handle_ignored_writes(spa); 8276219089Spjd 8277168404Spjd /* 8278168404Spjd * If any async tasks have been requested, kick them off. 8279168404Spjd */ 8280168404Spjd spa_async_dispatch(spa); 8281253990Smav spa_async_dispatch_vd(spa); 8282168404Spjd} 8283168404Spjd 8284168404Spjd/* 8285168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 8286168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 8287168404Spjd * sync. 8288168404Spjd */ 8289168404Spjdvoid 8290168404Spjdspa_sync_allpools(void) 8291168404Spjd{ 8292168404Spjd spa_t *spa = NULL; 8293168404Spjd mutex_enter(&spa_namespace_lock); 8294168404Spjd while ((spa = spa_next(spa)) != NULL) { 8295219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 8296219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 8297168404Spjd continue; 8298168404Spjd spa_open_ref(spa, FTAG); 8299168404Spjd mutex_exit(&spa_namespace_lock); 8300168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 8301168404Spjd mutex_enter(&spa_namespace_lock); 8302168404Spjd spa_close(spa, FTAG); 8303168404Spjd } 8304168404Spjd mutex_exit(&spa_namespace_lock); 8305168404Spjd} 8306168404Spjd 8307168404Spjd/* 8308168404Spjd * ========================================================================== 8309168404Spjd * Miscellaneous routines 8310168404Spjd * ========================================================================== 8311168404Spjd */ 8312168404Spjd 8313168404Spjd/* 8314168404Spjd * Remove all pools in the system. 8315168404Spjd */ 8316168404Spjdvoid 8317168404Spjdspa_evict_all(void) 8318168404Spjd{ 8319168404Spjd spa_t *spa; 8320168404Spjd 8321168404Spjd /* 8322168404Spjd * Remove all cached state. All pools should be closed now, 8323168404Spjd * so every spa in the AVL tree should be unreferenced. 
8324168404Spjd */ 8325168404Spjd mutex_enter(&spa_namespace_lock); 8326168404Spjd while ((spa = spa_next(NULL)) != NULL) { 8327168404Spjd /* 8328168404Spjd * Stop async tasks. The async thread may need to detach 8329168404Spjd * a device that's been replaced, which requires grabbing 8330168404Spjd * spa_namespace_lock, so we must drop it here. 8331168404Spjd */ 8332168404Spjd spa_open_ref(spa, FTAG); 8333168404Spjd mutex_exit(&spa_namespace_lock); 8334168404Spjd spa_async_suspend(spa); 8335168404Spjd mutex_enter(&spa_namespace_lock); 8336168404Spjd spa_close(spa, FTAG); 8337168404Spjd 8338168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 8339168404Spjd spa_unload(spa); 8340168404Spjd spa_deactivate(spa); 8341168404Spjd } 8342168404Spjd spa_remove(spa); 8343168404Spjd } 8344168404Spjd mutex_exit(&spa_namespace_lock); 8345168404Spjd} 8346168404Spjd 8347168404Spjdvdev_t * 8348209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 8349168404Spjd{ 8350185029Spjd vdev_t *vd; 8351185029Spjd int i; 8352185029Spjd 8353185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 8354185029Spjd return (vd); 8355185029Spjd 8356209962Smm if (aux) { 8357185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 8358185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 8359185029Spjd if (vd->vdev_guid == guid) 8360185029Spjd return (vd); 8361185029Spjd } 8362209962Smm 8363209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 8364209962Smm vd = spa->spa_spares.sav_vdevs[i]; 8365209962Smm if (vd->vdev_guid == guid) 8366209962Smm return (vd); 8367209962Smm } 8368185029Spjd } 8369185029Spjd 8370185029Spjd return (NULL); 8371168404Spjd} 8372168404Spjd 8373168404Spjdvoid 8374185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 8375168404Spjd{ 8376219089Spjd ASSERT(spa_writeable(spa)); 8377219089Spjd 8378185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8379168404Spjd 8380168404Spjd /* 8381168404Spjd * This should only be called for a 
non-faulted pool, and since a 8382168404Spjd * future version would result in an unopenable pool, this shouldn't be 8383168404Spjd * possible. 8384168404Spjd */ 8385247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 8386268075Sdelphij ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 8387168404Spjd 8388185029Spjd spa->spa_uberblock.ub_version = version; 8389168404Spjd vdev_config_dirty(spa->spa_root_vdev); 8390168404Spjd 8391185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 8392168404Spjd 8393168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 8394168404Spjd} 8395168404Spjd 8396168404Spjdboolean_t 8397168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 8398168404Spjd{ 8399168404Spjd int i; 8400168404Spjd uint64_t spareguid; 8401185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 8402168404Spjd 8403185029Spjd for (i = 0; i < sav->sav_count; i++) 8404185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 8405168404Spjd return (B_TRUE); 8406168404Spjd 8407185029Spjd for (i = 0; i < sav->sav_npending; i++) { 8408185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 8409185029Spjd &spareguid) == 0 && spareguid == guid) 8410168404Spjd return (B_TRUE); 8411168404Spjd } 8412168404Spjd 8413168404Spjd return (B_FALSE); 8414168404Spjd} 8415168404Spjd 8416185029Spjd/* 8417185029Spjd * Check if a pool has an active shared spare device. 
8418185029Spjd * Note: reference count of an active spare is 2, as a spare and as a replace 8419185029Spjd */ 8420185029Spjdstatic boolean_t 8421185029Spjdspa_has_active_shared_spare(spa_t *spa) 8422168404Spjd{ 8423185029Spjd int i, refcnt; 8424185029Spjd uint64_t pool; 8425185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 8426185029Spjd 8427185029Spjd for (i = 0; i < sav->sav_count; i++) { 8428185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 8429185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 8430185029Spjd refcnt > 2) 8431185029Spjd return (B_TRUE); 8432185029Spjd } 8433185029Spjd 8434185029Spjd return (B_FALSE); 8435168404Spjd} 8436168404Spjd 8437332525Smavsysevent_t * 8438331397Smavspa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 8439168404Spjd{ 8440307113Smav sysevent_t *ev = NULL; 8441185029Spjd#ifdef _KERNEL 8442185029Spjd sysevent_attr_list_t *attr = NULL; 8443185029Spjd sysevent_value_t value; 8444168404Spjd 8445185029Spjd ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 8446185029Spjd SE_SLEEP); 8447307113Smav ASSERT(ev != NULL); 8448168404Spjd 8449185029Spjd value.value_type = SE_DATA_TYPE_STRING; 8450185029Spjd value.value.sv_string = spa_name(spa); 8451185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 8452185029Spjd goto done; 8453168404Spjd 8454185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 8455185029Spjd value.value.sv_uint64 = spa_guid(spa); 8456185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 8457185029Spjd goto done; 8458168404Spjd 8459185029Spjd if (vd) { 8460185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 8461185029Spjd value.value.sv_uint64 = vd->vdev_guid; 8462185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 8463185029Spjd SE_SLEEP) != 0) 8464185029Spjd goto done; 8465168404Spjd 8466185029Spjd if (vd->vdev_path) { 8467185029Spjd value.value_type = SE_DATA_TYPE_STRING; 
8468185029Spjd value.value.sv_string = vd->vdev_path; 8469185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 8470185029Spjd &value, SE_SLEEP) != 0) 8471185029Spjd goto done; 8472168404Spjd } 8473168404Spjd } 8474168404Spjd 8475331397Smav if (hist_nvl != NULL) { 8476331397Smav fnvlist_merge((nvlist_t *)attr, hist_nvl); 8477331397Smav } 8478331397Smav 8479185029Spjd if (sysevent_attach_attributes(ev, attr) != 0) 8480185029Spjd goto done; 8481185029Spjd attr = NULL; 8482168404Spjd 8483185029Spjddone: 8484185029Spjd if (attr) 8485185029Spjd sysevent_free_attr(attr); 8486307113Smav 8487307113Smav#endif 8488307113Smav return (ev); 8489307113Smav} 8490307113Smav 8491332525Smavvoid 8492307113Smavspa_event_post(sysevent_t *ev) 8493307113Smav{ 8494307113Smav#ifdef _KERNEL 8495307113Smav sysevent_id_t eid; 8496307113Smav 8497307113Smav (void) log_sysevent(ev, SE_SLEEP, &eid); 8498185029Spjd sysevent_free(ev); 8499185029Spjd#endif 8500168404Spjd} 8501307113Smav 8502332525Smavvoid 8503332525Smavspa_event_discard(sysevent_t *ev) 8504332525Smav{ 8505332525Smav#ifdef _KERNEL 8506332525Smav sysevent_free(ev); 8507332525Smav#endif 8508332525Smav} 8509332525Smav 8510307113Smav/* 8511307113Smav * Post a sysevent corresponding to the given event. The 'name' must be one of 8512307113Smav * the event definitions in sys/sysevent/eventdefs.h. The payload will be 8513331397Smav * filled in from the spa and (optionally) the vdev and history nvl. This 8514331397Smav * doesn't do anything in the userland libzpool, as we don't want consumers to 8515331397Smav * misinterpret ztest or zdb as real changes. 8516307113Smav */ 8517307113Smavvoid 8518331397Smavspa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 8519307113Smav{ 8520331397Smav spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 8521307113Smav} 8522