spa.c revision 339034
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24332525Smav * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 25287745Sdelphij * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. 26247265Smm * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27286575Smav * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 28289422Smav * Copyright 2013 Saso Kiselkov. All rights reserved. 29296519Smav * Copyright (c) 2014 Integros [integros.com] 30332524Smav * Copyright 2016 Toomas Soome <tsoome@me.com> 31331397Smav * Copyright 2017 Joyent, Inc. 32324010Savg * Copyright (c) 2017 Datto Inc. 33331721Smav * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. 
34168404Spjd */ 35168404Spjd 36168404Spjd/* 37251629Sdelphij * SPA: Storage Pool Allocator 38251629Sdelphij * 39168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 40168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 41168404Spjd * pool. 42168404Spjd */ 43168404Spjd 44168404Spjd#include <sys/zfs_context.h> 45168404Spjd#include <sys/fm/fs/zfs.h> 46168404Spjd#include <sys/spa_impl.h> 47168404Spjd#include <sys/zio.h> 48168404Spjd#include <sys/zio_checksum.h> 49168404Spjd#include <sys/dmu.h> 50168404Spjd#include <sys/dmu_tx.h> 51168404Spjd#include <sys/zap.h> 52168404Spjd#include <sys/zil.h> 53219089Spjd#include <sys/ddt.h> 54168404Spjd#include <sys/vdev_impl.h> 55332525Smav#include <sys/vdev_removal.h> 56332525Smav#include <sys/vdev_indirect_mapping.h> 57332525Smav#include <sys/vdev_indirect_births.h> 58168404Spjd#include <sys/metaslab.h> 59219089Spjd#include <sys/metaslab_impl.h> 60168404Spjd#include <sys/uberblock_impl.h> 61168404Spjd#include <sys/txg.h> 62168404Spjd#include <sys/avl.h> 63332525Smav#include <sys/bpobj.h> 64168404Spjd#include <sys/dmu_traverse.h> 65168404Spjd#include <sys/dmu_objset.h> 66168404Spjd#include <sys/unique.h> 67168404Spjd#include <sys/dsl_pool.h> 68168404Spjd#include <sys/dsl_dataset.h> 69168404Spjd#include <sys/dsl_dir.h> 70168404Spjd#include <sys/dsl_prop.h> 71168404Spjd#include <sys/dsl_synctask.h> 72168404Spjd#include <sys/fs/zfs.h> 73185029Spjd#include <sys/arc.h> 74168404Spjd#include <sys/callb.h> 75185029Spjd#include <sys/spa_boot.h> 76219089Spjd#include <sys/zfs_ioctl.h> 77219089Spjd#include <sys/dsl_scan.h> 78248571Smm#include <sys/dmu_send.h> 79248571Smm#include <sys/dsl_destroy.h> 80248571Smm#include <sys/dsl_userhold.h> 81236884Smm#include <sys/zfeature.h> 82219089Spjd#include <sys/zvol.h> 83240868Spjd#include <sys/trim_map.h> 84321610Smav#include <sys/abd.h> 85168404Spjd 86219089Spjd#ifdef _KERNEL 87219089Spjd#include <sys/callb.h> 
88219089Spjd#include <sys/cpupart.h> 89219089Spjd#include <sys/zone.h> 90219089Spjd#endif /* _KERNEL */ 91219089Spjd 92185029Spjd#include "zfs_prop.h" 93185029Spjd#include "zfs_comutil.h" 94168404Spjd 95204073Spjd/* Check hostid on import? */ 96204073Spjdstatic int check_hostid = 1; 97204073Spjd 98251636Sdelphij/* 99251636Sdelphij * The interval, in seconds, at which failed configuration cache file writes 100251636Sdelphij * should be retried. 101251636Sdelphij */ 102332525Smavint zfs_ccw_retry_interval = 300; 103251636Sdelphij 104271785SwillSYSCTL_DECL(_vfs_zfs); 105271785SwillSYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0, 106271785Swill "Check hostid on import?"); 107271785SwillTUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval); 108271785SwillSYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW, 109271785Swill &zfs_ccw_retry_interval, 0, 110271785Swill "Configuration cache file write, retry after failure, interval (seconds)"); 111271785Swill 112219089Spjdtypedef enum zti_modes { 113258631Savg ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 114258631Savg ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ 115258631Savg ZTI_MODE_NULL, /* don't create a taskq */ 116258631Savg ZTI_NMODES 117219089Spjd} zti_modes_t; 118168712Spjd 119258631Savg#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 120258631Savg#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } 121258631Savg#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 122209962Smm 123258631Savg#define ZTI_N(n) ZTI_P(n, 1) 124258631Savg#define ZTI_ONE ZTI_N(1) 125209962Smm 126209962Smmtypedef struct zio_taskq_info { 127258631Savg zti_modes_t zti_mode; 128211931Smm uint_t zti_value; 129258631Savg uint_t zti_count; 130209962Smm} zio_taskq_info_t; 131209962Smm 132209962Smmstatic const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 133219089Spjd "issue", "issue_high", "intr", "intr_high" 134209962Smm}; 135209962Smm 136211931Smm/* 137258631Savg * This table defines the taskq 
settings for each ZFS I/O type. When 138258631Savg * initializing a pool, we use this table to create an appropriately sized 139258631Savg * taskq. Some operations are low volume and therefore have a small, static 140258631Savg * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 141258631Savg * macros. Other operations process a large amount of data; the ZTI_BATCH 142258631Savg * macro causes us to create a taskq oriented for throughput. Some operations 143258631Savg * are so high frequency and short-lived that the taskq itself can become a a 144258631Savg * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 145258631Savg * additional degree of parallelism specified by the number of threads per- 146258631Savg * taskq and the number of taskqs; when dispatching an event in this case, the 147258631Savg * particular taskq is chosen at random. 148258631Savg * 149258631Savg * The different taskq priorities are to handle the different contexts (issue 150258631Savg * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 151258631Savg * need to be handled with minimum delay. 
152211931Smm */ 153211931Smmconst zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 154211931Smm /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 155258631Savg { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 156264670Sdelphij { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ 157258631Savg { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */ 158258631Savg { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 159258631Savg { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 160258631Savg { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 161209962Smm}; 162209962Smm 163248571Smmstatic void spa_sync_version(void *arg, dmu_tx_t *tx); 164248571Smmstatic void spa_sync_props(void *arg, dmu_tx_t *tx); 165185029Spjdstatic boolean_t spa_has_active_shared_spare(spa_t *spa); 166332547Smavstatic int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); 167219089Spjdstatic void spa_vdev_resilver_done(spa_t *spa); 168185029Spjd 169258632Savguint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ 170219089Spjd#ifdef PSRSET_BIND 171219089Spjdid_t zio_taskq_psrset_bind = PS_NONE; 172219089Spjd#endif 173219089Spjd#ifdef SYSDC 174219089Spjdboolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 175314355Savguint_t zio_taskq_basedc = 80; /* base duty cycle */ 176219089Spjd#endif 177219089Spjd 178219089Spjdboolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 179243503Smmextern int zfs_sync_pass_deferred_free; 180219089Spjd 181168404Spjd/* 182332531Smav * Report any spa_load_verify errors found, but do not fail spa_load. 183332531Smav * This is used by zdb to analyze non-idle pools. 184332531Smav */ 185332531Smavboolean_t spa_load_verify_dryrun = B_FALSE; 186332531Smav 187332531Smav/* 188219089Spjd * This (illegal) pool name is used when temporarily importing a spa_t in order 189219089Spjd * to get the vdev stats associated with the imported devices. 
190219089Spjd */ 191219089Spjd#define TRYIMPORT_NAME "$import" 192219089Spjd 193219089Spjd/* 194332536Smav * For debugging purposes: print out vdev tree during pool import. 195332536Smav */ 196332536Smavint spa_load_print_vdev_tree = B_FALSE; 197332536Smav 198332536Smav/* 199332536Smav * A non-zero value for zfs_max_missing_tvds means that we allow importing 200332536Smav * pools with missing top-level vdevs. This is strictly intended for advanced 201332536Smav * pool recovery cases since missing data is almost inevitable. Pools with 202332536Smav * missing devices can only be imported read-only for safety reasons, and their 203332536Smav * fail-mode will be automatically set to "continue". 204332536Smav * 205332536Smav * With 1 missing vdev we should be able to import the pool and mount all 206332536Smav * datasets. User data that was not modified after the missing device has been 207332536Smav * added should be recoverable. This means that snapshots created prior to the 208332536Smav * addition of that device should be completely intact. 209332536Smav * 210332536Smav * With 2 missing vdevs, some datasets may fail to mount since there are 211332536Smav * dataset statistics that are stored as regular metadata. Some data might be 212332536Smav * recoverable if those vdevs were added recently. 213332536Smav * 214332536Smav * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 215332536Smav * may be missing entirely. Chances of data recovery are very low. Note that 216332536Smav * there are also risks of performing an inadvertent rewind as we might be 217332536Smav * missing all the vdevs with the latest uberblocks. 218332536Smav */ 219332536Smavuint64_t zfs_max_missing_tvds = 0; 220332536Smav 221332536Smav/* 222332536Smav * The parameters below are similar to zfs_max_missing_tvds but are only 223332536Smav * intended for a preliminary open of the pool with an untrusted config which 224332536Smav * might be incomplete or out-dated. 
225332536Smav * 226332536Smav * We are more tolerant for pools opened from a cachefile since we could have 227332536Smav * an out-dated cachefile where a device removal was not registered. 228332536Smav * We could have set the limit arbitrarily high but in the case where devices 229332536Smav * are really missing we would want to return the proper error codes; we chose 230332536Smav * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 231332536Smav * and we get a chance to retrieve the trusted config. 232332536Smav */ 233332536Smavuint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 234332547Smav 235332536Smav/* 236332536Smav * In the case where config was assembled by scanning device paths (/dev/dsks 237332536Smav * by default) we are less tolerant since all the existing devices should have 238332536Smav * been detected and we want spa_load to return the right error codes. 239332536Smav */ 240332536Smavuint64_t zfs_max_missing_tvds_scan = 0; 241332536Smav 242332536Smav 243332536SmavSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_print_vdev_tree, CTLFLAG_RWTUN, 244332536Smav &spa_load_print_vdev_tree, 0, 245332536Smav "print out vdev tree during pool import"); 246332536SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds, CTLFLAG_RWTUN, 247332536Smav &zfs_max_missing_tvds, 0, 248332536Smav "allow importing pools with missing top-level vdevs"); 249332536SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, 250332536Smav &zfs_max_missing_tvds_cachefile, 0, 251332536Smav "allow importing pools with missing top-level vdevs in cache file"); 252332536SmavSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, 253332536Smav &zfs_max_missing_tvds_scan, 0, 254332536Smav "allow importing pools with missing top-level vdevs during scan"); 255332536Smav 256332536Smav/* 257332547Smav * Debugging aid that pauses spa_sync() towards the end. 
258332547Smav */ 259332547Smavboolean_t zfs_pause_spa_sync = B_FALSE; 260332547Smav 261332547Smav/* 262168404Spjd * ========================================================================== 263185029Spjd * SPA properties routines 264185029Spjd * ========================================================================== 265185029Spjd */ 266185029Spjd 267185029Spjd/* 268185029Spjd * Add a (source=src, propname=propval) list to an nvlist. 269185029Spjd */ 270185029Spjdstatic void 271185029Spjdspa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 272185029Spjd uint64_t intval, zprop_source_t src) 273185029Spjd{ 274185029Spjd const char *propname = zpool_prop_to_name(prop); 275185029Spjd nvlist_t *propval; 276185029Spjd 277185029Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 278185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 279185029Spjd 280185029Spjd if (strval != NULL) 281185029Spjd VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 282185029Spjd else 283185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 284185029Spjd 285185029Spjd VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 286185029Spjd nvlist_free(propval); 287185029Spjd} 288185029Spjd 289185029Spjd/* 290185029Spjd * Get property values from the spa configuration. 
291185029Spjd */ 292185029Spjdstatic void 293185029Spjdspa_prop_get_config(spa_t *spa, nvlist_t **nvp) 294185029Spjd{ 295236155Smm vdev_t *rvd = spa->spa_root_vdev; 296236884Smm dsl_pool_t *pool = spa->spa_dsl_pool; 297269118Sdelphij uint64_t size, alloc, cap, version; 298185029Spjd zprop_source_t src = ZPROP_SRC_NONE; 299185029Spjd spa_config_dirent_t *dp; 300269118Sdelphij metaslab_class_t *mc = spa_normal_class(spa); 301185029Spjd 302185029Spjd ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 303185029Spjd 304236155Smm if (rvd != NULL) { 305219089Spjd alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 306219089Spjd size = metaslab_class_get_space(spa_normal_class(spa)); 307209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 308209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 309219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 310219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 311219089Spjd size - alloc, src); 312332547Smav spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, 313332547Smav spa->spa_checkpoint_info.sci_dspace, src); 314236155Smm 315269118Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 316269118Sdelphij metaslab_class_fragmentation(mc), src); 317269118Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 318269118Sdelphij metaslab_class_expandable_space(mc), src); 319219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 320219089Spjd (spa_mode(spa) == FREAD), src); 321185029Spjd 322219089Spjd cap = (size == 0) ? 
0 : (alloc * 100 / size); 323209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 324185029Spjd 325219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 326219089Spjd ddt_get_pool_dedup_ratio(spa), src); 327219089Spjd 328209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 329236155Smm rvd->vdev_state, src); 330209962Smm 331209962Smm version = spa_version(spa); 332209962Smm if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 333209962Smm src = ZPROP_SRC_DEFAULT; 334209962Smm else 335209962Smm src = ZPROP_SRC_LOCAL; 336209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 337209962Smm } 338209962Smm 339236884Smm if (pool != NULL) { 340236884Smm /* 341236884Smm * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 342236884Smm * when opening pools before this version freedir will be NULL. 343236884Smm */ 344268079Sdelphij if (pool->dp_free_dir != NULL) { 345236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 346275782Sdelphij dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 347275782Sdelphij src); 348236884Smm } else { 349236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 350236884Smm NULL, 0, src); 351236884Smm } 352268079Sdelphij 353268079Sdelphij if (pool->dp_leak_dir != NULL) { 354268079Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 355275782Sdelphij dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 356275782Sdelphij src); 357268079Sdelphij } else { 358268079Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 359268079Sdelphij NULL, 0, src); 360268079Sdelphij } 361236884Smm } 362236884Smm 363185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 364185029Spjd 365228103Smm if (spa->spa_comment != NULL) { 366228103Smm spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 367228103Smm 0, ZPROP_SRC_LOCAL); 368228103Smm } 369228103Smm 370185029Spjd if (spa->spa_root != NULL) 371185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, 
spa->spa_root, 372185029Spjd 0, ZPROP_SRC_LOCAL); 373185029Spjd 374274337Sdelphij if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 375274337Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 376274337Sdelphij MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 377274337Sdelphij } else { 378274337Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 379274337Sdelphij SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 380274337Sdelphij } 381274337Sdelphij 382185029Spjd if ((dp = list_head(&spa->spa_config_list)) != NULL) { 383185029Spjd if (dp->scd_path == NULL) { 384185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 385185029Spjd "none", 0, ZPROP_SRC_LOCAL); 386185029Spjd } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 387185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 388185029Spjd dp->scd_path, 0, ZPROP_SRC_LOCAL); 389185029Spjd } 390185029Spjd } 391185029Spjd} 392185029Spjd 393185029Spjd/* 394185029Spjd * Get zpool property values. 395185029Spjd */ 396185029Spjdint 397185029Spjdspa_prop_get(spa_t *spa, nvlist_t **nvp) 398185029Spjd{ 399219089Spjd objset_t *mos = spa->spa_meta_objset; 400185029Spjd zap_cursor_t zc; 401185029Spjd zap_attribute_t za; 402185029Spjd int err; 403185029Spjd 404185029Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 405185029Spjd 406185029Spjd mutex_enter(&spa->spa_props_lock); 407185029Spjd 408185029Spjd /* 409185029Spjd * Get properties from the spa config. 410185029Spjd */ 411185029Spjd spa_prop_get_config(spa, nvp); 412185029Spjd 413185029Spjd /* If no pool property object, no more prop to get. */ 414219089Spjd if (mos == NULL || spa->spa_pool_props_object == 0) { 415185029Spjd mutex_exit(&spa->spa_props_lock); 416185029Spjd return (0); 417185029Spjd } 418185029Spjd 419185029Spjd /* 420185029Spjd * Get properties from the MOS pool property object. 
421185029Spjd */ 422185029Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 423185029Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 424185029Spjd zap_cursor_advance(&zc)) { 425185029Spjd uint64_t intval = 0; 426185029Spjd char *strval = NULL; 427185029Spjd zprop_source_t src = ZPROP_SRC_DEFAULT; 428185029Spjd zpool_prop_t prop; 429185029Spjd 430329493Smav if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) 431185029Spjd continue; 432185029Spjd 433185029Spjd switch (za.za_integer_length) { 434185029Spjd case 8: 435185029Spjd /* integer property */ 436185029Spjd if (za.za_first_integer != 437185029Spjd zpool_prop_default_numeric(prop)) 438185029Spjd src = ZPROP_SRC_LOCAL; 439185029Spjd 440185029Spjd if (prop == ZPOOL_PROP_BOOTFS) { 441185029Spjd dsl_pool_t *dp; 442185029Spjd dsl_dataset_t *ds = NULL; 443185029Spjd 444185029Spjd dp = spa_get_dsl(spa); 445248571Smm dsl_pool_config_enter(dp, FTAG); 446185029Spjd if (err = dsl_dataset_hold_obj(dp, 447185029Spjd za.za_first_integer, FTAG, &ds)) { 448248571Smm dsl_pool_config_exit(dp, FTAG); 449185029Spjd break; 450185029Spjd } 451185029Spjd 452307108Smav strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 453185029Spjd KM_SLEEP); 454185029Spjd dsl_dataset_name(ds, strval); 455185029Spjd dsl_dataset_rele(ds, FTAG); 456248571Smm dsl_pool_config_exit(dp, FTAG); 457185029Spjd } else { 458185029Spjd strval = NULL; 459185029Spjd intval = za.za_first_integer; 460185029Spjd } 461185029Spjd 462185029Spjd spa_prop_add_list(*nvp, prop, strval, intval, src); 463185029Spjd 464185029Spjd if (strval != NULL) 465307108Smav kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 466185029Spjd 467185029Spjd break; 468185029Spjd 469185029Spjd case 1: 470185029Spjd /* string property */ 471185029Spjd strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 472185029Spjd err = zap_lookup(mos, spa->spa_pool_props_object, 473185029Spjd za.za_name, 1, za.za_num_integers, strval); 474185029Spjd if (err) { 475185029Spjd kmem_free(strval, 
za.za_num_integers); 476185029Spjd break; 477185029Spjd } 478185029Spjd spa_prop_add_list(*nvp, prop, strval, 0, src); 479185029Spjd kmem_free(strval, za.za_num_integers); 480185029Spjd break; 481185029Spjd 482185029Spjd default: 483185029Spjd break; 484185029Spjd } 485185029Spjd } 486185029Spjd zap_cursor_fini(&zc); 487185029Spjd mutex_exit(&spa->spa_props_lock); 488185029Spjdout: 489185029Spjd if (err && err != ENOENT) { 490185029Spjd nvlist_free(*nvp); 491185029Spjd *nvp = NULL; 492185029Spjd return (err); 493185029Spjd } 494185029Spjd 495185029Spjd return (0); 496185029Spjd} 497185029Spjd 498185029Spjd/* 499185029Spjd * Validate the given pool properties nvlist and modify the list 500185029Spjd * for the property values to be set. 501185029Spjd */ 502185029Spjdstatic int 503185029Spjdspa_prop_validate(spa_t *spa, nvlist_t *props) 504185029Spjd{ 505185029Spjd nvpair_t *elem; 506185029Spjd int error = 0, reset_bootfs = 0; 507247187Smm uint64_t objnum = 0; 508236884Smm boolean_t has_feature = B_FALSE; 509185029Spjd 510185029Spjd elem = NULL; 511185029Spjd while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 512185029Spjd uint64_t intval; 513236884Smm char *strval, *slash, *check, *fname; 514236884Smm const char *propname = nvpair_name(elem); 515236884Smm zpool_prop_t prop = zpool_name_to_prop(propname); 516185029Spjd 517236884Smm switch (prop) { 518329493Smav case ZPOOL_PROP_INVAL: 519236884Smm if (!zpool_prop_feature(propname)) { 520249195Smm error = SET_ERROR(EINVAL); 521236884Smm break; 522236884Smm } 523185029Spjd 524236884Smm /* 525236884Smm * Sanitize the input. 
526236884Smm */ 527236884Smm if (nvpair_type(elem) != DATA_TYPE_UINT64) { 528249195Smm error = SET_ERROR(EINVAL); 529236884Smm break; 530236884Smm } 531185029Spjd 532236884Smm if (nvpair_value_uint64(elem, &intval) != 0) { 533249195Smm error = SET_ERROR(EINVAL); 534236884Smm break; 535236884Smm } 536236884Smm 537236884Smm if (intval != 0) { 538249195Smm error = SET_ERROR(EINVAL); 539236884Smm break; 540236884Smm } 541236884Smm 542236884Smm fname = strchr(propname, '@') + 1; 543236884Smm if (zfeature_lookup_name(fname, NULL) != 0) { 544249195Smm error = SET_ERROR(EINVAL); 545236884Smm break; 546236884Smm } 547236884Smm 548236884Smm has_feature = B_TRUE; 549236884Smm break; 550236884Smm 551185029Spjd case ZPOOL_PROP_VERSION: 552185029Spjd error = nvpair_value_uint64(elem, &intval); 553185029Spjd if (!error && 554236884Smm (intval < spa_version(spa) || 555236884Smm intval > SPA_VERSION_BEFORE_FEATURES || 556236884Smm has_feature)) 557249195Smm error = SET_ERROR(EINVAL); 558185029Spjd break; 559185029Spjd 560185029Spjd case ZPOOL_PROP_DELEGATION: 561185029Spjd case ZPOOL_PROP_AUTOREPLACE: 562185029Spjd case ZPOOL_PROP_LISTSNAPS: 563219089Spjd case ZPOOL_PROP_AUTOEXPAND: 564185029Spjd error = nvpair_value_uint64(elem, &intval); 565185029Spjd if (!error && intval > 1) 566249195Smm error = SET_ERROR(EINVAL); 567185029Spjd break; 568185029Spjd 569185029Spjd case ZPOOL_PROP_BOOTFS: 570209962Smm /* 571209962Smm * If the pool version is less than SPA_VERSION_BOOTFS, 572209962Smm * or the pool is still being created (version == 0), 573209962Smm * the bootfs property cannot be set. 
574209962Smm */ 575185029Spjd if (spa_version(spa) < SPA_VERSION_BOOTFS) { 576249195Smm error = SET_ERROR(ENOTSUP); 577185029Spjd break; 578185029Spjd } 579185029Spjd 580185029Spjd /* 581185029Spjd * Make sure the vdev config is bootable 582185029Spjd */ 583185029Spjd if (!vdev_is_bootable(spa->spa_root_vdev)) { 584249195Smm error = SET_ERROR(ENOTSUP); 585185029Spjd break; 586185029Spjd } 587185029Spjd 588185029Spjd reset_bootfs = 1; 589185029Spjd 590185029Spjd error = nvpair_value_string(elem, &strval); 591185029Spjd 592185029Spjd if (!error) { 593236884Smm objset_t *os; 594274337Sdelphij uint64_t propval; 595185029Spjd 596185029Spjd if (strval == NULL || strval[0] == '\0') { 597185029Spjd objnum = zpool_prop_default_numeric( 598185029Spjd ZPOOL_PROP_BOOTFS); 599185029Spjd break; 600185029Spjd } 601185029Spjd 602219089Spjd if (error = dmu_objset_hold(strval, FTAG, &os)) 603185029Spjd break; 604185029Spjd 605274337Sdelphij /* 606274337Sdelphij * Must be ZPL, and its property settings 607274337Sdelphij * must be supported by GRUB (compression 608274337Sdelphij * is not gzip, and large blocks are not used). 
609274337Sdelphij */ 610219089Spjd 611219089Spjd if (dmu_objset_type(os) != DMU_OST_ZFS) { 612249195Smm error = SET_ERROR(ENOTSUP); 613248571Smm } else if ((error = 614248571Smm dsl_prop_get_int_ds(dmu_objset_ds(os), 615185029Spjd zfs_prop_to_name(ZFS_PROP_COMPRESSION), 616274337Sdelphij &propval)) == 0 && 617274337Sdelphij !BOOTFS_COMPRESS_VALID(propval)) { 618249195Smm error = SET_ERROR(ENOTSUP); 619185029Spjd } else { 620185029Spjd objnum = dmu_objset_id(os); 621185029Spjd } 622219089Spjd dmu_objset_rele(os, FTAG); 623185029Spjd } 624185029Spjd break; 625185029Spjd 626185029Spjd case ZPOOL_PROP_FAILUREMODE: 627185029Spjd error = nvpair_value_uint64(elem, &intval); 628185029Spjd if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 629185029Spjd intval > ZIO_FAILURE_MODE_PANIC)) 630249195Smm error = SET_ERROR(EINVAL); 631185029Spjd 632185029Spjd /* 633185029Spjd * This is a special case which only occurs when 634185029Spjd * the pool has completely failed. This allows 635185029Spjd * the user to change the in-core failmode property 636185029Spjd * without syncing it out to disk (I/Os might 637185029Spjd * currently be blocked). We do this by returning 638185029Spjd * EIO to the caller (spa_prop_set) to trick it 639185029Spjd * into thinking we encountered a property validation 640185029Spjd * error. 
641185029Spjd */ 642185029Spjd if (!error && spa_suspended(spa)) { 643185029Spjd spa->spa_failmode = intval; 644249195Smm error = SET_ERROR(EIO); 645185029Spjd } 646185029Spjd break; 647185029Spjd 648185029Spjd case ZPOOL_PROP_CACHEFILE: 649185029Spjd if ((error = nvpair_value_string(elem, &strval)) != 0) 650185029Spjd break; 651185029Spjd 652185029Spjd if (strval[0] == '\0') 653185029Spjd break; 654185029Spjd 655185029Spjd if (strcmp(strval, "none") == 0) 656185029Spjd break; 657185029Spjd 658185029Spjd if (strval[0] != '/') { 659249195Smm error = SET_ERROR(EINVAL); 660185029Spjd break; 661185029Spjd } 662185029Spjd 663185029Spjd slash = strrchr(strval, '/'); 664185029Spjd ASSERT(slash != NULL); 665185029Spjd 666185029Spjd if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 667185029Spjd strcmp(slash, "/..") == 0) 668249195Smm error = SET_ERROR(EINVAL); 669185029Spjd break; 670219089Spjd 671228103Smm case ZPOOL_PROP_COMMENT: 672228103Smm if ((error = nvpair_value_string(elem, &strval)) != 0) 673228103Smm break; 674228103Smm for (check = strval; *check != '\0'; check++) { 675228103Smm /* 676228103Smm * The kernel doesn't have an easy isprint() 677228103Smm * check. For this kernel check, we merely 678228103Smm * check ASCII apart from DEL. Fix this if 679228103Smm * there is an easy-to-use kernel isprint(). 
680228103Smm */ 681228103Smm if (*check >= 0x7f) { 682249195Smm error = SET_ERROR(EINVAL); 683228103Smm break; 684228103Smm } 685228103Smm } 686228103Smm if (strlen(strval) > ZPROP_MAX_COMMENT) 687228103Smm error = E2BIG; 688228103Smm break; 689228103Smm 690219089Spjd case ZPOOL_PROP_DEDUPDITTO: 691219089Spjd if (spa_version(spa) < SPA_VERSION_DEDUP) 692249195Smm error = SET_ERROR(ENOTSUP); 693219089Spjd else 694219089Spjd error = nvpair_value_uint64(elem, &intval); 695219089Spjd if (error == 0 && 696219089Spjd intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 697249195Smm error = SET_ERROR(EINVAL); 698219089Spjd break; 699185029Spjd } 700185029Spjd 701185029Spjd if (error) 702185029Spjd break; 703185029Spjd } 704185029Spjd 705185029Spjd if (!error && reset_bootfs) { 706185029Spjd error = nvlist_remove(props, 707185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 708185029Spjd 709185029Spjd if (!error) { 710185029Spjd error = nvlist_add_uint64(props, 711185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 712185029Spjd } 713185029Spjd } 714185029Spjd 715185029Spjd return (error); 716185029Spjd} 717185029Spjd 718209962Smmvoid 719209962Smmspa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 720209962Smm{ 721209962Smm char *cachefile; 722209962Smm spa_config_dirent_t *dp; 723209962Smm 724209962Smm if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 725209962Smm &cachefile) != 0) 726209962Smm return; 727209962Smm 728209962Smm dp = kmem_alloc(sizeof (spa_config_dirent_t), 729209962Smm KM_SLEEP); 730209962Smm 731209962Smm if (cachefile[0] == '\0') 732209962Smm dp->scd_path = spa_strdup(spa_config_path); 733209962Smm else if (strcmp(cachefile, "none") == 0) 734209962Smm dp->scd_path = NULL; 735209962Smm else 736209962Smm dp->scd_path = spa_strdup(cachefile); 737209962Smm 738209962Smm list_insert_head(&spa->spa_config_list, dp); 739209962Smm if (need_sync) 740209962Smm spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 
741209962Smm} 742209962Smm 743185029Spjdint 744185029Spjdspa_prop_set(spa_t *spa, nvlist_t *nvp) 745185029Spjd{ 746185029Spjd int error; 747236884Smm nvpair_t *elem = NULL; 748209962Smm boolean_t need_sync = B_FALSE; 749185029Spjd 750185029Spjd if ((error = spa_prop_validate(spa, nvp)) != 0) 751185029Spjd return (error); 752185029Spjd 753209962Smm while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 754236884Smm zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 755209962Smm 756219089Spjd if (prop == ZPOOL_PROP_CACHEFILE || 757219089Spjd prop == ZPOOL_PROP_ALTROOT || 758219089Spjd prop == ZPOOL_PROP_READONLY) 759209962Smm continue; 760209962Smm 761329493Smav if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 762236884Smm uint64_t ver; 763236884Smm 764236884Smm if (prop == ZPOOL_PROP_VERSION) { 765236884Smm VERIFY(nvpair_value_uint64(elem, &ver) == 0); 766236884Smm } else { 767236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 768236884Smm ver = SPA_VERSION_FEATURES; 769236884Smm need_sync = B_TRUE; 770236884Smm } 771236884Smm 772236884Smm /* Save time if the version is already set. */ 773236884Smm if (ver == spa_version(spa)) 774236884Smm continue; 775236884Smm 776236884Smm /* 777236884Smm * In addition to the pool directory object, we might 778236884Smm * create the pool properties object, the features for 779236884Smm * read object, the features for write object, or the 780236884Smm * feature descriptions object. 
781236884Smm */ 782248571Smm error = dsl_sync_task(spa->spa_name, NULL, 783268473Sdelphij spa_sync_version, &ver, 784268473Sdelphij 6, ZFS_SPACE_CHECK_RESERVED); 785236884Smm if (error) 786236884Smm return (error); 787236884Smm continue; 788236884Smm } 789236884Smm 790209962Smm need_sync = B_TRUE; 791209962Smm break; 792209962Smm } 793209962Smm 794236884Smm if (need_sync) { 795248571Smm return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 796268473Sdelphij nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 797236884Smm } 798236884Smm 799236884Smm return (0); 800185029Spjd} 801185029Spjd 802185029Spjd/* 803185029Spjd * If the bootfs property value is dsobj, clear it. 804185029Spjd */ 805185029Spjdvoid 806185029Spjdspa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 807185029Spjd{ 808185029Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 809185029Spjd VERIFY(zap_remove(spa->spa_meta_objset, 810185029Spjd spa->spa_pool_props_object, 811185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 812185029Spjd spa->spa_bootfs = 0; 813185029Spjd } 814185029Spjd} 815185029Spjd 816239620Smm/*ARGSUSED*/ 817239620Smmstatic int 818248571Smmspa_change_guid_check(void *arg, dmu_tx_t *tx) 819239620Smm{ 820248571Smm uint64_t *newguid = arg; 821248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 822239620Smm vdev_t *rvd = spa->spa_root_vdev; 823239620Smm uint64_t vdev_state; 824239620Smm 825332547Smav if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 826332547Smav int error = (spa_has_checkpoint(spa)) ? 
827332547Smav ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 828332547Smav return (SET_ERROR(error)); 829332547Smav } 830332547Smav 831239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 832239620Smm vdev_state = rvd->vdev_state; 833239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 834239620Smm 835239620Smm if (vdev_state != VDEV_STATE_HEALTHY) 836249195Smm return (SET_ERROR(ENXIO)); 837239620Smm 838239620Smm ASSERT3U(spa_guid(spa), !=, *newguid); 839239620Smm 840239620Smm return (0); 841239620Smm} 842239620Smm 843239620Smmstatic void 844248571Smmspa_change_guid_sync(void *arg, dmu_tx_t *tx) 845239620Smm{ 846248571Smm uint64_t *newguid = arg; 847248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 848239620Smm uint64_t oldguid; 849239620Smm vdev_t *rvd = spa->spa_root_vdev; 850239620Smm 851239620Smm oldguid = spa_guid(spa); 852239620Smm 853239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 854239620Smm rvd->vdev_guid = *newguid; 855239620Smm rvd->vdev_guid_sum += (*newguid - oldguid); 856239620Smm vdev_config_dirty(rvd); 857239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 858239620Smm 859248571Smm spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 860239620Smm oldguid, *newguid); 861239620Smm} 862239620Smm 863185029Spjd/* 864228103Smm * Change the GUID for the pool. This is done so that we can later 865228103Smm * re-import a pool built from a clone of our own vdevs. We will modify 866228103Smm * the root vdev's guid, our own pool guid, and then mark all of our 867228103Smm * vdevs dirty. Note that we must make sure that all our vdevs are 868228103Smm * online when we do this, or else any vdevs that weren't present 869228103Smm * would be orphaned from our pool. We are also going to issue a 870228103Smm * sysevent to update any watchers. 
871228103Smm */ 872228103Smmint 873228103Smmspa_change_guid(spa_t *spa) 874228103Smm{ 875239620Smm int error; 876239620Smm uint64_t guid; 877228103Smm 878254074Sdelphij mutex_enter(&spa->spa_vdev_top_lock); 879239620Smm mutex_enter(&spa_namespace_lock); 880239620Smm guid = spa_generate_guid(NULL); 881228103Smm 882248571Smm error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 883268473Sdelphij spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 884228103Smm 885239620Smm if (error == 0) { 886332525Smav spa_write_cachefile(spa, B_FALSE, B_TRUE); 887331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 888239620Smm } 889228103Smm 890239620Smm mutex_exit(&spa_namespace_lock); 891254074Sdelphij mutex_exit(&spa->spa_vdev_top_lock); 892228103Smm 893239620Smm return (error); 894228103Smm} 895228103Smm 896228103Smm/* 897185029Spjd * ========================================================================== 898168404Spjd * SPA state manipulation (open/create/destroy/import/export) 899168404Spjd * ========================================================================== 900168404Spjd */ 901168404Spjd 902168404Spjdstatic int 903168404Spjdspa_error_entry_compare(const void *a, const void *b) 904168404Spjd{ 905168404Spjd spa_error_entry_t *sa = (spa_error_entry_t *)a; 906168404Spjd spa_error_entry_t *sb = (spa_error_entry_t *)b; 907168404Spjd int ret; 908168404Spjd 909168404Spjd ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 910268123Sdelphij sizeof (zbookmark_phys_t)); 911168404Spjd 912168404Spjd if (ret < 0) 913168404Spjd return (-1); 914168404Spjd else if (ret > 0) 915168404Spjd return (1); 916168404Spjd else 917168404Spjd return (0); 918168404Spjd} 919168404Spjd 920168404Spjd/* 921168404Spjd * Utility function which retrieves copies of the current logs and 922168404Spjd * re-initializes them in the process. 
923168404Spjd */ 924168404Spjdvoid 925168404Spjdspa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 926168404Spjd{ 927168404Spjd ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 928168404Spjd 929168404Spjd bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 930168404Spjd bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 931168404Spjd 932168404Spjd avl_create(&spa->spa_errlist_scrub, 933168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 934168404Spjd offsetof(spa_error_entry_t, se_avl)); 935168404Spjd avl_create(&spa->spa_errlist_last, 936168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 937168404Spjd offsetof(spa_error_entry_t, se_avl)); 938168404Spjd} 939168404Spjd 940258631Savgstatic void 941258631Savgspa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 942168404Spjd{ 943258631Savg const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 944258631Savg enum zti_modes mode = ztip->zti_mode; 945258631Savg uint_t value = ztip->zti_value; 946258631Savg uint_t count = ztip->zti_count; 947258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 948258631Savg char name[32]; 949258630Savg uint_t flags = 0; 950219089Spjd boolean_t batch = B_FALSE; 951168404Spjd 952258631Savg if (mode == ZTI_MODE_NULL) { 953258631Savg tqs->stqs_count = 0; 954258631Savg tqs->stqs_taskq = NULL; 955258631Savg return; 956258631Savg } 957168404Spjd 958258631Savg ASSERT3U(count, >, 0); 959168404Spjd 960258631Savg tqs->stqs_count = count; 961258631Savg tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 962219089Spjd 963258632Savg switch (mode) { 964258632Savg case ZTI_MODE_FIXED: 965258632Savg ASSERT3U(value, >=, 1); 966258632Savg value = MAX(value, 1); 967258632Savg break; 968219089Spjd 969258632Savg case ZTI_MODE_BATCH: 970258632Savg batch = B_TRUE; 971258632Savg flags |= TASKQ_THREADS_CPU_PCT; 972258632Savg value = zio_taskq_batch_pct; 973258632Savg break; 974219089Spjd 975258632Savg default: 976258632Savg 
panic("unrecognized mode for %s_%s taskq (%u:%u) in " 977258632Savg "spa_activate()", 978258632Savg zio_type_name[t], zio_taskq_types[q], mode, value); 979258632Savg break; 980258632Savg } 981258631Savg 982258632Savg for (uint_t i = 0; i < count; i++) { 983258632Savg taskq_t *tq; 984258631Savg 985258631Savg if (count > 1) { 986258631Savg (void) snprintf(name, sizeof (name), "%s_%s_%u", 987258631Savg zio_type_name[t], zio_taskq_types[q], i); 988258631Savg } else { 989258631Savg (void) snprintf(name, sizeof (name), "%s_%s", 990258631Savg zio_type_name[t], zio_taskq_types[q]); 991258631Savg } 992258631Savg 993219089Spjd#ifdef SYSDC 994258631Savg if (zio_taskq_sysdc && spa->spa_proc != &p0) { 995258631Savg if (batch) 996258631Savg flags |= TASKQ_DC_BATCH; 997219089Spjd 998258631Savg tq = taskq_create_sysdc(name, value, 50, INT_MAX, 999258631Savg spa->spa_proc, zio_taskq_basedc, flags); 1000258631Savg } else { 1001258631Savg#endif 1002258632Savg pri_t pri = maxclsyspri; 1003258632Savg /* 1004258632Savg * The write issue taskq can be extremely CPU 1005258632Savg * intensive. Run it at slightly lower priority 1006258632Savg * than the other taskqs. 1007314858Savg * FreeBSD notes: 1008314858Savg * - numerically higher priorities are lower priorities; 1009314858Savg * - if priorities divided by four (RQ_PPQ) are equal 1010314858Savg * then a difference between them is insignificant. 
1011258632Savg */ 1012258632Savg if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) 1013314858Savg#ifdef illumos 1014314858Savg pri--; 1015314858Savg#else 1016314858Savg pri += 4; 1017314858Savg#endif 1018258632Savg 1019258632Savg tq = taskq_create_proc(name, value, pri, 50, 1020258631Savg INT_MAX, spa->spa_proc, flags); 1021258631Savg#ifdef SYSDC 1022258631Savg } 1023258631Savg#endif 1024258631Savg 1025258631Savg tqs->stqs_taskq[i] = tq; 1026219089Spjd } 1027219089Spjd} 1028219089Spjd 1029219089Spjdstatic void 1030258631Savgspa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1031258631Savg{ 1032258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1033258631Savg 1034258631Savg if (tqs->stqs_taskq == NULL) { 1035258631Savg ASSERT0(tqs->stqs_count); 1036258631Savg return; 1037258631Savg } 1038258631Savg 1039258631Savg for (uint_t i = 0; i < tqs->stqs_count; i++) { 1040258631Savg ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1041258631Savg taskq_destroy(tqs->stqs_taskq[i]); 1042258631Savg } 1043258631Savg 1044258631Savg kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1045258631Savg tqs->stqs_taskq = NULL; 1046258631Savg} 1047258631Savg 1048258631Savg/* 1049258631Savg * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1050258631Savg * Note that a type may have multiple discrete taskqs to avoid lock contention 1051258631Savg * on the taskq itself. In that case we choose which taskq at random by using 1052258631Savg * the low bits of gethrtime(). 
1053258631Savg */ 1054258631Savgvoid 1055258631Savgspa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1056258631Savg task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) 1057258631Savg{ 1058258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1059258631Savg taskq_t *tq; 1060258631Savg 1061258631Savg ASSERT3P(tqs->stqs_taskq, !=, NULL); 1062258631Savg ASSERT3U(tqs->stqs_count, !=, 0); 1063258631Savg 1064258631Savg if (tqs->stqs_count == 1) { 1065258631Savg tq = tqs->stqs_taskq[0]; 1066258631Savg } else { 1067267038Sbdrewery#ifdef _KERNEL 1068267029Smav tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count]; 1069267038Sbdrewery#else 1070267038Sbdrewery tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; 1071267038Sbdrewery#endif 1072258631Savg } 1073258631Savg 1074258631Savg taskq_dispatch_ent(tq, func, arg, flags, ent); 1075258631Savg} 1076258631Savg 1077258631Savgstatic void 1078219089Spjdspa_create_zio_taskqs(spa_t *spa) 1079219089Spjd{ 1080185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 1081185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1082258631Savg spa_taskqs_init(spa, t, q); 1083219089Spjd } 1084219089Spjd } 1085219089Spjd} 1086209962Smm 1087219089Spjd#ifdef _KERNEL 1088219089Spjd#ifdef SPA_PROCESS 1089219089Spjdstatic void 1090219089Spjdspa_thread(void *arg) 1091219089Spjd{ 1092219089Spjd callb_cpr_t cprinfo; 1093209962Smm 1094219089Spjd spa_t *spa = arg; 1095219089Spjd user_t *pu = PTOU(curproc); 1096209962Smm 1097219089Spjd CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1098219089Spjd spa->spa_name); 1099209962Smm 1100219089Spjd ASSERT(curproc != &p0); 1101219089Spjd (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1102219089Spjd "zpool-%s", spa->spa_name); 1103219089Spjd (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1104211931Smm 1105219089Spjd#ifdef PSRSET_BIND 1106219089Spjd /* bind this thread to the requested psrset */ 1107219089Spjd if (zio_taskq_psrset_bind != PS_NONE) { 
1108219089Spjd pool_lock(); 1109219089Spjd mutex_enter(&cpu_lock); 1110219089Spjd mutex_enter(&pidlock); 1111219089Spjd mutex_enter(&curproc->p_lock); 1112219089Spjd 1113219089Spjd if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1114219089Spjd 0, NULL, NULL) == 0) { 1115219089Spjd curthread->t_bind_pset = zio_taskq_psrset_bind; 1116219089Spjd } else { 1117219089Spjd cmn_err(CE_WARN, 1118219089Spjd "Couldn't bind process for zfs pool \"%s\" to " 1119219089Spjd "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1120219089Spjd } 1121219089Spjd 1122219089Spjd mutex_exit(&curproc->p_lock); 1123219089Spjd mutex_exit(&pidlock); 1124219089Spjd mutex_exit(&cpu_lock); 1125219089Spjd pool_unlock(); 1126219089Spjd } 1127219089Spjd#endif 1128219089Spjd 1129219089Spjd#ifdef SYSDC 1130219089Spjd if (zio_taskq_sysdc) { 1131219089Spjd sysdc_thread_enter(curthread, 100, 0); 1132219089Spjd } 1133219089Spjd#endif 1134219089Spjd 1135219089Spjd spa->spa_proc = curproc; 1136219089Spjd spa->spa_did = curthread->t_did; 1137219089Spjd 1138219089Spjd spa_create_zio_taskqs(spa); 1139219089Spjd 1140219089Spjd mutex_enter(&spa->spa_proc_lock); 1141219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1142219089Spjd 1143219089Spjd spa->spa_proc_state = SPA_PROC_ACTIVE; 1144219089Spjd cv_broadcast(&spa->spa_proc_cv); 1145219089Spjd 1146219089Spjd CALLB_CPR_SAFE_BEGIN(&cprinfo); 1147219089Spjd while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1148219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1149219089Spjd CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1150219089Spjd 1151219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1152219089Spjd spa->spa_proc_state = SPA_PROC_GONE; 1153219089Spjd spa->spa_proc = &p0; 1154219089Spjd cv_broadcast(&spa->spa_proc_cv); 1155219089Spjd CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1156219089Spjd 1157219089Spjd mutex_enter(&curproc->p_lock); 1158219089Spjd lwp_exit(); 1159219089Spjd} 1160219089Spjd#endif /* 
SPA_PROCESS */ 1161219089Spjd#endif 1162219089Spjd 1163219089Spjd/* 1164219089Spjd * Activate an uninitialized pool. 1165219089Spjd */ 1166219089Spjdstatic void 1167219089Spjdspa_activate(spa_t *spa, int mode) 1168219089Spjd{ 1169219089Spjd ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1170219089Spjd 1171219089Spjd spa->spa_state = POOL_STATE_ACTIVE; 1172219089Spjd spa->spa_mode = mode; 1173219089Spjd 1174219089Spjd spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 1175219089Spjd spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 1176219089Spjd 1177219089Spjd /* Try to create a covering process */ 1178219089Spjd mutex_enter(&spa->spa_proc_lock); 1179219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1180219089Spjd ASSERT(spa->spa_proc == &p0); 1181219089Spjd spa->spa_did = 0; 1182219089Spjd 1183219089Spjd#ifdef SPA_PROCESS 1184219089Spjd /* Only create a process if we're going to be around a while. */ 1185219089Spjd if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1186219089Spjd if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1187219089Spjd NULL, 0) == 0) { 1188219089Spjd spa->spa_proc_state = SPA_PROC_CREATED; 1189219089Spjd while (spa->spa_proc_state == SPA_PROC_CREATED) { 1190219089Spjd cv_wait(&spa->spa_proc_cv, 1191219089Spjd &spa->spa_proc_lock); 1192209962Smm } 1193219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1194219089Spjd ASSERT(spa->spa_proc != &p0); 1195219089Spjd ASSERT(spa->spa_did != 0); 1196219089Spjd } else { 1197219089Spjd#ifdef _KERNEL 1198219089Spjd cmn_err(CE_WARN, 1199219089Spjd "Couldn't create process for zfs pool \"%s\"\n", 1200219089Spjd spa->spa_name); 1201219089Spjd#endif 1202185029Spjd } 1203168404Spjd } 1204219089Spjd#endif /* SPA_PROCESS */ 1205219089Spjd mutex_exit(&spa->spa_proc_lock); 1206168404Spjd 1207219089Spjd /* If we didn't create a process, we need to create our taskqs. 
*/ 1208219089Spjd ASSERT(spa->spa_proc == &p0); 1209219089Spjd if (spa->spa_proc == &p0) { 1210219089Spjd spa_create_zio_taskqs(spa); 1211219089Spjd } 1212219089Spjd 1213240868Spjd /* 1214240868Spjd * Start TRIM thread. 1215240868Spjd */ 1216240868Spjd trim_thread_create(spa); 1217240868Spjd 1218332525Smav for (size_t i = 0; i < TXG_SIZE; i++) 1219332525Smav spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0); 1220332525Smav 1221185029Spjd list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1222185029Spjd offsetof(vdev_t, vdev_config_dirty_node)); 1223286575Smav list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1224286575Smav offsetof(objset_t, os_evicting_node)); 1225185029Spjd list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1226185029Spjd offsetof(vdev_t, vdev_state_dirty_node)); 1227168404Spjd 1228321567Smav txg_list_create(&spa->spa_vdev_txg_list, spa, 1229168404Spjd offsetof(struct vdev, vdev_txg_node)); 1230168404Spjd 1231168404Spjd avl_create(&spa->spa_errlist_scrub, 1232168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 1233168404Spjd offsetof(spa_error_entry_t, se_avl)); 1234168404Spjd avl_create(&spa->spa_errlist_last, 1235168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 1236168404Spjd offsetof(spa_error_entry_t, se_avl)); 1237168404Spjd} 1238168404Spjd 1239168404Spjd/* 1240168404Spjd * Opposite of spa_activate(). 1241168404Spjd */ 1242168404Spjdstatic void 1243168404Spjdspa_deactivate(spa_t *spa) 1244168404Spjd{ 1245168404Spjd ASSERT(spa->spa_sync_on == B_FALSE); 1246168404Spjd ASSERT(spa->spa_dsl_pool == NULL); 1247168404Spjd ASSERT(spa->spa_root_vdev == NULL); 1248209962Smm ASSERT(spa->spa_async_zio_root == NULL); 1249168404Spjd ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1250168404Spjd 1251240868Spjd /* 1252240868Spjd * Stop TRIM thread in case spa_unload() wasn't called directly 1253240868Spjd * before spa_deactivate(). 
1254240868Spjd */ 1255240868Spjd trim_thread_destroy(spa); 1256240868Spjd 1257286575Smav spa_evicting_os_wait(spa); 1258286575Smav 1259168404Spjd txg_list_destroy(&spa->spa_vdev_txg_list); 1260168404Spjd 1261185029Spjd list_destroy(&spa->spa_config_dirty_list); 1262286575Smav list_destroy(&spa->spa_evicting_os_list); 1263185029Spjd list_destroy(&spa->spa_state_dirty_list); 1264168404Spjd 1265185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 1266185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1267258631Savg spa_taskqs_fini(spa, t, q); 1268185029Spjd } 1269168404Spjd } 1270168404Spjd 1271332525Smav for (size_t i = 0; i < TXG_SIZE; i++) { 1272332525Smav ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1273332525Smav VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1274332525Smav spa->spa_txg_zio[i] = NULL; 1275332525Smav } 1276332525Smav 1277168404Spjd metaslab_class_destroy(spa->spa_normal_class); 1278168404Spjd spa->spa_normal_class = NULL; 1279168404Spjd 1280185029Spjd metaslab_class_destroy(spa->spa_log_class); 1281185029Spjd spa->spa_log_class = NULL; 1282185029Spjd 1283168404Spjd /* 1284168404Spjd * If this was part of an import or the open otherwise failed, we may 1285168404Spjd * still have errors left in the queues. Empty them just in case. 
1286168404Spjd */ 1287168404Spjd spa_errlog_drain(spa); 1288168404Spjd 1289168404Spjd avl_destroy(&spa->spa_errlist_scrub); 1290168404Spjd avl_destroy(&spa->spa_errlist_last); 1291168404Spjd 1292168404Spjd spa->spa_state = POOL_STATE_UNINITIALIZED; 1293219089Spjd 1294219089Spjd mutex_enter(&spa->spa_proc_lock); 1295219089Spjd if (spa->spa_proc_state != SPA_PROC_NONE) { 1296219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1297219089Spjd spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1298219089Spjd cv_broadcast(&spa->spa_proc_cv); 1299219089Spjd while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1300219089Spjd ASSERT(spa->spa_proc != &p0); 1301219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1302219089Spjd } 1303219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1304219089Spjd spa->spa_proc_state = SPA_PROC_NONE; 1305219089Spjd } 1306219089Spjd ASSERT(spa->spa_proc == &p0); 1307219089Spjd mutex_exit(&spa->spa_proc_lock); 1308219089Spjd 1309219089Spjd#ifdef SPA_PROCESS 1310219089Spjd /* 1311219089Spjd * We want to make sure spa_thread() has actually exited the ZFS 1312219089Spjd * module, so that the module can't be unloaded out from underneath 1313219089Spjd * it. 1314219089Spjd */ 1315219089Spjd if (spa->spa_did != 0) { 1316219089Spjd thread_join(spa->spa_did); 1317219089Spjd spa->spa_did = 0; 1318219089Spjd } 1319219089Spjd#endif /* SPA_PROCESS */ 1320168404Spjd} 1321168404Spjd 1322168404Spjd/* 1323168404Spjd * Verify a pool configuration, and construct the vdev tree appropriately. This 1324168404Spjd * will create all the necessary vdevs in the appropriate layout, with each vdev 1325168404Spjd * in the CLOSED state. This will prep the pool before open/creation/import. 1326168404Spjd * All vdev validation is done by the vdev_alloc() routine. 
1327168404Spjd */ 1328168404Spjdstatic int 1329168404Spjdspa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1330168404Spjd uint_t id, int atype) 1331168404Spjd{ 1332168404Spjd nvlist_t **child; 1333219089Spjd uint_t children; 1334168404Spjd int error; 1335168404Spjd 1336168404Spjd if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1337168404Spjd return (error); 1338168404Spjd 1339168404Spjd if ((*vdp)->vdev_ops->vdev_op_leaf) 1340168404Spjd return (0); 1341168404Spjd 1342185029Spjd error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1343185029Spjd &child, &children); 1344185029Spjd 1345185029Spjd if (error == ENOENT) 1346185029Spjd return (0); 1347185029Spjd 1348185029Spjd if (error) { 1349168404Spjd vdev_free(*vdp); 1350168404Spjd *vdp = NULL; 1351249195Smm return (SET_ERROR(EINVAL)); 1352168404Spjd } 1353168404Spjd 1354219089Spjd for (int c = 0; c < children; c++) { 1355168404Spjd vdev_t *vd; 1356168404Spjd if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1357168404Spjd atype)) != 0) { 1358168404Spjd vdev_free(*vdp); 1359168404Spjd *vdp = NULL; 1360168404Spjd return (error); 1361168404Spjd } 1362168404Spjd } 1363168404Spjd 1364168404Spjd ASSERT(*vdp != NULL); 1365168404Spjd 1366168404Spjd return (0); 1367168404Spjd} 1368168404Spjd 1369168404Spjd/* 1370168404Spjd * Opposite of spa_load(). 1371168404Spjd */ 1372168404Spjdstatic void 1373168404Spjdspa_unload(spa_t *spa) 1374168404Spjd{ 1375168404Spjd int i; 1376168404Spjd 1377185029Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1378185029Spjd 1379332530Smav spa_load_note(spa, "UNLOADING"); 1380332530Smav 1381168404Spjd /* 1382240868Spjd * Stop TRIM thread. 1383240868Spjd */ 1384240868Spjd trim_thread_destroy(spa); 1385240868Spjd 1386240868Spjd /* 1387168404Spjd * Stop async tasks. 1388168404Spjd */ 1389168404Spjd spa_async_suspend(spa); 1390168404Spjd 1391168404Spjd /* 1392168404Spjd * Stop syncing. 
1393168404Spjd */ 1394168404Spjd if (spa->spa_sync_on) { 1395168404Spjd txg_sync_stop(spa->spa_dsl_pool); 1396168404Spjd spa->spa_sync_on = B_FALSE; 1397168404Spjd } 1398168404Spjd 1399168404Spjd /* 1400321529Smav * Even though vdev_free() also calls vdev_metaslab_fini, we need 1401321529Smav * to call it earlier, before we wait for async i/o to complete. 1402321529Smav * This ensures that there is no async metaslab prefetching, by 1403321529Smav * calling taskq_wait(mg_taskq). 1404321529Smav */ 1405321529Smav if (spa->spa_root_vdev != NULL) { 1406321529Smav spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1407321529Smav for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) 1408321529Smav vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); 1409321529Smav spa_config_exit(spa, SCL_ALL, FTAG); 1410321529Smav } 1411321529Smav 1412321529Smav /* 1413185029Spjd * Wait for any outstanding async I/O to complete. 1414168404Spjd */ 1415209962Smm if (spa->spa_async_zio_root != NULL) { 1416272598Sdelphij for (int i = 0; i < max_ncpus; i++) 1417272598Sdelphij (void) zio_wait(spa->spa_async_zio_root[i]); 1418272598Sdelphij kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 1419209962Smm spa->spa_async_zio_root = NULL; 1420209962Smm } 1421168404Spjd 1422332525Smav if (spa->spa_vdev_removal != NULL) { 1423332525Smav spa_vdev_removal_destroy(spa->spa_vdev_removal); 1424332525Smav spa->spa_vdev_removal = NULL; 1425332525Smav } 1426332525Smav 1427332537Smav if (spa->spa_condense_zthr != NULL) { 1428332537Smav ASSERT(!zthr_isrunning(spa->spa_condense_zthr)); 1429332537Smav zthr_destroy(spa->spa_condense_zthr); 1430332537Smav spa->spa_condense_zthr = NULL; 1431332537Smav } 1432332537Smav 1433332547Smav if (spa->spa_checkpoint_discard_zthr != NULL) { 1434332547Smav ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr)); 1435332547Smav zthr_destroy(spa->spa_checkpoint_discard_zthr); 1436332547Smav spa->spa_checkpoint_discard_zthr = NULL; 1437332547Smav } 
1438332547Smav 1439332525Smav spa_condense_fini(spa); 1440332525Smav 1441219089Spjd bpobj_close(&spa->spa_deferred_bpobj); 1442219089Spjd 1443258717Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1444258717Savg 1445168404Spjd /* 1446258717Savg * Close all vdevs. 1447258717Savg */ 1448258717Savg if (spa->spa_root_vdev) 1449258717Savg vdev_free(spa->spa_root_vdev); 1450258717Savg ASSERT(spa->spa_root_vdev == NULL); 1451258717Savg 1452258717Savg /* 1453168404Spjd * Close the dsl pool. 1454168404Spjd */ 1455168404Spjd if (spa->spa_dsl_pool) { 1456168404Spjd dsl_pool_close(spa->spa_dsl_pool); 1457168404Spjd spa->spa_dsl_pool = NULL; 1458219089Spjd spa->spa_meta_objset = NULL; 1459168404Spjd } 1460168404Spjd 1461219089Spjd ddt_unload(spa); 1462219089Spjd 1463168404Spjd /* 1464209962Smm * Drop and purge level 2 cache 1465209962Smm */ 1466209962Smm spa_l2cache_drop(spa); 1467209962Smm 1468185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1469185029Spjd vdev_free(spa->spa_spares.sav_vdevs[i]); 1470185029Spjd if (spa->spa_spares.sav_vdevs) { 1471185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1472185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1473185029Spjd spa->spa_spares.sav_vdevs = NULL; 1474168404Spjd } 1475185029Spjd if (spa->spa_spares.sav_config) { 1476185029Spjd nvlist_free(spa->spa_spares.sav_config); 1477185029Spjd spa->spa_spares.sav_config = NULL; 1478168404Spjd } 1479185029Spjd spa->spa_spares.sav_count = 0; 1480168404Spjd 1481230514Smm for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 1482230514Smm vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1483185029Spjd vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1484230514Smm } 1485185029Spjd if (spa->spa_l2cache.sav_vdevs) { 1486185029Spjd kmem_free(spa->spa_l2cache.sav_vdevs, 1487185029Spjd spa->spa_l2cache.sav_count * sizeof (void *)); 1488185029Spjd spa->spa_l2cache.sav_vdevs = NULL; 1489185029Spjd } 1490185029Spjd if (spa->spa_l2cache.sav_config) { 1491185029Spjd 
nvlist_free(spa->spa_l2cache.sav_config); 1492185029Spjd spa->spa_l2cache.sav_config = NULL; 1493185029Spjd } 1494185029Spjd spa->spa_l2cache.sav_count = 0; 1495185029Spjd 1496168404Spjd spa->spa_async_suspended = 0; 1497209962Smm 1498332525Smav spa->spa_indirect_vdevs_loaded = B_FALSE; 1499332525Smav 1500228103Smm if (spa->spa_comment != NULL) { 1501228103Smm spa_strfree(spa->spa_comment); 1502228103Smm spa->spa_comment = NULL; 1503228103Smm } 1504228103Smm 1505209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 1506168404Spjd} 1507168404Spjd 1508168404Spjd/* 1509168404Spjd * Load (or re-load) the current list of vdevs describing the active spares for 1510168404Spjd * this pool. When this is called, we have some form of basic information in 1511185029Spjd * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1512185029Spjd * then re-generate a more complete list including status information. 1513168404Spjd */ 1514332525Smavvoid 1515168404Spjdspa_load_spares(spa_t *spa) 1516168404Spjd{ 1517168404Spjd nvlist_t **spares; 1518168404Spjd uint_t nspares; 1519168404Spjd int i; 1520168404Spjd vdev_t *vd, *tvd; 1521168404Spjd 1522332547Smav#ifndef _KERNEL 1523332547Smav /* 1524332547Smav * zdb opens both the current state of the pool and the 1525332547Smav * checkpointed state (if present), with a different spa_t. 1526332547Smav * 1527332547Smav * As spare vdevs are shared among open pools, we skip loading 1528332547Smav * them when we load the checkpointed state of the pool. 1529332547Smav */ 1530332547Smav if (!spa_writeable(spa)) 1531332547Smav return; 1532332547Smav#endif 1533332547Smav 1534185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1535185029Spjd 1536168404Spjd /* 1537168404Spjd * First, close and free any existing spare vdevs. 
1538168404Spjd */ 1539185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1540185029Spjd vd = spa->spa_spares.sav_vdevs[i]; 1541168404Spjd 1542168404Spjd /* Undo the call to spa_activate() below */ 1543185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1544185029Spjd B_FALSE)) != NULL && tvd->vdev_isspare) 1545168404Spjd spa_spare_remove(tvd); 1546168404Spjd vdev_close(vd); 1547168404Spjd vdev_free(vd); 1548168404Spjd } 1549168404Spjd 1550185029Spjd if (spa->spa_spares.sav_vdevs) 1551185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1552185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1553168404Spjd 1554185029Spjd if (spa->spa_spares.sav_config == NULL) 1555168404Spjd nspares = 0; 1556168404Spjd else 1557185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1558168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1559168404Spjd 1560185029Spjd spa->spa_spares.sav_count = (int)nspares; 1561185029Spjd spa->spa_spares.sav_vdevs = NULL; 1562168404Spjd 1563168404Spjd if (nspares == 0) 1564168404Spjd return; 1565168404Spjd 1566168404Spjd /* 1567168404Spjd * Construct the array of vdevs, opening them to get status in the 1568168404Spjd * process. For each spare, there is potentially two different vdev_t 1569168404Spjd * structures associated with it: one in the list of spares (used only 1570168404Spjd * for basic validation purposes) and one in the active vdev 1571168404Spjd * configuration (if it's spared in). During this phase we open and 1572168404Spjd * validate each vdev on the spare list. If the vdev also exists in the 1573168404Spjd * active configuration, then we also mark this vdev as an active spare. 
1574168404Spjd */ 1575185029Spjd spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1576185029Spjd KM_SLEEP); 1577185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1578168404Spjd VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1579168404Spjd VDEV_ALLOC_SPARE) == 0); 1580168404Spjd ASSERT(vd != NULL); 1581168404Spjd 1582185029Spjd spa->spa_spares.sav_vdevs[i] = vd; 1583168404Spjd 1584185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1585185029Spjd B_FALSE)) != NULL) { 1586168404Spjd if (!tvd->vdev_isspare) 1587168404Spjd spa_spare_add(tvd); 1588168404Spjd 1589168404Spjd /* 1590168404Spjd * We only mark the spare active if we were successfully 1591168404Spjd * able to load the vdev. Otherwise, importing a pool 1592168404Spjd * with a bad active spare would result in strange 1593168404Spjd * behavior, because multiple pool would think the spare 1594168404Spjd * is actively in use. 1595168404Spjd * 1596168404Spjd * There is a vulnerability here to an equally bizarre 1597168404Spjd * circumstance, where a dead active spare is later 1598168404Spjd * brought back to life (onlined or otherwise). Given 1599168404Spjd * the rarity of this scenario, and the extra complexity 1600168404Spjd * it adds, we ignore the possibility. 1601168404Spjd */ 1602168404Spjd if (!vdev_is_dead(tvd)) 1603168404Spjd spa_spare_activate(tvd); 1604168404Spjd } 1605168404Spjd 1606185029Spjd vd->vdev_top = vd; 1607209962Smm vd->vdev_aux = &spa->spa_spares; 1608185029Spjd 1609168404Spjd if (vdev_open(vd) != 0) 1610168404Spjd continue; 1611168404Spjd 1612185029Spjd if (vdev_validate_aux(vd) == 0) 1613185029Spjd spa_spare_add(vd); 1614168404Spjd } 1615168404Spjd 1616168404Spjd /* 1617168404Spjd * Recompute the stashed list of spares, with status information 1618168404Spjd * this time. 
1619168404Spjd */ 1620185029Spjd VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1621168404Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1622168404Spjd 1623185029Spjd spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1624185029Spjd KM_SLEEP); 1625185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1626185029Spjd spares[i] = vdev_config_generate(spa, 1627219089Spjd spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1628185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1629185029Spjd ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1630185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1631168404Spjd nvlist_free(spares[i]); 1632185029Spjd kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1633168404Spjd} 1634168404Spjd 1635185029Spjd/* 1636185029Spjd * Load (or re-load) the current list of vdevs describing the active l2cache for 1637185029Spjd * this pool. When this is called, we have some form of basic information in 1638185029Spjd * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1639185029Spjd * then re-generate a more complete list including status information. 1640185029Spjd * Devices which are already active have their details maintained, and are 1641185029Spjd * not re-opened. 1642185029Spjd */ 1643332525Smavvoid 1644185029Spjdspa_load_l2cache(spa_t *spa) 1645185029Spjd{ 1646185029Spjd nvlist_t **l2cache; 1647185029Spjd uint_t nl2cache; 1648185029Spjd int i, j, oldnvdevs; 1649219089Spjd uint64_t guid; 1650185029Spjd vdev_t *vd, **oldvdevs, **newvdevs; 1651185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 1652185029Spjd 1653332547Smav#ifndef _KERNEL 1654332547Smav /* 1655332547Smav * zdb opens both the current state of the pool and the 1656332547Smav * checkpointed state (if present), with a different spa_t. 
1657332547Smav * 1658332547Smav * As L2 caches are part of the ARC which is shared among open 1659332547Smav * pools, we skip loading them when we load the checkpointed 1660332547Smav * state of the pool. 1661332547Smav */ 1662332547Smav if (!spa_writeable(spa)) 1663332547Smav return; 1664332547Smav#endif 1665332547Smav 1666185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1667185029Spjd 1668185029Spjd if (sav->sav_config != NULL) { 1669185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1670185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1671185029Spjd newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1672185029Spjd } else { 1673185029Spjd nl2cache = 0; 1674247187Smm newvdevs = NULL; 1675185029Spjd } 1676185029Spjd 1677185029Spjd oldvdevs = sav->sav_vdevs; 1678185029Spjd oldnvdevs = sav->sav_count; 1679185029Spjd sav->sav_vdevs = NULL; 1680185029Spjd sav->sav_count = 0; 1681185029Spjd 1682185029Spjd /* 1683185029Spjd * Process new nvlist of vdevs. 1684185029Spjd */ 1685185029Spjd for (i = 0; i < nl2cache; i++) { 1686185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1687185029Spjd &guid) == 0); 1688185029Spjd 1689185029Spjd newvdevs[i] = NULL; 1690185029Spjd for (j = 0; j < oldnvdevs; j++) { 1691185029Spjd vd = oldvdevs[j]; 1692185029Spjd if (vd != NULL && guid == vd->vdev_guid) { 1693185029Spjd /* 1694185029Spjd * Retain previous vdev for add/remove ops. 
1695185029Spjd */ 1696185029Spjd newvdevs[i] = vd; 1697185029Spjd oldvdevs[j] = NULL; 1698185029Spjd break; 1699185029Spjd } 1700185029Spjd } 1701185029Spjd 1702185029Spjd if (newvdevs[i] == NULL) { 1703185029Spjd /* 1704185029Spjd * Create new vdev 1705185029Spjd */ 1706185029Spjd VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1707185029Spjd VDEV_ALLOC_L2CACHE) == 0); 1708185029Spjd ASSERT(vd != NULL); 1709185029Spjd newvdevs[i] = vd; 1710185029Spjd 1711185029Spjd /* 1712185029Spjd * Commit this vdev as an l2cache device, 1713185029Spjd * even if it fails to open. 1714185029Spjd */ 1715185029Spjd spa_l2cache_add(vd); 1716185029Spjd 1717185029Spjd vd->vdev_top = vd; 1718185029Spjd vd->vdev_aux = sav; 1719185029Spjd 1720185029Spjd spa_l2cache_activate(vd); 1721185029Spjd 1722185029Spjd if (vdev_open(vd) != 0) 1723185029Spjd continue; 1724185029Spjd 1725185029Spjd (void) vdev_validate_aux(vd); 1726185029Spjd 1727219089Spjd if (!vdev_is_dead(vd)) 1728219089Spjd l2arc_add_vdev(spa, vd); 1729185029Spjd } 1730185029Spjd } 1731185029Spjd 1732185029Spjd /* 1733185029Spjd * Purge vdevs that were dropped 1734185029Spjd */ 1735185029Spjd for (i = 0; i < oldnvdevs; i++) { 1736185029Spjd uint64_t pool; 1737185029Spjd 1738185029Spjd vd = oldvdevs[i]; 1739185029Spjd if (vd != NULL) { 1740230514Smm ASSERT(vd->vdev_isl2cache); 1741230514Smm 1742209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1743209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1744185029Spjd l2arc_remove_vdev(vd); 1745230514Smm vdev_clear_stats(vd); 1746230514Smm vdev_free(vd); 1747185029Spjd } 1748185029Spjd } 1749185029Spjd 1750185029Spjd if (oldvdevs) 1751185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1752185029Spjd 1753185029Spjd if (sav->sav_config == NULL) 1754185029Spjd goto out; 1755185029Spjd 1756185029Spjd sav->sav_vdevs = newvdevs; 1757185029Spjd sav->sav_count = (int)nl2cache; 1758185029Spjd 1759185029Spjd /* 1760185029Spjd * Recompute the stashed list of l2cache 
devices, with status 1761185029Spjd * information this time. 1762185029Spjd */ 1763185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1764185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1765185029Spjd 1766185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1767185029Spjd for (i = 0; i < sav->sav_count; i++) 1768185029Spjd l2cache[i] = vdev_config_generate(spa, 1769219089Spjd sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1770185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1771185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1772185029Spjdout: 1773185029Spjd for (i = 0; i < sav->sav_count; i++) 1774185029Spjd nvlist_free(l2cache[i]); 1775185029Spjd if (sav->sav_count) 1776185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1777185029Spjd} 1778185029Spjd 1779168404Spjdstatic int 1780168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1781168404Spjd{ 1782168404Spjd dmu_buf_t *db; 1783168404Spjd char *packed = NULL; 1784168404Spjd size_t nvsize = 0; 1785168404Spjd int error; 1786168404Spjd *value = NULL; 1787168404Spjd 1788262676Sdelphij error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 1789262676Sdelphij if (error != 0) 1790262676Sdelphij return (error); 1791287744Sdelphij 1792168404Spjd nvsize = *(uint64_t *)db->db_data; 1793168404Spjd dmu_buf_rele(db, FTAG); 1794168404Spjd 1795168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1796209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1797209962Smm DMU_READ_PREFETCH); 1798168404Spjd if (error == 0) 1799168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1800168404Spjd kmem_free(packed, nvsize); 1801168404Spjd 1802168404Spjd return (error); 1803168404Spjd} 1804168404Spjd 1805168404Spjd/* 1806332536Smav * Concrete top-level vdevs that are not missing and are not logs. At every 1807332536Smav * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 
1808332536Smav */ 1809332536Smavstatic uint64_t 1810332536Smavspa_healthy_core_tvds(spa_t *spa) 1811332536Smav{ 1812332536Smav vdev_t *rvd = spa->spa_root_vdev; 1813332536Smav uint64_t tvds = 0; 1814332536Smav 1815332536Smav for (uint64_t i = 0; i < rvd->vdev_children; i++) { 1816332536Smav vdev_t *vd = rvd->vdev_child[i]; 1817332536Smav if (vd->vdev_islog) 1818332536Smav continue; 1819332536Smav if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 1820332536Smav tvds++; 1821332536Smav } 1822332536Smav 1823332536Smav return (tvds); 1824332536Smav} 1825332536Smav 1826332536Smav/* 1827185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1828185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 1829185029Spjd */ 1830185029Spjdstatic void 1831185029Spjdspa_check_removed(vdev_t *vd) 1832185029Spjd{ 1833332536Smav for (uint64_t c = 0; c < vd->vdev_children; c++) 1834185029Spjd spa_check_removed(vd->vdev_child[c]); 1835185029Spjd 1836249188Smm if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 1837332525Smav vdev_is_concrete(vd)) { 1838185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1839331397Smav spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 1840185029Spjd } 1841185029Spjd} 1842185029Spjd 1843332536Smavstatic int 1844332536Smavspa_check_for_missing_logs(spa_t *spa) 1845299441Smav{ 1846332536Smav vdev_t *rvd = spa->spa_root_vdev; 1847299441Smav 1848219089Spjd /* 1849219089Spjd * If we're doing a normal import, then build up any additional 1850332536Smav * diagnostic information about missing log devices. 1851219089Spjd * We'll pass this up to the user for further processing. 
1852219089Spjd */ 1853219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1854219089Spjd nvlist_t **child, *nv; 1855219089Spjd uint64_t idx = 0; 1856219089Spjd 1857219089Spjd child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1858219089Spjd KM_SLEEP); 1859219089Spjd VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1860219089Spjd 1861332536Smav for (uint64_t c = 0; c < rvd->vdev_children; c++) { 1862219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1863219089Spjd 1864332536Smav /* 1865332536Smav * We consider a device as missing only if it failed 1866332536Smav * to open (i.e. offline or faulted is not considered 1867332536Smav * as missing). 1868332536Smav */ 1869332536Smav if (tvd->vdev_islog && 1870332536Smav tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 1871332536Smav child[idx++] = vdev_config_generate(spa, tvd, 1872332536Smav B_FALSE, VDEV_CONFIG_MISSING); 1873332536Smav } 1874219089Spjd } 1875219089Spjd 1876332536Smav if (idx > 0) { 1877332536Smav fnvlist_add_nvlist_array(nv, 1878332536Smav ZPOOL_CONFIG_CHILDREN, child, idx); 1879332536Smav fnvlist_add_nvlist(spa->spa_load_info, 1880332536Smav ZPOOL_CONFIG_MISSING_DEVICES, nv); 1881219089Spjd 1882332536Smav for (uint64_t i = 0; i < idx; i++) 1883219089Spjd nvlist_free(child[i]); 1884219089Spjd } 1885219089Spjd nvlist_free(nv); 1886219089Spjd kmem_free(child, rvd->vdev_children * sizeof (char **)); 1887219089Spjd 1888332536Smav if (idx > 0) { 1889332536Smav spa_load_failed(spa, "some log devices are missing"); 1890332549Smav vdev_dbgmsg_print_tree(rvd, 2); 1891332536Smav return (SET_ERROR(ENXIO)); 1892332536Smav } 1893332536Smav } else { 1894332536Smav for (uint64_t c = 0; c < rvd->vdev_children; c++) { 1895332536Smav vdev_t *tvd = rvd->vdev_child[c]; 1896213197Smm 1897332536Smav if (tvd->vdev_islog && 1898332536Smav tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 1899219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 1900332536Smav spa_load_note(spa, "some log devices are " 
1901332536Smav "missing, ZIL is dropped."); 1902332549Smav vdev_dbgmsg_print_tree(rvd, 2); 1903332536Smav break; 1904219089Spjd } 1905219089Spjd } 1906213197Smm } 1907299441Smav 1908332536Smav return (0); 1909213197Smm} 1910213197Smm 1911213197Smm/* 1912185029Spjd * Check for missing log devices 1913185029Spjd */ 1914248571Smmstatic boolean_t 1915185029Spjdspa_check_logs(spa_t *spa) 1916185029Spjd{ 1917248571Smm boolean_t rv = B_FALSE; 1918286686Smav dsl_pool_t *dp = spa_get_dsl(spa); 1919248571Smm 1920185029Spjd switch (spa->spa_log_state) { 1921185029Spjd case SPA_LOG_MISSING: 1922185029Spjd /* need to recheck in case slog has been restored */ 1923185029Spjd case SPA_LOG_UNKNOWN: 1924286686Smav rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 1925286686Smav zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 1926248571Smm if (rv) 1927219089Spjd spa_set_log_state(spa, SPA_LOG_MISSING); 1928185029Spjd break; 1929185029Spjd } 1930248571Smm return (rv); 1931185029Spjd} 1932185029Spjd 1933219089Spjdstatic boolean_t 1934219089Spjdspa_passivate_log(spa_t *spa) 1935219089Spjd{ 1936219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1937219089Spjd boolean_t slog_found = B_FALSE; 1938219089Spjd 1939219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1940219089Spjd 1941219089Spjd if (!spa_has_slogs(spa)) 1942219089Spjd return (B_FALSE); 1943219089Spjd 1944219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1945219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1946219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1947219089Spjd 1948219089Spjd if (tvd->vdev_islog) { 1949219089Spjd metaslab_group_passivate(mg); 1950219089Spjd slog_found = B_TRUE; 1951219089Spjd } 1952219089Spjd } 1953219089Spjd 1954219089Spjd return (slog_found); 1955219089Spjd} 1956219089Spjd 1957219089Spjdstatic void 1958219089Spjdspa_activate_log(spa_t *spa) 1959219089Spjd{ 1960219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1961219089Spjd 1962219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 
1963219089Spjd 1964219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1965219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1966219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1967219089Spjd 1968219089Spjd if (tvd->vdev_islog) 1969219089Spjd metaslab_group_activate(mg); 1970219089Spjd } 1971219089Spjd} 1972219089Spjd 1973219089Spjdint 1974332525Smavspa_reset_logs(spa_t *spa) 1975219089Spjd{ 1976248571Smm int error; 1977219089Spjd 1978332525Smav error = dmu_objset_find(spa_name(spa), zil_reset, 1979248571Smm NULL, DS_FIND_CHILDREN); 1980248571Smm if (error == 0) { 1981219089Spjd /* 1982219089Spjd * We successfully offlined the log device, sync out the 1983219089Spjd * current txg so that the "stubby" block can be removed 1984219089Spjd * by zil_sync(). 1985219089Spjd */ 1986219089Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1987219089Spjd } 1988219089Spjd return (error); 1989219089Spjd} 1990219089Spjd 1991219089Spjdstatic void 1992219089Spjdspa_aux_check_removed(spa_aux_vdev_t *sav) 1993219089Spjd{ 1994219089Spjd int i; 1995219089Spjd 1996219089Spjd for (i = 0; i < sav->sav_count; i++) 1997219089Spjd spa_check_removed(sav->sav_vdevs[i]); 1998219089Spjd} 1999219089Spjd 2000219089Spjdvoid 2001219089Spjdspa_claim_notify(zio_t *zio) 2002219089Spjd{ 2003219089Spjd spa_t *spa = zio->io_spa; 2004219089Spjd 2005219089Spjd if (zio->io_error) 2006219089Spjd return; 2007219089Spjd 2008219089Spjd mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2009219089Spjd if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 2010219089Spjd spa->spa_claim_max_txg = zio->io_bp->blk_birth; 2011219089Spjd mutex_exit(&spa->spa_props_lock); 2012219089Spjd} 2013219089Spjd 2014219089Spjdtypedef struct spa_load_error { 2015219089Spjd uint64_t sle_meta_count; 2016219089Spjd uint64_t sle_data_count; 2017219089Spjd} spa_load_error_t; 2018219089Spjd 2019219089Spjdstatic void 2020219089Spjdspa_load_verify_done(zio_t *zio) 2021219089Spjd{ 2022219089Spjd blkptr_t *bp = zio->io_bp; 2023219089Spjd 
spa_load_error_t *sle = zio->io_private; 2024219089Spjd dmu_object_type_t type = BP_GET_TYPE(bp); 2025219089Spjd int error = zio->io_error; 2026268720Sdelphij spa_t *spa = zio->io_spa; 2027219089Spjd 2028321610Smav abd_free(zio->io_abd); 2029219089Spjd if (error) { 2030236884Smm if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2031219089Spjd type != DMU_OT_INTENT_LOG) 2032270247Sdelphij atomic_inc_64(&sle->sle_meta_count); 2033219089Spjd else 2034270247Sdelphij atomic_inc_64(&sle->sle_data_count); 2035219089Spjd } 2036268720Sdelphij 2037268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 2038339034Ssef spa->spa_load_verify_ios--; 2039268720Sdelphij cv_broadcast(&spa->spa_scrub_io_cv); 2040268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 2041219089Spjd} 2042219089Spjd 2043268720Sdelphij/* 2044268720Sdelphij * Maximum number of concurrent scrub i/os to create while verifying 2045268720Sdelphij * a pool while importing it. 2046268720Sdelphij */ 2047268720Sdelphijint spa_load_verify_maxinflight = 10000; 2048268720Sdelphijboolean_t spa_load_verify_metadata = B_TRUE; 2049268720Sdelphijboolean_t spa_load_verify_data = B_TRUE; 2050268720Sdelphij 2051268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, 2052268720Sdelphij &spa_load_verify_maxinflight, 0, 2053268720Sdelphij "Maximum number of concurrent scrub I/Os to create while verifying a " 2054268720Sdelphij "pool while importing it"); 2055268720Sdelphij 2056268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, 2057268720Sdelphij &spa_load_verify_metadata, 0, 2058268720Sdelphij "Check metadata on import?"); 2059268720Sdelphij 2060268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, 2061268720Sdelphij &spa_load_verify_data, 0, 2062268720Sdelphij "Check user data on import?"); 2063268720Sdelphij 2064219089Spjd/*ARGSUSED*/ 2065219089Spjdstatic int 2066219089Spjdspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t 
*bp, 2067268123Sdelphij const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2068219089Spjd{ 2069286705Smav if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 2070268720Sdelphij return (0); 2071268720Sdelphij /* 2072268720Sdelphij * Note: normally this routine will not be called if 2073268720Sdelphij * spa_load_verify_metadata is not set. However, it may be useful 2074268720Sdelphij * to manually set the flag after the traversal has begun. 2075268720Sdelphij */ 2076268720Sdelphij if (!spa_load_verify_metadata) 2077268720Sdelphij return (0); 2078321610Smav if (!BP_IS_METADATA(bp) && !spa_load_verify_data) 2079268720Sdelphij return (0); 2080219089Spjd 2081268720Sdelphij zio_t *rio = arg; 2082268720Sdelphij size_t size = BP_GET_PSIZE(bp); 2083268720Sdelphij 2084268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 2085339034Ssef while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight) 2086268720Sdelphij cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2087339034Ssef spa->spa_load_verify_ios++; 2088268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 2089268720Sdelphij 2090321610Smav zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2091268720Sdelphij spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2092268720Sdelphij ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2093268720Sdelphij ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2094219089Spjd return (0); 2095219089Spjd} 2096219089Spjd 2097307045Smav/* ARGSUSED */ 2098307045Smavint 2099307045Smavverify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2100307045Smav{ 2101307108Smav if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2102307045Smav return (SET_ERROR(ENAMETOOLONG)); 2103307045Smav 2104307045Smav return (0); 2105307045Smav} 2106307045Smav 2107219089Spjdstatic int 2108219089Spjdspa_load_verify(spa_t *spa) 2109219089Spjd{ 2110219089Spjd zio_t *rio; 2111219089Spjd spa_load_error_t sle = { 0 }; 2112332550Smav zpool_load_policy_t policy; 2113219089Spjd 
boolean_t verify_ok = B_FALSE; 2114268720Sdelphij int error = 0; 2115219089Spjd 2116332550Smav zpool_get_load_policy(spa->spa_config, &policy); 2117219089Spjd 2118332550Smav if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) 2119219089Spjd return (0); 2120219089Spjd 2121307045Smav dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2122307045Smav error = dmu_objset_find_dp(spa->spa_dsl_pool, 2123307045Smav spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2124307045Smav DS_FIND_CHILDREN); 2125307045Smav dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2126307045Smav if (error != 0) 2127307045Smav return (error); 2128307045Smav 2129219089Spjd rio = zio_root(spa, NULL, &sle, 2130219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2131219089Spjd 2132268720Sdelphij if (spa_load_verify_metadata) { 2133332530Smav if (spa->spa_extreme_rewind) { 2134332530Smav spa_load_note(spa, "performing a complete scan of the " 2135332530Smav "pool since extreme rewind is on. This may take " 2136332530Smav "a very long time.\n (spa_load_verify_data=%u, " 2137332530Smav "spa_load_verify_metadata=%u)", 2138332530Smav spa_load_verify_data, spa_load_verify_metadata); 2139332530Smav } 2140268720Sdelphij error = traverse_pool(spa, spa->spa_verify_min_txg, 2141268720Sdelphij TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, 2142268720Sdelphij spa_load_verify_cb, rio); 2143268720Sdelphij } 2144219089Spjd 2145219089Spjd (void) zio_wait(rio); 2146219089Spjd 2147219089Spjd spa->spa_load_meta_errors = sle.sle_meta_count; 2148219089Spjd spa->spa_load_data_errors = sle.sle_data_count; 2149219089Spjd 2150332531Smav if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2151332531Smav spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2152332531Smav "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2153332531Smav (u_longlong_t)sle.sle_data_count); 2154332531Smav } 2155332531Smav 2156332531Smav if (spa_load_verify_dryrun || 2157332550Smav (!error && sle.sle_meta_count <= 
policy.zlp_maxmeta && 2158332550Smav sle.sle_data_count <= policy.zlp_maxdata)) { 2159219089Spjd int64_t loss = 0; 2160219089Spjd 2161219089Spjd verify_ok = B_TRUE; 2162219089Spjd spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2163219089Spjd spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2164219089Spjd 2165219089Spjd loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2166219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 2167219089Spjd ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 2168219089Spjd VERIFY(nvlist_add_int64(spa->spa_load_info, 2169219089Spjd ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 2170219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 2171219089Spjd ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 2172219089Spjd } else { 2173219089Spjd spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2174219089Spjd } 2175219089Spjd 2176332531Smav if (spa_load_verify_dryrun) 2177332531Smav return (0); 2178332531Smav 2179219089Spjd if (error) { 2180219089Spjd if (error != ENXIO && error != EIO) 2181249195Smm error = SET_ERROR(EIO); 2182219089Spjd return (error); 2183219089Spjd } 2184219089Spjd 2185219089Spjd return (verify_ok ? 0 : EIO); 2186219089Spjd} 2187219089Spjd 2188185029Spjd/* 2189219089Spjd * Find a value in the pool props object. 2190168404Spjd */ 2191219089Spjdstatic void 2192219089Spjdspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2193219089Spjd{ 2194219089Spjd (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2195219089Spjd zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2196219089Spjd} 2197219089Spjd 2198219089Spjd/* 2199219089Spjd * Find a value in the pool directory object. 
2200219089Spjd */ 2201168404Spjdstatic int 2202332530Smavspa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2203168404Spjd{ 2204332530Smav int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2205332530Smav name, sizeof (uint64_t), 1, val); 2206332530Smav 2207332530Smav if (error != 0 && (error != ENOENT || log_enoent)) { 2208332530Smav spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2209332530Smav "[error=%d]", name, error); 2210332530Smav } 2211332530Smav 2212332530Smav return (error); 2213219089Spjd} 2214168404Spjd 2215219089Spjdstatic int 2216219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2217219089Spjd{ 2218219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2219332525Smav return (SET_ERROR(err)); 2220219089Spjd} 2221219089Spjd 2222332537Smavstatic void 2223332537Smavspa_spawn_aux_threads(spa_t *spa) 2224332537Smav{ 2225332537Smav ASSERT(spa_writeable(spa)); 2226332537Smav 2227332537Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2228332537Smav 2229332537Smav spa_start_indirect_condensing_thread(spa); 2230332547Smav 2231332547Smav ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 2232332547Smav spa->spa_checkpoint_discard_zthr = 2233332547Smav zthr_create(spa_checkpoint_discard_thread_check, 2234332547Smav spa_checkpoint_discard_thread, spa); 2235332537Smav} 2236332537Smav 2237219089Spjd/* 2238219089Spjd * Fix up config after a partly-completed split. This is done with the 2239219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2240219089Spjd * pool have that entry in their config, but only the splitting one contains 2241219089Spjd * a list of all the guids of the vdevs that are being split off. 2242219089Spjd * 2243219089Spjd * This function determines what to do with that list: either rejoin 2244219089Spjd * all the disks to the pool, or complete the splitting process. 
To attempt 2245219089Spjd * the rejoin, each disk that is offlined is marked online again, and 2246219089Spjd * we do a reopen() call. If the vdev label for every disk that was 2247219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2248219089Spjd * then we call vdev_split() on each disk, and complete the split. 2249219089Spjd * 2250219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 2251219089Spjd * the original pool. 2252219089Spjd */ 2253219089Spjdstatic void 2254219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 2255219089Spjd{ 2256219089Spjd uint_t extracted; 2257219089Spjd uint64_t *glist; 2258219089Spjd uint_t i, gcount; 2259219089Spjd nvlist_t *nvl; 2260219089Spjd vdev_t **vd; 2261219089Spjd boolean_t attempt_reopen; 2262219089Spjd 2263219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2264219089Spjd return; 2265219089Spjd 2266219089Spjd /* check that the config is complete */ 2267219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2268219089Spjd &glist, &gcount) != 0) 2269219089Spjd return; 2270219089Spjd 2271219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2272219089Spjd 2273219089Spjd /* attempt to online all the vdevs & validate */ 2274219089Spjd attempt_reopen = B_TRUE; 2275219089Spjd for (i = 0; i < gcount; i++) { 2276219089Spjd if (glist[i] == 0) /* vdev is hole */ 2277219089Spjd continue; 2278219089Spjd 2279219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2280219089Spjd if (vd[i] == NULL) { 2281219089Spjd /* 2282219089Spjd * Don't bother attempting to reopen the disks; 2283219089Spjd * just do the split. 
2284219089Spjd */ 2285219089Spjd attempt_reopen = B_FALSE; 2286219089Spjd } else { 2287219089Spjd /* attempt to re-online it */ 2288219089Spjd vd[i]->vdev_offline = B_FALSE; 2289219089Spjd } 2290219089Spjd } 2291219089Spjd 2292219089Spjd if (attempt_reopen) { 2293219089Spjd vdev_reopen(spa->spa_root_vdev); 2294219089Spjd 2295219089Spjd /* check each device to see what state it's in */ 2296219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 2297219089Spjd if (vd[i] != NULL && 2298219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2299219089Spjd break; 2300219089Spjd ++extracted; 2301219089Spjd } 2302219089Spjd } 2303219089Spjd 2304209962Smm /* 2305219089Spjd * If every disk has been moved to the new pool, or if we never 2306219089Spjd * even attempted to look at them, then we split them off for 2307219089Spjd * good. 2308209962Smm */ 2309219089Spjd if (!attempt_reopen || gcount == extracted) { 2310219089Spjd for (i = 0; i < gcount; i++) 2311219089Spjd if (vd[i] != NULL) 2312219089Spjd vdev_split(vd[i]); 2313219089Spjd vdev_reopen(spa->spa_root_vdev); 2314219089Spjd } 2315209962Smm 2316219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 2317219089Spjd} 2318185029Spjd 2319219089Spjdstatic int 2320332536Smavspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 2321219089Spjd{ 2322219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 2323219089Spjd int error; 2324168404Spjd 2325332536Smav spa->spa_load_state = state; 2326168404Spjd 2327332536Smav gethrestime(&spa->spa_loaded_ts); 2328332547Smav error = spa_load_impl(spa, type, &ereport); 2329228103Smm 2330168404Spjd /* 2331286575Smav * Don't count references from objsets that are already closed 2332286575Smav * and are making their way through the eviction process. 
2333286575Smav */ 2334286575Smav spa_evicting_os_wait(spa); 2335219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 2336219089Spjd if (error) { 2337219089Spjd if (error != EEXIST) { 2338219089Spjd spa->spa_loaded_ts.tv_sec = 0; 2339219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 2340219089Spjd } 2341219089Spjd if (error != EBADF) { 2342219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2343219089Spjd } 2344219089Spjd } 2345219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2346219089Spjd spa->spa_ena = 0; 2347168404Spjd 2348219089Spjd return (error); 2349219089Spjd} 2350219089Spjd 2351219089Spjd/* 2352299441Smav * Count the number of per-vdev ZAPs associated with all of the vdevs in the 2353299441Smav * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 2354299441Smav * spa's per-vdev ZAP list. 2355299441Smav */ 2356299441Smavstatic uint64_t 2357299441Smavvdev_count_verify_zaps(vdev_t *vd) 2358299441Smav{ 2359299441Smav spa_t *spa = vd->vdev_spa; 2360299441Smav uint64_t total = 0; 2361299441Smav if (vd->vdev_top_zap != 0) { 2362299441Smav total++; 2363299441Smav ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2364299441Smav spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 2365299441Smav } 2366299441Smav if (vd->vdev_leaf_zap != 0) { 2367299441Smav total++; 2368299441Smav ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2369299441Smav spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 2370299441Smav } 2371299441Smav 2372299441Smav for (uint64_t i = 0; i < vd->vdev_children; i++) { 2373299441Smav total += vdev_count_verify_zaps(vd->vdev_child[i]); 2374299441Smav } 2375299441Smav 2376299441Smav return (total); 2377299441Smav} 2378299441Smav 2379219089Spjdstatic int 2380332536Smavspa_verify_host(spa_t *spa, nvlist_t *mos_config) 2381219089Spjd{ 2382332536Smav uint64_t hostid; 2383332536Smav char *hostname; 2384332536Smav uint64_t myhostid = 0; 2385332536Smav 2386332536Smav if (!spa_is_root(spa) && 
nvlist_lookup_uint64(mos_config, 2387332536Smav ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2388332536Smav hostname = fnvlist_lookup_string(mos_config, 2389332536Smav ZPOOL_CONFIG_HOSTNAME); 2390332536Smav 2391332536Smav myhostid = zone_get_hostid(NULL); 2392332536Smav 2393332536Smav if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 2394332536Smav cmn_err(CE_WARN, "pool '%s' could not be " 2395332536Smav "loaded as it was last accessed by " 2396332536Smav "another system (host: %s hostid: 0x%llx). " 2397332536Smav "See: http://illumos.org/msg/ZFS-8000-EY", 2398332536Smav spa_name(spa), hostname, (u_longlong_t)hostid); 2399332536Smav spa_load_failed(spa, "hostid verification failed: pool " 2400332536Smav "last accessed by host: %s (hostid: 0x%llx)", 2401332536Smav hostname, (u_longlong_t)hostid); 2402332536Smav return (SET_ERROR(EBADF)); 2403332536Smav } 2404332536Smav } 2405332536Smav 2406332536Smav return (0); 2407332536Smav} 2408332536Smav 2409332536Smavstatic int 2410332536Smavspa_ld_parse_config(spa_t *spa, spa_import_type_t type) 2411332536Smav{ 2412219089Spjd int error = 0; 2413332536Smav nvlist_t *nvtree, *nvl, *config = spa->spa_config; 2414332529Smav int parse; 2415219089Spjd vdev_t *rvd; 2416332536Smav uint64_t pool_guid; 2417332536Smav char *comment; 2418219089Spjd 2419332536Smav /* 2420332536Smav * Versioning wasn't explicitly added to the label until later, so if 2421332536Smav * it's not present treat it as the initial version. 
2422332536Smav */ 2423332536Smav if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2424332536Smav &spa->spa_ubsync.ub_version) != 0) 2425332536Smav spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2426332536Smav 2427332536Smav if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 2428332536Smav spa_load_failed(spa, "invalid config provided: '%s' missing", 2429332536Smav ZPOOL_CONFIG_POOL_GUID); 2430332536Smav return (SET_ERROR(EINVAL)); 2431332536Smav } 2432332536Smav 2433332547Smav /* 2434332547Smav * If we are doing an import, ensure that the pool is not already 2435332547Smav * imported by checking if its pool guid already exists in the 2436332547Smav * spa namespace. 2437332547Smav * 2438332547Smav * The only case that we allow an already imported pool to be 2439332547Smav * imported again, is when the pool is checkpointed and we want to 2440332547Smav * look at its checkpointed state from userland tools like zdb. 2441332547Smav */ 2442332547Smav#ifdef _KERNEL 2443332547Smav if ((spa->spa_load_state == SPA_LOAD_IMPORT || 2444332547Smav spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 2445332547Smav spa_guid_exists(pool_guid, 0)) { 2446332547Smav#else 2447332547Smav if ((spa->spa_load_state == SPA_LOAD_IMPORT || 2448332547Smav spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 2449332547Smav spa_guid_exists(pool_guid, 0) && 2450332547Smav !spa_importing_readonly_checkpoint(spa)) { 2451332547Smav#endif 2452332536Smav spa_load_failed(spa, "a pool with guid %llu is already open", 2453332536Smav (u_longlong_t)pool_guid); 2454332536Smav return (SET_ERROR(EEXIST)); 2455332536Smav } 2456332536Smav 2457332536Smav spa->spa_config_guid = pool_guid; 2458332536Smav 2459332536Smav nvlist_free(spa->spa_load_info); 2460332536Smav spa->spa_load_info = fnvlist_alloc(); 2461332536Smav 2462332536Smav ASSERT(spa->spa_comment == NULL); 2463332536Smav if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2464332536Smav spa->spa_comment = 
spa_strdup(comment); 2465332536Smav 2466332536Smav (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2467332536Smav &spa->spa_config_txg); 2468332536Smav 2469332536Smav if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 2470332536Smav spa->spa_config_splitting = fnvlist_dup(nvl); 2471332536Smav 2472332530Smav if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 2473332530Smav spa_load_failed(spa, "invalid config provided: '%s' missing", 2474332530Smav ZPOOL_CONFIG_VDEV_TREE); 2475249195Smm return (SET_ERROR(EINVAL)); 2476332530Smav } 2477219089Spjd 2478219089Spjd /* 2479209962Smm * Create "The Godfather" zio to hold all async IOs 2480209962Smm */ 2481272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2482272598Sdelphij KM_SLEEP); 2483272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 2484272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2485272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2486272598Sdelphij ZIO_FLAG_GODFATHER); 2487272598Sdelphij } 2488209962Smm 2489209962Smm /* 2490168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2491168404Spjd * value that will be returned by spa_version() since parsing the 2492168404Spjd * configuration requires knowing the version number. 2493168404Spjd */ 2494185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2495332536Smav parse = (type == SPA_IMPORT_EXISTING ? 
2496332536Smav VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2497332529Smav error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 2498185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2499168404Spjd 2500332530Smav if (error != 0) { 2501332530Smav spa_load_failed(spa, "unable to parse config [error=%d]", 2502332530Smav error); 2503219089Spjd return (error); 2504332530Smav } 2505168404Spjd 2506168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2507284304Savg ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2508284304Savg ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2509168404Spjd 2510219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2511219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2512219089Spjd } 2513219089Spjd 2514332529Smav return (0); 2515332529Smav} 2516332529Smav 2517332536Smav/* 2518332536Smav * Recursively open all vdevs in the vdev tree. This function is called twice: 2519332536Smav * first with the untrusted config, then with the trusted config. 2520332536Smav */ 2521332529Smavstatic int 2522332529Smavspa_ld_open_vdevs(spa_t *spa) 2523332529Smav{ 2524332529Smav int error = 0; 2525332529Smav 2526332536Smav /* 2527332536Smav * spa_missing_tvds_allowed defines how many top-level vdevs can be 2528332536Smav * missing/unopenable for the root vdev to be still considered openable. 
2529332536Smav */ 2530332536Smav if (spa->spa_trust_config) { 2531332536Smav spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 2532332536Smav } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 2533332536Smav spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 2534332536Smav } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 2535332536Smav spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 2536332536Smav } else { 2537332536Smav spa->spa_missing_tvds_allowed = 0; 2538332536Smav } 2539332536Smav 2540332536Smav spa->spa_missing_tvds_allowed = 2541332536Smav MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 2542332536Smav 2543185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2544332529Smav error = vdev_open(spa->spa_root_vdev); 2545185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2546332536Smav 2547332536Smav if (spa->spa_missing_tvds != 0) { 2548332536Smav spa_load_note(spa, "vdev tree has %lld missing top-level " 2549332536Smav "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 2550332536Smav if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) { 2551332536Smav /* 2552332536Smav * Although theoretically we could allow users to open 2553332536Smav * incomplete pools in RW mode, we'd need to add a lot 2554332536Smav * of extra logic (e.g. adjust pool space to account 2555332536Smav * for missing vdevs). 2556332536Smav * This limitation also prevents users from accidentally 2557332536Smav * opening the pool in RW mode during data recovery and 2558332536Smav * damaging it further. 
2559332536Smav */ 2560332536Smav spa_load_note(spa, "pools with missing top-level " 2561332536Smav "vdevs can only be opened in read-only mode."); 2562332536Smav error = SET_ERROR(ENXIO); 2563332536Smav } else { 2564332536Smav spa_load_note(spa, "current settings allow for maximum " 2565332536Smav "%lld missing top-level vdevs at this stage.", 2566332536Smav (u_longlong_t)spa->spa_missing_tvds_allowed); 2567332536Smav } 2568332536Smav } 2569332530Smav if (error != 0) { 2570332530Smav spa_load_failed(spa, "unable to open vdev tree [error=%d]", 2571332530Smav error); 2572332530Smav } 2573332536Smav if (spa->spa_missing_tvds != 0 || error != 0) 2574332536Smav vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 2575168404Spjd 2576332529Smav return (error); 2577332529Smav} 2578332529Smav 2579332536Smav/* 2580332536Smav * We need to validate the vdev labels against the configuration that 2581332536Smav * we have in hand. This function is called twice: first with an untrusted 2582332536Smav * config, then with a trusted config. The validation is more strict when the 2583332536Smav * config is trusted. 
2584332536Smav */ 2585332529Smavstatic int 2586332536Smavspa_ld_validate_vdevs(spa_t *spa) 2587332529Smav{ 2588332529Smav int error = 0; 2589332529Smav vdev_t *rvd = spa->spa_root_vdev; 2590332529Smav 2591332536Smav spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2592332536Smav error = vdev_validate(rvd); 2593332536Smav spa_config_exit(spa, SCL_ALL, FTAG); 2594168404Spjd 2595332536Smav if (error != 0) { 2596332536Smav spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 2597332536Smav return (error); 2598332536Smav } 2599219089Spjd 2600332536Smav if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 2601332536Smav spa_load_failed(spa, "cannot open vdev tree after invalidating " 2602332536Smav "some vdevs"); 2603332536Smav vdev_dbgmsg_print_tree(rvd, 2); 2604332536Smav return (SET_ERROR(ENXIO)); 2605168404Spjd } 2606168404Spjd 2607332529Smav return (0); 2608332529Smav} 2609332529Smav 2610332547Smavstatic void 2611332547Smavspa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 2612332547Smav{ 2613332547Smav spa->spa_state = POOL_STATE_ACTIVE; 2614332547Smav spa->spa_ubsync = spa->spa_uberblock; 2615332547Smav spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2616332547Smav TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2617332547Smav spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2618332547Smav spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2619332547Smav spa->spa_claim_max_txg = spa->spa_first_txg; 2620332547Smav spa->spa_prev_software_version = ub->ub_software_version; 2621332547Smav} 2622332547Smav 2623332529Smavstatic int 2624332536Smavspa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 2625332529Smav{ 2626332529Smav vdev_t *rvd = spa->spa_root_vdev; 2627332529Smav nvlist_t *label; 2628332529Smav uberblock_t *ub = &spa->spa_uberblock; 2629332529Smav 2630168404Spjd /* 2631332547Smav * If we are opening the checkpointed state of the pool by 2632332547Smav * rewinding to it, at this point we will have written the 2633332547Smav * checkpointed uberblock to the vdev labels, so searching 2634332547Smav * the labels will find the right uberblock. However, if 2635332547Smav * we are opening the checkpointed state read-only, we have 2636332547Smav * not modified the labels. Therefore, we must ignore the 2637332547Smav * labels and continue using the spa_uberblock that was set 2638332547Smav * by spa_ld_checkpoint_rewind. 2639332547Smav * 2640332547Smav * Note that it would be fine to ignore the labels when 2641332547Smav * rewinding (opening writeable) as well. However, if we 2642332547Smav * crash just after writing the labels, we will end up 2643332547Smav * searching the labels. Doing so in the common case means 2644332547Smav * that this code path gets exercised normally, rather than 2645332547Smav * just in the edge case. 2646332547Smav */ 2647332547Smav if (ub->ub_checkpoint_txg != 0 && 2648332547Smav spa_importing_readonly_checkpoint(spa)) { 2649332547Smav spa_ld_select_uberblock_done(spa, ub); 2650332547Smav return (0); 2651332547Smav } 2652332547Smav 2653332547Smav /* 2654168404Spjd * Find the best uberblock. 2655168404Spjd */ 2656236884Smm vdev_uberblock_load(rvd, ub, &label); 2657168404Spjd 2658168404Spjd /* 2659168404Spjd * If we weren't able to find a single valid uberblock, return failure. 
2660168404Spjd */ 2661236884Smm if (ub->ub_txg == 0) { 2662236884Smm nvlist_free(label); 2663332530Smav spa_load_failed(spa, "no valid uberblock found"); 2664219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2665236884Smm } 2666168404Spjd 2667332530Smav spa_load_note(spa, "using uberblock with txg=%llu", 2668332530Smav (u_longlong_t)ub->ub_txg); 2669332530Smav 2670168404Spjd /* 2671236884Smm * If the pool has an unsupported version we can't open it. 2672168404Spjd */ 2673236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2674236884Smm nvlist_free(label); 2675332530Smav spa_load_failed(spa, "version %llu is not supported", 2676332530Smav (u_longlong_t)ub->ub_version); 2677219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2678236884Smm } 2679168404Spjd 2680236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2681236884Smm nvlist_t *features; 2682236884Smm 2683236884Smm /* 2684236884Smm * If we weren't able to find what's necessary for reading the 2685236884Smm * MOS in the label, return failure. 2686236884Smm */ 2687332530Smav if (label == NULL) { 2688332530Smav spa_load_failed(spa, "label config unavailable"); 2689332530Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2690332530Smav ENXIO)); 2691332530Smav } 2692332530Smav 2693332530Smav if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 2694332530Smav &features) != 0) { 2695236884Smm nvlist_free(label); 2696332530Smav spa_load_failed(spa, "invalid label: '%s' missing", 2697332530Smav ZPOOL_CONFIG_FEATURES_FOR_READ); 2698236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2699236884Smm ENXIO)); 2700236884Smm } 2701236884Smm 2702236884Smm /* 2703236884Smm * Update our in-core representation with the definitive values 2704236884Smm * from the label. 
2705236884Smm */ 2706236884Smm nvlist_free(spa->spa_label_features); 2707236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2708236884Smm } 2709236884Smm 2710236884Smm nvlist_free(label); 2711236884Smm 2712168404Spjd /* 2713236884Smm * Look through entries in the label nvlist's features_for_read. If 2714236884Smm * there is a feature listed there which we don't understand then we 2715236884Smm * cannot open a pool. 2716236884Smm */ 2717236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2718236884Smm nvlist_t *unsup_feat; 2719236884Smm 2720236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2721236884Smm 0); 2722236884Smm 2723236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2724236884Smm NULL); nvp != NULL; 2725236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2726236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2727236884Smm VERIFY(nvlist_add_string(unsup_feat, 2728236884Smm nvpair_name(nvp), "") == 0); 2729236884Smm } 2730236884Smm } 2731236884Smm 2732236884Smm if (!nvlist_empty(unsup_feat)) { 2733236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2734236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2735236884Smm nvlist_free(unsup_feat); 2736332530Smav spa_load_failed(spa, "some features are unsupported"); 2737236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2738236884Smm ENOTSUP)); 2739236884Smm } 2740236884Smm 2741236884Smm nvlist_free(unsup_feat); 2742236884Smm } 2743236884Smm 2744219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2745219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2746332536Smav spa_try_repair(spa, spa->spa_config); 2747219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2748219089Spjd nvlist_free(spa->spa_config_splitting); 2749219089Spjd spa->spa_config_splitting = NULL; 2750168404Spjd } 2751168404Spjd 2752168404Spjd /* 2753168404Spjd * Initialize internal SPA structures. 
2754168404Spjd */ 2755332547Smav spa_ld_select_uberblock_done(spa, ub); 2756219089Spjd 2757332529Smav return (0); 2758332529Smav} 2759332525Smav 2760332529Smavstatic int 2761332529Smavspa_ld_open_rootbp(spa_t *spa) 2762332529Smav{ 2763332529Smav int error = 0; 2764332529Smav vdev_t *rvd = spa->spa_root_vdev; 2765332529Smav 2766236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2767332530Smav if (error != 0) { 2768332530Smav spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 2769332530Smav "[error=%d]", error); 2770219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2771332530Smav } 2772168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2773168404Spjd 2774332529Smav return (0); 2775332529Smav} 2776332529Smav 2777332529Smavstatic int 2778332547Smavspa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 2779332536Smav boolean_t reloading) 2780332529Smav{ 2781332536Smav vdev_t *mrvd, *rvd = spa->spa_root_vdev; 2782332536Smav nvlist_t *nv, *mos_config, *policy; 2783332536Smav int error = 0, copy_error; 2784332536Smav uint64_t healthy_tvds, healthy_tvds_mos; 2785332536Smav uint64_t mos_config_txg; 2786332529Smav 2787332530Smav if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 2788332530Smav != 0) 2789219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2790168404Spjd 2791332525Smav /* 2792332536Smav * If we're assembling a pool from a split, the config provided is 2793332536Smav * already trusted so there is nothing to do. 
2794332525Smav */ 2795332536Smav if (type == SPA_IMPORT_ASSEMBLE) 2796332536Smav return (0); 2797332525Smav 2798332536Smav healthy_tvds = spa_healthy_core_tvds(spa); 2799332536Smav 2800332536Smav if (load_nvlist(spa, spa->spa_config_object, &mos_config) 2801332536Smav != 0) { 2802332536Smav spa_load_failed(spa, "unable to retrieve MOS config"); 2803332536Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2804332536Smav } 2805332536Smav 2806332536Smav /* 2807332536Smav * If we are doing an open, pool owner wasn't verified yet, thus do 2808332536Smav * the verification here. 2809332536Smav */ 2810332536Smav if (spa->spa_load_state == SPA_LOAD_OPEN) { 2811332536Smav error = spa_verify_host(spa, mos_config); 2812332536Smav if (error != 0) { 2813332525Smav nvlist_free(mos_config); 2814332536Smav return (error); 2815332525Smav } 2816332536Smav } 2817332525Smav 2818332536Smav nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 2819332536Smav 2820332536Smav spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2821332536Smav 2822332536Smav /* 2823332536Smav * Build a new vdev tree from the trusted config 2824332536Smav */ 2825332536Smav VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 2826332536Smav 2827332536Smav /* 2828332536Smav * Vdev paths in the MOS may be obsolete. If the untrusted config was 2829332536Smav * obtained by scanning /dev/dsk, then it will have the right vdev 2830332536Smav * paths. We update the trusted MOS config with this information. 2831332536Smav * We first try to copy the paths with vdev_copy_path_strict, which 2832332536Smav * succeeds only when both configs have exactly the same vdev tree. 2833332536Smav * If that fails, we fall back to a more flexible method that has a 2834332536Smav * best effort policy. 
2835332536Smav */ 2836332536Smav copy_error = vdev_copy_path_strict(rvd, mrvd); 2837332536Smav if (copy_error != 0 || spa_load_print_vdev_tree) { 2838332536Smav spa_load_note(spa, "provided vdev tree:"); 2839332536Smav vdev_dbgmsg_print_tree(rvd, 2); 2840332536Smav spa_load_note(spa, "MOS vdev tree:"); 2841332536Smav vdev_dbgmsg_print_tree(mrvd, 2); 2842332536Smav } 2843332536Smav if (copy_error != 0) { 2844332536Smav spa_load_note(spa, "vdev_copy_path_strict failed, falling " 2845332536Smav "back to vdev_copy_path_relaxed"); 2846332536Smav vdev_copy_path_relaxed(rvd, mrvd); 2847332536Smav } 2848332536Smav 2849332536Smav vdev_close(rvd); 2850332536Smav vdev_free(rvd); 2851332536Smav spa->spa_root_vdev = mrvd; 2852332536Smav rvd = mrvd; 2853332536Smav spa_config_exit(spa, SCL_ALL, FTAG); 2854332536Smav 2855332536Smav /* 2856332536Smav * We will use spa_config if we decide to reload the spa or if spa_load 2857332536Smav * fails and we rewind. We must thus regenerate the config using the 2858332550Smav * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 2859332550Smav * pass settings on how to load the pool and is not stored in the MOS. 2860332550Smav * We copy it over to our new, trusted config. 
2861332536Smav */ 2862332536Smav mos_config_txg = fnvlist_lookup_uint64(mos_config, 2863332536Smav ZPOOL_CONFIG_POOL_TXG); 2864332536Smav nvlist_free(mos_config); 2865332536Smav mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 2866332550Smav if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 2867332536Smav &policy) == 0) 2868332550Smav fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 2869332536Smav spa_config_set(spa, mos_config); 2870332536Smav spa->spa_config_source = SPA_CONFIG_SRC_MOS; 2871332536Smav 2872332536Smav /* 2873332536Smav * Now that we got the config from the MOS, we should be more strict 2874332536Smav * in checking blkptrs and can make assumptions about the consistency 2875332536Smav * of the vdev tree. spa_trust_config must be set to true before opening 2876332536Smav * vdevs in order for them to be writeable. 2877332536Smav */ 2878332536Smav spa->spa_trust_config = B_TRUE; 2879332536Smav 2880332536Smav /* 2881332536Smav * Open and validate the new vdev tree 2882332536Smav */ 2883332536Smav error = spa_ld_open_vdevs(spa); 2884332536Smav if (error != 0) 2885332536Smav return (error); 2886332536Smav 2887332536Smav error = spa_ld_validate_vdevs(spa); 2888332536Smav if (error != 0) 2889332536Smav return (error); 2890332536Smav 2891332536Smav if (copy_error != 0 || spa_load_print_vdev_tree) { 2892332536Smav spa_load_note(spa, "final vdev tree:"); 2893332536Smav vdev_dbgmsg_print_tree(rvd, 2); 2894332536Smav } 2895332536Smav 2896332536Smav if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 2897332536Smav !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 2898332525Smav /* 2899332536Smav * Sanity check to make sure that we are indeed loading the 2900332536Smav * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 2901332536Smav * in the config provided and they happened to be the only ones 2902332536Smav * to have the latest uberblock, we could involuntarily perform 2903332536Smav * an extreme rewind. 
2904332525Smav */ 2905332536Smav healthy_tvds_mos = spa_healthy_core_tvds(spa); 2906332536Smav if (healthy_tvds_mos - healthy_tvds >= 2907332536Smav SPA_SYNC_MIN_VDEVS) { 2908332536Smav spa_load_note(spa, "config provided misses too many " 2909332536Smav "top-level vdevs compared to MOS (%lld vs %lld). ", 2910332536Smav (u_longlong_t)healthy_tvds, 2911332536Smav (u_longlong_t)healthy_tvds_mos); 2912332536Smav spa_load_note(spa, "vdev tree:"); 2913332536Smav vdev_dbgmsg_print_tree(rvd, 2); 2914332536Smav if (reloading) { 2915332536Smav spa_load_failed(spa, "config was already " 2916332536Smav "provided from MOS. Aborting."); 2917332536Smav return (spa_vdev_err(rvd, 2918332536Smav VDEV_AUX_CORRUPT_DATA, EIO)); 2919332536Smav } 2920332536Smav spa_load_note(spa, "spa must be reloaded using MOS " 2921332536Smav "config"); 2922332536Smav return (SET_ERROR(EAGAIN)); 2923332530Smav } 2924332525Smav } 2925332525Smav 2926332536Smav error = spa_check_for_missing_logs(spa); 2927332536Smav if (error != 0) 2928332536Smav return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2929332536Smav 2930332536Smav if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 2931332536Smav spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 2932332536Smav "guid sum (%llu != %llu)", 2933332536Smav (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 2934332536Smav (u_longlong_t)rvd->vdev_guid_sum); 2935332536Smav return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2936332536Smav ENXIO)); 2937332536Smav } 2938332536Smav 2939332529Smav return (0); 2940332529Smav} 2941332529Smav 2942332529Smavstatic int 2943332529Smavspa_ld_open_indirect_vdev_metadata(spa_t *spa) 2944332529Smav{ 2945332529Smav int error = 0; 2946332529Smav vdev_t *rvd = spa->spa_root_vdev; 2947332529Smav 2948332525Smav /* 2949332525Smav * Everything that we read before spa_remove_init() must be stored 2950332525Smav * on concreted vdevs. Therefore we do this as early as possible. 
2951332525Smav */ 2952332530Smav error = spa_remove_init(spa); 2953332530Smav if (error != 0) { 2954332530Smav spa_load_failed(spa, "spa_remove_init failed [error=%d]", 2955332530Smav error); 2956332525Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2957332530Smav } 2958332525Smav 2959332529Smav /* 2960332529Smav * Retrieve information needed to condense indirect vdev mappings. 2961332529Smav */ 2962332529Smav error = spa_condense_init(spa); 2963332529Smav if (error != 0) { 2964332530Smav spa_load_failed(spa, "spa_condense_init failed [error=%d]", 2965332530Smav error); 2966332529Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 2967332529Smav } 2968332529Smav 2969332529Smav return (0); 2970332529Smav} 2971332529Smav 2972332529Smavstatic int 2973332530Smavspa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 2974332529Smav{ 2975332529Smav int error = 0; 2976332529Smav vdev_t *rvd = spa->spa_root_vdev; 2977332529Smav 2978236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2979236884Smm boolean_t missing_feat_read = B_FALSE; 2980238926Smm nvlist_t *unsup_feat, *enabled_feat; 2981236884Smm 2982236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2983332530Smav &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 2984236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2985236884Smm } 2986236884Smm 2987236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2988332530Smav &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 2989236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2990236884Smm } 2991236884Smm 2992236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2993332530Smav &spa->spa_feat_desc_obj, B_TRUE) != 0) { 2994236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2995236884Smm } 2996236884Smm 2997238926Smm enabled_feat = fnvlist_alloc(); 2998238926Smm unsup_feat = fnvlist_alloc(); 2999236884Smm 3000259813Sdelphij if (!spa_features_check(spa, B_FALSE, 3001238926Smm 
unsup_feat, enabled_feat)) 3002236884Smm missing_feat_read = B_TRUE; 3003236884Smm 3004332530Smav if (spa_writeable(spa) || 3005332530Smav spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 3006259813Sdelphij if (!spa_features_check(spa, B_TRUE, 3007238926Smm unsup_feat, enabled_feat)) { 3008332529Smav *missing_feat_writep = B_TRUE; 3009238926Smm } 3010236884Smm } 3011236884Smm 3012238926Smm fnvlist_add_nvlist(spa->spa_load_info, 3013238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 3014238926Smm 3015236884Smm if (!nvlist_empty(unsup_feat)) { 3016238926Smm fnvlist_add_nvlist(spa->spa_load_info, 3017238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 3018236884Smm } 3019236884Smm 3020238926Smm fnvlist_free(enabled_feat); 3021238926Smm fnvlist_free(unsup_feat); 3022236884Smm 3023236884Smm if (!missing_feat_read) { 3024236884Smm fnvlist_add_boolean(spa->spa_load_info, 3025236884Smm ZPOOL_CONFIG_CAN_RDONLY); 3026236884Smm } 3027236884Smm 3028236884Smm /* 3029236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 3030236884Smm * twofold: to determine whether the pool is available for 3031236884Smm * import in read-write mode and (if it is not) whether the 3032236884Smm * pool is available for import in read-only mode. If the pool 3033236884Smm * is available for import in read-write mode, it is displayed 3034236884Smm * as available in userland; if it is not available for import 3035236884Smm * in read-only mode, it is displayed as unavailable in 3036236884Smm * userland. If the pool is available for import in read-only 3037236884Smm * mode but not read-write mode, it is displayed as unavailable 3038236884Smm * in userland with a special note that the pool is actually 3039236884Smm * available for open in read-only mode. 
3040236884Smm * 3041236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 3042236884Smm * missing a feature for write, we must first determine whether 3043236884Smm * the pool can be opened read-only before returning to 3044236884Smm * userland in order to know whether to display the 3045236884Smm * abovementioned note. 3046236884Smm */ 3047332529Smav if (missing_feat_read || (*missing_feat_writep && 3048236884Smm spa_writeable(spa))) { 3049332530Smav spa_load_failed(spa, "pool uses unsupported features"); 3050236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 3051236884Smm ENOTSUP)); 3052236884Smm } 3053260150Sdelphij 3054260150Sdelphij /* 3055260150Sdelphij * Load refcounts for ZFS features from disk into an in-memory 3056260150Sdelphij * cache during SPA initialization. 3057260150Sdelphij */ 3058260150Sdelphij for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 3059260150Sdelphij uint64_t refcount; 3060260150Sdelphij 3061260150Sdelphij error = feature_get_refcount_from_disk(spa, 3062260150Sdelphij &spa_feature_table[i], &refcount); 3063260150Sdelphij if (error == 0) { 3064260150Sdelphij spa->spa_feat_refcount_cache[i] = refcount; 3065260150Sdelphij } else if (error == ENOTSUP) { 3066260150Sdelphij spa->spa_feat_refcount_cache[i] = 3067260150Sdelphij SPA_FEATURE_DISABLED; 3068260150Sdelphij } else { 3069332530Smav spa_load_failed(spa, "error getting refcount " 3070332530Smav "for feature %s [error=%d]", 3071332530Smav spa_feature_table[i].fi_guid, error); 3072260150Sdelphij return (spa_vdev_err(rvd, 3073260150Sdelphij VDEV_AUX_CORRUPT_DATA, EIO)); 3074260150Sdelphij } 3075260150Sdelphij } 3076236884Smm } 3077236884Smm 3078260150Sdelphij if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 3079260150Sdelphij if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 3080332530Smav &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 3081260150Sdelphij return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3082260150Sdelphij } 3083260150Sdelphij 
3084332529Smav return (0); 3085332529Smav} 3086332529Smav 3087332529Smavstatic int 3088332529Smavspa_ld_load_special_directories(spa_t *spa) 3089332529Smav{ 3090332529Smav int error = 0; 3091332529Smav vdev_t *rvd = spa->spa_root_vdev; 3092332529Smav 3093236884Smm spa->spa_is_initializing = B_TRUE; 3094236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 3095236884Smm spa->spa_is_initializing = B_FALSE; 3096332530Smav if (error != 0) { 3097332530Smav spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 3098236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3099332530Smav } 3100236884Smm 3101332529Smav return (0); 3102332529Smav} 3103168404Spjd 3104332529Smavstatic int 3105332529Smavspa_ld_get_props(spa_t *spa) 3106332529Smav{ 3107332529Smav int error = 0; 3108332529Smav uint64_t obj; 3109332529Smav vdev_t *rvd = spa->spa_root_vdev; 3110332529Smav 3111289422Smav /* Grab the secret checksum salt from the MOS. */ 3112289422Smav error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 3113289422Smav DMU_POOL_CHECKSUM_SALT, 1, 3114289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes), 3115289422Smav spa->spa_cksum_salt.zcs_bytes); 3116289422Smav if (error == ENOENT) { 3117289422Smav /* Generate a new salt for subsequent use */ 3118289422Smav (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 3119289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes)); 3120289422Smav } else if (error != 0) { 3121332530Smav spa_load_failed(spa, "unable to retrieve checksum salt from " 3122332530Smav "MOS [error=%d]", error); 3123289422Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3124289422Smav } 3125289422Smav 3126332530Smav if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 3127219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3128219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 3129332530Smav if (error != 0) { 3130332530Smav spa_load_failed(spa, "error opening 
deferred-frees bpobj " 3131332530Smav "[error=%d]", error); 3132219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3133332530Smav } 3134168404Spjd 3135168404Spjd /* 3136168404Spjd * Load the bit that tells us to use the new accounting function 3137168404Spjd * (raid-z deflation). If we have an older pool, this will not 3138168404Spjd * be present. 3139168404Spjd */ 3140332530Smav error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 3141219089Spjd if (error != 0 && error != ENOENT) 3142219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3143168404Spjd 3144219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 3145332530Smav &spa->spa_creation_version, B_FALSE); 3146219089Spjd if (error != 0 && error != ENOENT) 3147219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3148219089Spjd 3149168404Spjd /* 3150168404Spjd * Load the persistent error log. If we have an older pool, this will 3151168404Spjd * not be present. 3152168404Spjd */ 3153332530Smav error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 3154332530Smav B_FALSE); 3155219089Spjd if (error != 0 && error != ENOENT) 3156219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3157168404Spjd 3158219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 3159332530Smav &spa->spa_errlog_scrub, B_FALSE); 3160219089Spjd if (error != 0 && error != ENOENT) 3161219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3162168404Spjd 3163168404Spjd /* 3164168404Spjd * Load the history object. If we have an older pool, this 3165168404Spjd * will not be present. 3166168404Spjd */ 3167332530Smav error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 3168219089Spjd if (error != 0 && error != ENOENT) 3169219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3170168404Spjd 3171168404Spjd /* 3172299441Smav * Load the per-vdev ZAP map. 
If we have an older pool, this will not 3173299441Smav * be present; in this case, defer its creation to a later time to 3174299441Smav * avoid dirtying the MOS this early / out of sync context. See 3175299441Smav * spa_sync_config_object. 3176299441Smav */ 3177299441Smav 3178299441Smav /* The sentinel is only available in the MOS config. */ 3179299441Smav nvlist_t *mos_config; 3180332530Smav if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 3181332530Smav spa_load_failed(spa, "unable to retrieve MOS config"); 3182299441Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3183332530Smav } 3184299441Smav 3185299441Smav error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 3186332530Smav &spa->spa_all_vdev_zaps, B_FALSE); 3187299441Smav 3188321540Smav if (error == ENOENT) { 3189321540Smav VERIFY(!nvlist_exists(mos_config, 3190321540Smav ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 3191321540Smav spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 3192321540Smav ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 3193321540Smav } else if (error != 0) { 3194299441Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3195321540Smav } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 3196299441Smav /* 3197299441Smav * An older version of ZFS overwrote the sentinel value, so 3198299441Smav * we have orphaned per-vdev ZAPs in the MOS. Defer their 3199299441Smav * destruction to later; see spa_sync_config_object. 3200299441Smav */ 3201299441Smav spa->spa_avz_action = AVZ_ACTION_DESTROY; 3202299441Smav /* 3203299441Smav * We're assuming that no vdevs have had their ZAPs created 3204299441Smav * before this. Better be sure of it. 
3205299441Smav */ 3206299441Smav ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 3207299441Smav } 3208299441Smav nvlist_free(mos_config); 3209299441Smav 3210332529Smav spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3211332529Smav 3212332530Smav error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 3213332530Smav B_FALSE); 3214332529Smav if (error && error != ENOENT) 3215332529Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3216332529Smav 3217332529Smav if (error == 0) { 3218332529Smav uint64_t autoreplace; 3219332529Smav 3220332529Smav spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 3221332529Smav spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 3222332529Smav spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 3223332529Smav spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 3224332529Smav spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 3225332529Smav spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 3226332529Smav &spa->spa_dedup_ditto); 3227332529Smav 3228332529Smav spa->spa_autoreplace = (autoreplace != 0); 3229332529Smav } 3230332529Smav 3231332536Smav /* 3232332536Smav * If we are importing a pool with missing top-level vdevs, 3233332536Smav * we enforce that the pool doesn't panic or get suspended on 3234332536Smav * error since the likelihood of missing data is extremely high. 
3235332536Smav */ 3236332536Smav if (spa->spa_missing_tvds > 0 && 3237332536Smav spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 3238332536Smav spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 3239332536Smav spa_load_note(spa, "forcing failmode to 'continue' " 3240332536Smav "as some top level vdevs are missing"); 3241332536Smav spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 3242332536Smav } 3243332536Smav 3244332529Smav return (0); 3245332529Smav} 3246332529Smav /* Load the hot-spare and level 2 ARC (cache) device lists referenced from the pool directory (DMU_POOL_SPARES, DMU_POOL_L2CACHE). ENOENT from spa_dir_prop() is tolerated; any other lookup error, or a load_nvlist() failure, fails the load through spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO). When type is SPA_IMPORT_ASSEMBLE a list that exists is not loaded; its sav_sync flag is set to B_TRUE instead. Returns 0 otherwise. */ 3247332529Smavstatic int 3248332529Smavspa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 3249332529Smav{ 3250332529Smav int error = 0; 3251332529Smav vdev_t *rvd = spa->spa_root_vdev; 3252332529Smav 3253299441Smav /* 3254219089Spjd * If we're assembling the pool from the split-off vdevs of 3255219089Spjd * an existing pool, we don't want to attach the spares & cache 3256219089Spjd * devices. 3257219089Spjd */ 3258219089Spjd 3259219089Spjd /* 3260168404Spjd * Load any hot spares for this pool. 3261168404Spjd */ 3262332530Smav error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 3263332530Smav B_FALSE); 3264219089Spjd if (error != 0 && error != ENOENT) 3265219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3266219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 3267185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 3268185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 3269332530Smav &spa->spa_spares.sav_config) != 0) { 3270332530Smav spa_load_failed(spa, "error loading spares nvlist"); 3271219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3272332530Smav } 3273168404Spjd 3274185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3275168404Spjd spa_load_spares(spa); 3276185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3277219089Spjd } else if (error == 0) { 3278219089Spjd spa->spa_spares.sav_sync = B_TRUE; 3279168404Spjd } 3280168404Spjd 3281185029Spjd /* 3282185029Spjd * Load any level 2 ARC devices for this pool.
3283185029Spjd */ 3284219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 3285332530Smav &spa->spa_l2cache.sav_object, B_FALSE); 3286219089Spjd if (error != 0 && error != ENOENT) 3287219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3288219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 3289185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 3290185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 3291332530Smav &spa->spa_l2cache.sav_config) != 0) { 3292332530Smav spa_load_failed(spa, "error loading l2cache nvlist"); 3293219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3294332530Smav } 3295185029Spjd 3296185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3297185029Spjd spa_load_l2cache(spa); 3298185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3299219089Spjd } else if (error == 0) { 3300219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3301185029Spjd } 3302185029Spjd 3303332529Smav return (0); 3304332529Smav} 3305213197Smm /* Run the autoreplace removal checks (skipped when the 'autoreplace' flag is clear or the load state is SPA_LOAD_TRYIMPORT), load the per-vdev metadata with vdev_load(), and then call vdev_dtl_reassess() on the root vdev under the SCL_ALL writer lock. A vdev_load() failure is logged and passed to spa_vdev_err() with the original error code. Returns 0 otherwise. */ 3306332529Smavstatic int 3307332530Smavspa_ld_load_vdev_metadata(spa_t *spa) 3308332529Smav{ 3309332529Smav int error = 0; 3310332529Smav vdev_t *rvd = spa->spa_root_vdev; 3311185029Spjd 3312168404Spjd /* 3313185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 3314185029Spjd * the ZFS DE that it should not issue any faults for unopenable 3315185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 3316185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 3317185029Spjd * over. 3318185029Spjd */ 3319332530Smav if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 3320185029Spjd spa_check_removed(spa->spa_root_vdev); 3321219089Spjd /* 3322219089Spjd * For the import case, this is done in spa_import(), because 3323219089Spjd * at this point we're using the spare definitions from 3324219089Spjd * the MOS config, not necessarily from the userland config.
3325219089Spjd */ 3326332530Smav if (spa->spa_load_state != SPA_LOAD_IMPORT) { 3327219089Spjd spa_aux_check_removed(&spa->spa_spares); 3328219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 3329219089Spjd } 3330219089Spjd } 3331185029Spjd 3332185029Spjd /* 3333332529Smav * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 3334168404Spjd */ 3335332525Smav error = vdev_load(rvd); 3336332525Smav if (error != 0) { 3337332530Smav spa_load_failed(spa, "vdev_load failed [error=%d]", error); 3338332525Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3339332525Smav } 3340168404Spjd 3341168404Spjd /* 3342332529Smav * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 3343168404Spjd */ 3344185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3345168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 3346185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3347168404Spjd 3348332529Smav return (0); 3349332529Smav} 3350332529Smav /* Load the dedup tables with ddt_load(). On failure the ddt_load() error code is logged, but the value handed to spa_vdev_err() is always EIO (VDEV_AUX_CORRUPT_DATA). Returns 0 on success. */ 3351332529Smavstatic int 3352332529Smavspa_ld_load_dedup_tables(spa_t *spa) 3353332529Smav{ 3354332529Smav int error = 0; 3355332529Smav vdev_t *rvd = spa->spa_root_vdev; 3356332529Smav 3357219089Spjd error = ddt_load(spa); 3358332530Smav if (error != 0) { 3359332530Smav spa_load_failed(spa, "ddt_load failed [error=%d]", error); 3360219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3361332530Smav } 3362219089Spjd 3363332529Smav return (0); 3364332529Smav} 3365219089Spjd /* Check the logs with spa_check_logs(); the check is skipped for SPA_IMPORT_ASSEMBLE and when spa_writeable() is false. If the check reports a problem and spa_missing_tvds is zero, *ereport is set to FM_EREPORT_ZFS_LOG_REPLAY and the load fails through spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO); if top-level vdevs are missing the problem is only noted and the load continues. Returns 0 otherwise. */ 3366332529Smavstatic int 3367332529Smavspa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) 3368332529Smav{ 3369332529Smav vdev_t *rvd = spa->spa_root_vdev; 3370332529Smav 3371332530Smav if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 3372332530Smav boolean_t missing = spa_check_logs(spa); 3373332530Smav if (missing) { 3374332536Smav if (spa->spa_missing_tvds != 0) { 3375332536Smav spa_load_note(spa, "spa_check_logs failed " 3376332536Smav "so dropping the logs"); 3377332536Smav } else {
3378332536Smav *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 3379332536Smav spa_load_failed(spa, "spa_check_logs failed"); 3380332536Smav return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 3381332536Smav ENXIO)); 3382332536Smav } 3383332530Smav } 3384168404Spjd } 3385168404Spjd 3386332529Smav return (0); 3387332529Smav} 3388332529Smav /* Run spa_load_verify() unless the load state is SPA_LOAD_TRYIMPORT. A verification failure is logged and passed to spa_vdev_err() (VDEV_AUX_CORRUPT_DATA) with the original error code. Returns 0 otherwise. */ 3389332529Smavstatic int 3390332530Smavspa_ld_verify_pool_data(spa_t *spa) 3391332529Smav{ 3392332529Smav int error = 0; 3393332529Smav vdev_t *rvd = spa->spa_root_vdev; 3394332529Smav 3395332529Smav /* 3396332529Smav * We've successfully opened the pool, verify that we're ready 3397332529Smav * to start pushing transactions. 3398332529Smav */ 3399332530Smav if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 3400332529Smav error = spa_load_verify(spa); 3401332529Smav if (error != 0) { 3402332530Smav spa_load_failed(spa, "spa_load_verify failed " 3403332530Smav "[error=%d]", error); 3404332529Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3405332529Smav error)); 3406332529Smav } 3407332529Smav } 3408332529Smav 3409332529Smav return (0); 3410332529Smav} 3411332529Smav /* Claim uncommitted intent-log blocks: with spa_claiming set, run zil_claim via dmu_objset_find_dp() starting at dp_root_dir_obj (DS_FIND_CHILDREN) inside a single transaction assigned to spa_first_txg(), then mark the log state SPA_LOG_GOOD. The dmu_objset_find_dp() return value is explicitly discarded. */ 3412332529Smavstatic void 3413332529Smavspa_ld_claim_log_blocks(spa_t *spa) 3414332529Smav{ 3415332529Smav dmu_tx_t *tx; 3416332529Smav dsl_pool_t *dp = spa_get_dsl(spa); 3417332529Smav 3418332529Smav /* 3419332529Smav * Claim log blocks that haven't been committed yet. 3420332529Smav * This must all happen in a single txg. 3421332529Smav * Note: spa_claim_max_txg is updated by spa_claim_notify(), 3422332529Smav * invoked from zil_claim_log_block()'s i/o done callback. 3423332529Smav * Price of rollback is that we abandon the log.
3424332529Smav */ 3425332529Smav spa->spa_claiming = B_TRUE; 3426332529Smav 3427332529Smav tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 3428332529Smav (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 3429332529Smav zil_claim, tx, DS_FIND_CHILDREN); 3430332529Smav dmu_tx_commit(tx); 3431332529Smav 3432332529Smav spa->spa_claiming = B_FALSE; 3433332529Smav 3434332529Smav spa_set_log_state(spa, SPA_LOG_GOOD); 3435332529Smav} 3436332529Smav /* Decide whether the pool config must be rewritten and, if so, queue SPA_ASYNC_CONFIG_UPDATE. config_cache_txg is compared against spa_config_txg to detect a stale cache; a true update_config_cache forces the update, as do the IMPORT/RECOVER load states, a verbatim import, or any top-level vdev whose vdev_ms_array is 0. */ 3437332529Smavstatic void 3438332536Smavspa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 3439332547Smav boolean_t update_config_cache) 3440332529Smav{ 3441332529Smav vdev_t *rvd = spa->spa_root_vdev; 3442332529Smav int need_update = B_FALSE; 3443332529Smav 3444332529Smav /* 3445332529Smav * If the config cache is stale, or we have uninitialized 3446332529Smav * metaslabs (see spa_vdev_add()), then update the config. 3447332529Smav * 3448332529Smav * If this is a verbatim import, trust the current 3449332529Smav * in-core spa_config and update the disk labels. 3450332529Smav */ 3451332547Smav if (update_config_cache || config_cache_txg != spa->spa_config_txg || 3452332530Smav spa->spa_load_state == SPA_LOAD_IMPORT || 3453332530Smav spa->spa_load_state == SPA_LOAD_RECOVER || 3454332529Smav (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 3455332529Smav need_update = B_TRUE; 3456332529Smav 3457332529Smav for (int c = 0; c < rvd->vdev_children; c++) 3458332529Smav if (rvd->vdev_child[c]->vdev_ms_array == 0) 3459332529Smav need_update = B_TRUE; 3460332529Smav 3461332529Smav /* 3462332529Smav * Update the config cache asynchronously in case we're the 3463332529Smav * root pool, in which case the config cache isn't writable yet.
3464332529Smav */ 3465332529Smav if (need_update) 3466332529Smav spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 3467332529Smav} 3468332529Smav /* Tear the pool down (spa_unload() then spa_deactivate()) and re-activate it with the same mode so that the load can be restarted, carrying spa_async_suspended across the cycle. */ 3469332536Smavstatic void 3470332536Smavspa_ld_prepare_for_reload(spa_t *spa) 3471332536Smav{ 3472332536Smav int mode = spa->spa_mode; 3473332536Smav int async_suspended = spa->spa_async_suspended; 3474332536Smav 3475332536Smav spa_unload(spa); 3476332536Smav spa_deactivate(spa); 3477332536Smav spa_activate(spa, mode); 3478332536Smav 3479332536Smav /* 3480332536Smav * We save the value of spa_async_suspended as it gets reset to 0 by 3481332536Smav * spa_unload(). We want to restore it back to the original value before 3482332536Smav * returning as we might be calling spa_async_resume() later. 3483332536Smav */ 3484332536Smav spa->spa_async_suspended = async_suspended; 3485332536Smav} 3486332536Smav /* Look up the checkpointed uberblock (DMU_POOL_ZPOOL_CHECKPOINT) in the MOS pool directory and, if it exists, record its txg in spa_checkpoint_txg and its timestamp in spa_checkpoint_info.sci_timestamp. ENOENT is not an error and returns 0 with nothing recorded; any other zap_lookup() error is returned unchanged. Asserts that spa_namespace_lock is held and that spa_checkpoint_txg is still 0 on entry. */ 3487332529Smavstatic int 3488332547Smavspa_ld_read_checkpoint_txg(spa_t *spa) 3489332529Smav{ 3490332547Smav uberblock_t checkpoint; 3491332529Smav int error = 0; 3492332529Smav 3493332547Smav ASSERT0(spa->spa_checkpoint_txg); 3494332530Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3495332547Smav 3496332547Smav error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 3497332547Smav DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 3498332547Smav sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 3499332547Smav 3500332547Smav if (error == ENOENT) 3501332547Smav return (0); 3502332547Smav 3503332547Smav if (error != 0) 3504332547Smav return (error); 3505332547Smav 3506332547Smav ASSERT3U(checkpoint.ub_txg, !=, 0); 3507332547Smav ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 3508332547Smav ASSERT3U(checkpoint.ub_timestamp, !=, 0); 3509332547Smav spa->spa_checkpoint_txg = checkpoint.ub_txg; 3510332547Smav spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 3511332547Smav 3512332547Smav return (0); 3513332547Smav} 3514332547Smav /* First stage of a load: parse the provided config into a vdev tree, open the vdevs, validate their labels (skipped for SPA_IMPORT_ASSEMBLE), select an uberblock, and open the MOS root block pointer. Returns the first non-zero error from any of those steps, or 0. Asserts that spa_namespace_lock is held. */ 3515332547Smavstatic int 3516332547Smavspa_ld_mos_init(spa_t *spa,
spa_import_type_t type) 3517332547Smav{ 3518332547Smav int error = 0; 3519332547Smav 3520332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3521332536Smav ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 3522332530Smav 3523332529Smav /* 3524332536Smav * Never trust the config that is provided unless we are assembling 3525332536Smav * a pool following a split. 3526332536Smav * This means don't trust blkptrs and the vdev tree in general. This 3527332536Smav * also effectively puts the spa in read-only mode since 3528332536Smav * spa_writeable() checks for spa_trust_config to be true. 3529332536Smav * We will later load a trusted config from the MOS. 3530332529Smav */ 3531332536Smav if (type != SPA_IMPORT_ASSEMBLE) 3532332536Smav spa->spa_trust_config = B_FALSE; 3533332529Smav 3534332529Smav /* 3535332529Smav * Parse the config provided to create a vdev tree. 3536332529Smav */ 3537332536Smav error = spa_ld_parse_config(spa, type); 3538332529Smav if (error != 0) 3539332529Smav return (error); 3540332529Smav 3541332529Smav /* 3542332529Smav * Now that we have the vdev tree, try to open each vdev. This involves 3543332529Smav * opening the underlying physical device, retrieving its geometry and 3544332529Smav * probing the vdev with a dummy I/O. The state of each vdev will be set 3545332529Smav * based on the success of those operations. After this we'll be ready 3546332529Smav * to read from the vdevs. 3547332529Smav */ 3548332529Smav error = spa_ld_open_vdevs(spa); 3549332529Smav if (error != 0) 3550332529Smav return (error); 3551332529Smav 3552332529Smav /* 3553332529Smav * Read the label of each vdev and make sure that the GUIDs stored 3554332529Smav * there match the GUIDs in the config provided. 3555332536Smav * If we're assembling a new pool that's been split off from an 3556332536Smav * existing pool, the labels haven't yet been updated so we skip 3557332536Smav * validation for now.
3558332529Smav */ 3559332536Smav if (type != SPA_IMPORT_ASSEMBLE) { 3560332536Smav error = spa_ld_validate_vdevs(spa); 3561332536Smav if (error != 0) 3562332536Smav return (error); 3563332536Smav } 3564332529Smav 3565332529Smav /* 3566332547Smav * Read all vdev labels to find the best uberblock (i.e. latest, 3567332547Smav * unless spa_load_max_txg is set) and store it in spa_uberblock. We 3568332547Smav * get the list of features required to read blkptrs in the MOS from 3569332547Smav * the vdev label with the best uberblock and verify that our version 3570332547Smav * of zfs supports them all. 3571332529Smav */ 3572332536Smav error = spa_ld_select_uberblock(spa, type); 3573332529Smav if (error != 0) 3574332529Smav return (error); 3575332529Smav 3576332529Smav /* 3577332529Smav * Pass that uberblock to the dsl_pool layer which will open the root 3578332529Smav * blkptr. This blkptr points to the latest version of the MOS and will 3579332529Smav * allow us to read its contents. 3580332529Smav */ 3581332529Smav error = spa_ld_open_rootbp(spa); 3582332529Smav if (error != 0) 3583332529Smav return (error); 3584332529Smav 3585332547Smav return (0); 3586332547Smav} 3587332547Smav /* Replace spa_uberblock with the checkpointed uberblock read from the MOS pool directory, after setting its txg to one past the current uberblock's txg and refreshing its timestamp. When spa_writeable() is true the updated uberblock is also written to the vdev labels with vdev_config_sync(). A missing checkpoint entry (ENOENT) is reported as ZFS_ERR_NO_CHECKPOINT; other errors are returned unchanged. Asserts that spa_namespace_lock is held and that ZFS_IMPORT_CHECKPOINT is set in spa_import_flags. */ 3588332547Smavstatic int 3589332547Smavspa_ld_checkpoint_rewind(spa_t *spa) 3590332547Smav{ 3591332547Smav uberblock_t checkpoint; 3592332547Smav int error = 0; 3593332547Smav 3594332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3595332547Smav ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 3596332547Smav 3597332547Smav error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 3598332547Smav DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 3599332547Smav sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 3600332547Smav 3601332547Smav if (error != 0) { 3602332547Smav spa_load_failed(spa, "unable to retrieve checkpointed " 3603332547Smav "uberblock from the MOS config [error=%d]", error); 3604332547Smav 3605332547Smav if (error == ENOENT) 3606332547Smav error =
ZFS_ERR_NO_CHECKPOINT; 3607332547Smav 3608332547Smav return (error); 3609332547Smav } 3610332547Smav 3611332547Smav ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 3612332547Smav ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 3613332547Smav 3614332529Smav /* 3615332547Smav * We need to update the txg and timestamp of the checkpointed 3616332547Smav * uberblock to be higher than the latest one. This ensures that 3617332547Smav * the checkpointed uberblock is selected if we were to close and 3618332547Smav * reopen the pool right after we've written it in the vdev labels. 3619332547Smav * (also see block comment in vdev_uberblock_compare) 3620332547Smav */ 3621332547Smav checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 3622332547Smav checkpoint.ub_timestamp = gethrestime_sec(); 3623332547Smav 3624332547Smav /* 3625332547Smav * Set current uberblock to be the checkpointed uberblock. 3626332547Smav */ 3627332547Smav spa->spa_uberblock = checkpoint; 3628332547Smav 3629332547Smav /* 3630332547Smav * If we are doing a normal rewind, then the pool is open for 3631332547Smav * writing and we sync the "updated" checkpointed uberblock to 3632332547Smav * disk. Once this is done, we've basically rewound the whole 3633332547Smav * pool and there is no way back. 3634332547Smav * 3635332547Smav * There are cases when we don't want to attempt to sync the 3636332547Smav * checkpointed uberblock to disk because we are opening a 3637332547Smav * pool as read-only. Specifically, verifying the checkpointed 3638332547Smav * state with zdb, and importing the checkpointed state to get 3639332547Smav * a "preview" of its content.
3640332547Smav */ 3641332547Smav if (spa_writeable(spa)) { 3642332547Smav vdev_t *rvd = spa->spa_root_vdev; 3643332547Smav 3644332547Smav spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* Starting from a randomly chosen top-level vdev and wrapping around, collect up to SPA_SYNC_MIN_VDEVS children that are concrete, are not log devices, and have a non-zero vdev_ms_array; these are the vdevs handed to vdev_config_sync(). */ 3645332547Smav vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 3646332547Smav int svdcount = 0; 3647332547Smav int children = rvd->vdev_children; 3648332547Smav int c0 = spa_get_random(children); 3649332547Smav 3650332547Smav for (int c = 0; c < children; c++) { 3651332547Smav vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 3652332547Smav 3653332547Smav /* Stop when revisiting the first vdev */ 3654332547Smav if (c > 0 && svd[0] == vd) 3655332547Smav break; 3656332547Smav 3657332547Smav if (vd->vdev_ms_array == 0 || vd->vdev_islog || 3658332547Smav !vdev_is_concrete(vd)) 3659332547Smav continue; 3660332547Smav 3661332547Smav svd[svdcount++] = vd; 3662332547Smav if (svdcount == SPA_SYNC_MIN_VDEVS) 3663332547Smav break; 3664332547Smav } 3665332547Smav error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 3666332547Smav if (error == 0) 3667332547Smav spa->spa_last_synced_guid = rvd->vdev_guid; 3668332547Smav spa_config_exit(spa, SCL_ALL, FTAG); 3669332547Smav 3670332547Smav if (error != 0) { 3671332547Smav spa_load_failed(spa, "failed to write checkpointed " 3672332547Smav "uberblock to the vdev labels [error=%d]", error); 3673332547Smav return (error); 3674332547Smav } 3675332547Smav } 3676332547Smav 3677332547Smav return (0); 3678332547Smav} 3679332547Smav /* Run spa_ld_mos_init() and then spa_ld_trusted_config(). If the latter returns EAGAIN, the pool is torn down with spa_ld_prepare_for_reload() and both steps are repeated once; in that case *update_config_cache is set to B_TRUE when a non-NULL pointer was supplied. Returns 0, or the first error encountered. */ 3680332547Smavstatic int 3681332547Smavspa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 3682332547Smav boolean_t *update_config_cache) 3683332547Smav{ 3684332547Smav int error; 3685332547Smav 3686332547Smav /* 3687332547Smav * Parse the config for pool, open and validate vdevs, 3688332547Smav * select an uberblock, and use that uberblock to open 3689332547Smav * the MOS.
3690332547Smav */ 3691332547Smav error = spa_ld_mos_init(spa, type); 3692332547Smav if (error != 0) 3693332547Smav return (error); 3694332547Smav 3695332547Smav /* 3696332536Smav * Retrieve the trusted config stored in the MOS and use it to create 3697332536Smav * a new, exact version of the vdev tree, then reopen all vdevs. 3698332529Smav */ 3699332547Smav error = spa_ld_trusted_config(spa, type, B_FALSE); 3700332536Smav if (error == EAGAIN) { /* update_config_cache is an optional out-parameter; it is only written when non-NULL. */ 3701332547Smav if (update_config_cache != NULL) 3702332547Smav *update_config_cache = B_TRUE; 3703332547Smav 3704332536Smav /* 3705332536Smav * Redo the loading process with the trusted config if it is 3706332536Smav * too different from the untrusted config. 3707332536Smav */ 3708332536Smav spa_ld_prepare_for_reload(spa); 3709332547Smav spa_load_note(spa, "RELOADING"); 3710332547Smav error = spa_ld_mos_init(spa, type); 3711332547Smav if (error != 0) 3712332547Smav return (error); 3713332547Smav 3714332547Smav error = spa_ld_trusted_config(spa, type, B_TRUE); 3715332547Smav if (error != 0) 3716332547Smav return (error); 3717332547Smav 3718332536Smav } else if (error != 0) { 3719332529Smav return (error); 3720332536Smav } 3721332529Smav 3722332547Smav return (0); 3723332547Smav} 3724332547Smav 3725332547Smav/* 3726332547Smav * Load an existing storage pool, using the config provided. This config 3727332547Smav * describes which vdevs are part of the pool and is later validated against 3728332547Smav * partial configs present in each vdev's label and an entire copy of the 3729332547Smav * config stored in the MOS.
3730332547Smav */ 3731332547Smavstatic int 3732332547Smavspa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) 3733332547Smav{ 3734332547Smav int error = 0; 3735332547Smav boolean_t missing_feat_write = B_FALSE; 3736332547Smav boolean_t checkpoint_rewind = 3737332547Smav (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 3738332547Smav boolean_t update_config_cache = B_FALSE; 3739332547Smav 3740332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3741332547Smav ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 3742332547Smav 3743332547Smav spa_load_note(spa, "LOADING"); 3744332547Smav 3745332547Smav error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 3746332547Smav if (error != 0) 3747332547Smav return (error); 3748332547Smav 3749332529Smav /* 3750332547Smav * If we are rewinding to the checkpoint then we need to repeat 3751332547Smav * everything we've done so far in this function but this time 3752332547Smav * selecting the checkpointed uberblock and using that to open 3753332547Smav * the MOS. 3754332547Smav */ 3755332547Smav if (checkpoint_rewind) { 3756332547Smav /* 3757332547Smav * If we are rewinding to the checkpoint update config cache 3758332547Smav * anyway. 3759332547Smav */ 3760332547Smav update_config_cache = B_TRUE; 3761332547Smav 3762332547Smav /* 3763332547Smav * Extract the checkpointed uberblock from the current MOS 3764332547Smav * and use this as the pool's uberblock from now on. If the 3765332547Smav * pool is imported as writeable we also write the checkpoint 3766332547Smav * uberblock to the labels, making the rewind permanent. 3767332547Smav */ 3768332547Smav error = spa_ld_checkpoint_rewind(spa); 3769332547Smav if (error != 0) 3770332547Smav return (error); 3771332547Smav 3772332547Smav /* 3773332547Smav * Redo the loading process again with the 3774332547Smav * checkpointed uberblock.
3775332547Smav */ 3776332547Smav spa_ld_prepare_for_reload(spa); 3777332547Smav spa_load_note(spa, "LOADING checkpointed uberblock"); 3778332547Smav error = spa_ld_mos_with_trusted_config(spa, type, NULL); 3779332547Smav if (error != 0) 3780332547Smav return (error); 3781332547Smav } 3782332547Smav 3783332547Smav /* 3784332547Smav * Retrieve the checkpoint txg if the pool has a checkpoint. 3785332547Smav */ 3786332547Smav error = spa_ld_read_checkpoint_txg(spa); 3787332547Smav if (error != 0) 3788332547Smav return (error); 3789332547Smav 3790332547Smav /* 3791332529Smav * Retrieve the mapping of indirect vdevs. Those vdevs were removed 3792332529Smav * from the pool and their contents were re-mapped to other vdevs. Note 3793332529Smav * that everything that we read before this step must have been 3794332529Smav * rewritten on concrete vdevs after the last device removal was 3795332529Smav * initiated. Otherwise we could be reading from indirect vdevs before 3796332529Smav * we have loaded their mappings. 3797332529Smav */ 3798332529Smav error = spa_ld_open_indirect_vdev_metadata(spa); 3799332529Smav if (error != 0) 3800332529Smav return (error); 3801332529Smav 3802332529Smav /* 3803332529Smav * Retrieve the full list of active features from the MOS and check if 3804332529Smav * they are all supported. 3805332529Smav */ 3806332530Smav error = spa_ld_check_features(spa, &missing_feat_write); 3807332529Smav if (error != 0) 3808332529Smav return (error); 3809332529Smav 3810332529Smav /* 3811332529Smav * Load several special directories from the MOS needed by the dsl_pool 3812332529Smav * layer. 3813332529Smav */ 3814332529Smav error = spa_ld_load_special_directories(spa); 3815332529Smav if (error != 0) 3816332529Smav return (error); 3817332529Smav 3818332529Smav /* 3819332529Smav * Retrieve pool properties from the MOS.
3820332529Smav */ 3821332529Smav error = spa_ld_get_props(spa); 3822332529Smav if (error != 0) 3823332529Smav return (error); 3824332529Smav 3825332529Smav /* 3826332529Smav * Retrieve the list of auxiliary devices - cache devices and spares - 3827332529Smav * and open them. 3828332529Smav */ 3829332529Smav error = spa_ld_open_aux_vdevs(spa, type); 3830332529Smav if (error != 0) 3831332529Smav return (error); 3832332529Smav 3833332529Smav /* 3834332529Smav * Load the metadata for all vdevs. Also check if unopenable devices 3835332529Smav * should be autoreplaced. 3836332529Smav */ 3837332530Smav error = spa_ld_load_vdev_metadata(spa); 3838332529Smav if (error != 0) 3839332529Smav return (error); 3840332529Smav 3841332529Smav error = spa_ld_load_dedup_tables(spa); 3842332529Smav if (error != 0) 3843332529Smav return (error); 3844332529Smav 3845332529Smav /* 3846332529Smav * Verify the logs now to make sure we don't have any unexpected errors 3847332529Smav * when we claim log blocks later. 3848332529Smav */ 3849332529Smav error = spa_ld_verify_logs(spa, type, ereport); 3850332529Smav if (error != 0) 3851332529Smav return (error); 3852332529Smav 3853236884Smm if (missing_feat_write) { 3854332536Smav ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 3855236884Smm 3856236884Smm /* 3857236884Smm * At this point, we know that we can open the pool in 3858236884Smm * read-only mode but not read-write mode. We now have enough 3859236884Smm * information and can return to userland. 3860236884Smm */ 3861332529Smav return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 3862332529Smav ENOTSUP)); 3863236884Smm } 3864236884Smm 3865219089Spjd /* 3866332529Smav * Traverse the last txgs to make sure the pool was left off in a safe 3867332529Smav * state. When performing an extreme rewind, we verify the whole pool, 3868332529Smav * which can take a very long time.
3869219089Spjd */ 3870332530Smav error = spa_ld_verify_pool_data(spa); 3871332529Smav if (error != 0) 3872332529Smav return (error); 3873219089Spjd 3874332529Smav /* 3875332529Smav * Calculate the deflated space for the pool. This must be done before 3876332529Smav * we write anything to the pool because we'd need to update the space 3877332529Smav * accounting using the deflated sizes. 3878332529Smav */ 3879332529Smav spa_update_dspace(spa); 3880332529Smav 3881332529Smav /* 3882332529Smav * We have now retrieved all the information we needed to open the 3883332529Smav * pool. If we are importing the pool in read-write mode, a few 3884332529Smav * additional steps must be performed to finish the import. 3885332529Smav */ 3886332536Smav if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 3887219089Spjd spa->spa_load_max_txg == UINT64_MAX)) { 3888332536Smav uint64_t config_cache_txg = spa->spa_config_txg; 3889168404Spjd 3890332536Smav ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 3891332536Smav 3892332525Smav /* 3893332547Smav * In case of a checkpoint rewind, log the original txg 3894332547Smav * of the checkpointed uberblock. 3895332547Smav */ 3896332547Smav if (checkpoint_rewind) { 3897332547Smav spa_history_log_internal(spa, "checkpoint rewind", 3898332547Smav NULL, "rewound state to txg=%llu", 3899332547Smav (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 3900332547Smav } 3901332547Smav 3902332547Smav /* 3903332529Smav * Traverse the ZIL and claim all blocks. 3904332529Smav */ 3905332529Smav spa_ld_claim_log_blocks(spa); 3906209962Smm 3907168404Spjd /* 3908332529Smav * Kick-off the syncing thread. 3909168404Spjd */ 3910168404Spjd spa->spa_sync_on = B_TRUE; 3911168404Spjd txg_sync_start(spa->spa_dsl_pool); 3912168404Spjd 3913168404Spjd /* 3914219089Spjd * Wait for all claims to sync. We sync up to the highest 3915219089Spjd * claimed log block birth time so that claimed log blocks 3916219089Spjd * don't appear to be from the future.
spa_claim_max_txg 3917332529Smav * will have been set for us by ZIL traversal operations 3918332529Smav * performed above. 3919168404Spjd */ 3920219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 3921168404Spjd 3922168404Spjd /* 3923332529Smav * Check if we need to request an update of the config. On the 3924332529Smav * next sync, we would update the config stored in vdev labels 3925332529Smav * and the cachefile (by default /etc/zfs/zpool.cache). 3926168404Spjd */ 3927332536Smav spa_ld_check_for_config_update(spa, config_cache_txg, 3928332547Smav update_config_cache); 3929168404Spjd 3930168404Spjd /* 3931208683Spjd * Check all DTLs to see if anything needs resilvering. 3932208683Spjd */ 3933219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 3934332529Smav vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) 3935208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 3936219089Spjd 3937219089Spjd /* 3938248571Smm * Log the fact that we booted up (so that we can detect if 3939248571Smm * we rebooted in the middle of an operation). 3940248571Smm */ 3941248571Smm spa_history_log_version(spa, "open"); 3942248571Smm 3943248571Smm /* 3944219089Spjd * Delete any inconsistent datasets. 3945219089Spjd */ 3946219089Spjd (void) dmu_objset_find(spa_name(spa), 3947219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 3948219089Spjd 3949219089Spjd /* 3950219089Spjd * Clean up any stale temporary dataset userrefs.
3951219089Spjd */ 3952219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 3953332525Smav 3954332525Smav spa_restart_removal(spa); 3955332525Smav 3956332537Smav spa_spawn_aux_threads(spa); 3957168404Spjd } 3958168404Spjd 3959332530Smav spa_load_note(spa, "LOADED"); 3960332530Smav 3961219089Spjd return (0); 3962219089Spjd} 3963168404Spjd /* Retry the load one txg earlier: unload and deactivate the pool, set spa_load_max_txg to one less than the current uberblock's txg, re-activate the pool with the saved mode, suspend async tasks, and call spa_load() again with SPA_IMPORT_EXISTING. Returns the result of that spa_load() call. */ 3964219089Spjdstatic int 3965332536Smavspa_load_retry(spa_t *spa, spa_load_state_t state) 3966219089Spjd{ 3967219089Spjd int mode = spa->spa_mode; 3968219089Spjd 3969219089Spjd spa_unload(spa); 3970219089Spjd spa_deactivate(spa); 3971219089Spjd 3972268720Sdelphij spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 3973219089Spjd 3974219089Spjd spa_activate(spa, mode); 3975219089Spjd spa_async_suspend(spa); 3976219089Spjd 3977332530Smav spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 3978332530Smav (u_longlong_t)spa->spa_load_max_txg); 3979332530Smav 3980332536Smav return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 3981168404Spjd} 3982168404Spjd 3983236884Smm/* 3984236884Smm * If spa_load() fails this function will try loading prior txg's. If 3985236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 3986236884Smm * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 3987236884Smm * function will not rewind the pool and will return the same error as 3988236884Smm * spa_load().
3989236884Smm */ 3990219089Spjdstatic int 3991332536Smavspa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 3992332536Smav int rewind_flags) 3993219089Spjd{ 3994236884Smm nvlist_t *loadinfo = NULL; 3995219089Spjd nvlist_t *config = NULL; 3996219089Spjd int load_error, rewind_error; 3997219089Spjd uint64_t safe_rewind_txg; 3998219089Spjd uint64_t min_txg; 3999219089Spjd 4000219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 4001219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 4002219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 4003219089Spjd } else { 4004219089Spjd spa->spa_load_max_txg = max_request; 4005268720Sdelphij if (max_request != UINT64_MAX) 4006268720Sdelphij spa->spa_extreme_rewind = B_TRUE; 4007219089Spjd } 4008219089Spjd 4009332536Smav load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 4010219089Spjd if (load_error == 0) 4011219089Spjd return (0); 4012332547Smav if (load_error == ZFS_ERR_NO_CHECKPOINT) { 4013332547Smav /* 4014332547Smav * When attempting checkpoint-rewind on a pool with no 4015332547Smav * checkpoint, we should not attempt to load uberblocks 4016332547Smav * from previous txgs when spa_load fails. 
4017332547Smav */ 4018332547Smav ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4019332547Smav return (load_error); 4020332547Smav } 4021219089Spjd 4022219089Spjd if (spa->spa_root_vdev != NULL) 4023219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4024219089Spjd 4025219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 4026219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 4027219089Spjd 4028219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 4029219089Spjd nvlist_free(config); 4030219089Spjd return (load_error); 4031219089Spjd } 4032219089Spjd 4033236884Smm if (state == SPA_LOAD_RECOVER) { 4034236884Smm /* Price of rolling back is discarding txgs, including log */ 4035219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 4036236884Smm } else { 4037236884Smm /* 4038236884Smm * If we aren't rolling back save the load info from our first 4039236884Smm * import attempt so that we can restore it after attempting 4040236884Smm * to rewind. 4041236884Smm */ 4042236884Smm loadinfo = spa->spa_load_info; 4043236884Smm spa->spa_load_info = fnvlist_alloc(); 4044236884Smm } 4045219089Spjd 4046219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 4047219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 4048219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
4049219089Spjd TXG_INITIAL : safe_rewind_txg; 4050219089Spjd 4051219089Spjd /* 4052219089Spjd * Continue as long as we're finding errors, we're still within 4053219089Spjd * the acceptable rewind range, and we're still finding uberblocks 4054219089Spjd */ 4055219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 4056219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 4057219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 4058219089Spjd spa->spa_extreme_rewind = B_TRUE; 4059332536Smav rewind_error = spa_load_retry(spa, state); 4060219089Spjd } 4061219089Spjd 4062219089Spjd spa->spa_extreme_rewind = B_FALSE; 4063219089Spjd spa->spa_load_max_txg = UINT64_MAX; 4064219089Spjd 4065219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 4066219089Spjd spa_config_set(spa, config); 4067325535Savg else 4068325535Savg nvlist_free(config); 4069219089Spjd 4070236884Smm if (state == SPA_LOAD_RECOVER) { 4071236884Smm ASSERT3P(loadinfo, ==, NULL); 4072236884Smm return (rewind_error); 4073236884Smm } else { 4074236884Smm /* Store the rewind info as part of the initial load info */ 4075236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 4076236884Smm spa->spa_load_info); 4077236884Smm 4078236884Smm /* Restore the initial load info */ 4079236884Smm fnvlist_free(spa->spa_load_info); 4080236884Smm spa->spa_load_info = loadinfo; 4081236884Smm 4082236884Smm return (load_error); 4083236884Smm } 4084219089Spjd} 4085219089Spjd 4086168404Spjd/* 4087168404Spjd * Pool Open/Import 4088168404Spjd * 4089168404Spjd * The import case is identical to an open except that the configuration is sent 4090168404Spjd * down from userland, instead of grabbed from the configuration cache. For the 4091168404Spjd * case of an open, the pool configuration will exist in the 4092185029Spjd * POOL_STATE_UNINITIALIZED state. 
4093168404Spjd * 4094168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 4095168404Spjd * the same time open the pool, without having to keep around the spa_t in some 4096168404Spjd * ambiguous state. 4097168404Spjd */ 4098168404Spjdstatic int 4099219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 4100219089Spjd nvlist_t **config) 4101168404Spjd{ 4102168404Spjd spa_t *spa; 4103219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 4104168404Spjd int error; 4105168404Spjd int locked = B_FALSE; 4106219089Spjd int firstopen = B_FALSE; 4107168404Spjd 4108168404Spjd *spapp = NULL; 4109168404Spjd 4110168404Spjd /* 4111168404Spjd * As disgusting as this is, we need to support recursive calls to this 4112168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 4113168404Spjd * up calling spa_open() again. The real fix is to figure out how to 4114168404Spjd * avoid dsl_dir_open() calling this in the first place. 4115168404Spjd */ 4116168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 4117168404Spjd mutex_enter(&spa_namespace_lock); 4118168404Spjd locked = B_TRUE; 4119168404Spjd } 4120168404Spjd 4121168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 4122168404Spjd if (locked) 4123168404Spjd mutex_exit(&spa_namespace_lock); 4124249195Smm return (SET_ERROR(ENOENT)); 4125168404Spjd } 4126219089Spjd 4127168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 4128332550Smav zpool_load_policy_t policy; 4129168404Spjd 4130219089Spjd firstopen = B_TRUE; 4131219089Spjd 4132332550Smav zpool_get_load_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 4133219089Spjd &policy); 4134332550Smav if (policy.zlp_rewind & ZPOOL_DO_REWIND) 4135219089Spjd state = SPA_LOAD_RECOVER; 4136219089Spjd 4137209962Smm spa_activate(spa, spa_mode_global); 4138168404Spjd 4139219089Spjd if (state != SPA_LOAD_RECOVER) 4140219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4141332536Smav spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 4142168404Spjd 4143332530Smav zfs_dbgmsg("spa_open_common: opening %s", pool); 4144332550Smav error = spa_load_best(spa, state, policy.zlp_txg, 4145332550Smav policy.zlp_rewind); 4146219089Spjd 4147168404Spjd if (error == EBADF) { 4148168404Spjd /* 4149168404Spjd * If vdev_validate() returns failure (indicated by 4150168404Spjd * EBADF), it indicates that one of the vdevs indicates 4151168404Spjd * that the pool has been exported or destroyed. If 4152168404Spjd * this is the case, the config cache is out of sync and 4153168404Spjd * we should remove the pool from the namespace. 4154168404Spjd */ 4155168404Spjd spa_unload(spa); 4156168404Spjd spa_deactivate(spa); 4157332525Smav spa_write_cachefile(spa, B_TRUE, B_TRUE); 4158168404Spjd spa_remove(spa); 4159168404Spjd if (locked) 4160168404Spjd mutex_exit(&spa_namespace_lock); 4161249195Smm return (SET_ERROR(ENOENT)); 4162168404Spjd } 4163168404Spjd 4164168404Spjd if (error) { 4165168404Spjd /* 4166168404Spjd * We can't open the pool, but we still have useful 4167168404Spjd * information: the state of each vdev after the 4168168404Spjd * attempted vdev_open(). Return this to the user. 
4169168404Spjd */ 4170219089Spjd if (config != NULL && spa->spa_config) { 4171219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 4172219089Spjd KM_SLEEP) == 0); 4173219089Spjd VERIFY(nvlist_add_nvlist(*config, 4174219089Spjd ZPOOL_CONFIG_LOAD_INFO, 4175219089Spjd spa->spa_load_info) == 0); 4176219089Spjd } 4177168404Spjd spa_unload(spa); 4178168404Spjd spa_deactivate(spa); 4179219089Spjd spa->spa_last_open_failed = error; 4180168404Spjd if (locked) 4181168404Spjd mutex_exit(&spa_namespace_lock); 4182168404Spjd *spapp = NULL; 4183168404Spjd return (error); 4184168404Spjd } 4185168404Spjd } 4186168404Spjd 4187168404Spjd spa_open_ref(spa, tag); 4188185029Spjd 4189219089Spjd if (config != NULL) 4190219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4191219089Spjd 4192219089Spjd /* 4193219089Spjd * If we've recovered the pool, pass back any information we 4194219089Spjd * gathered while doing the load. 4195219089Spjd */ 4196219089Spjd if (state == SPA_LOAD_RECOVER) { 4197219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 4198219089Spjd spa->spa_load_info) == 0); 4199219089Spjd } 4200219089Spjd 4201219089Spjd if (locked) { 4202219089Spjd spa->spa_last_open_failed = 0; 4203219089Spjd spa->spa_last_ubsync_txg = 0; 4204219089Spjd spa->spa_load_txg = 0; 4205168404Spjd mutex_exit(&spa_namespace_lock); 4206219089Spjd#ifdef __FreeBSD__ 4207219089Spjd#ifdef _KERNEL 4208219089Spjd if (firstopen) 4209249047Savg zvol_create_minors(spa->spa_name); 4210219089Spjd#endif 4211219089Spjd#endif 4212219089Spjd } 4213168404Spjd 4214168404Spjd *spapp = spa; 4215168404Spjd 4216168404Spjd return (0); 4217168404Spjd} 4218168404Spjd 4219168404Spjdint 4220219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 4221219089Spjd nvlist_t **config) 4222219089Spjd{ 4223219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 4224219089Spjd} 4225219089Spjd 4226219089Spjdint 4227168404Spjdspa_open(const char *name, spa_t 
**spapp, void *tag) 4228168404Spjd{ 4229219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 4230168404Spjd} 4231168404Spjd 4232168404Spjd/* 4233168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 4234168404Spjd * preventing it from being exported or destroyed. 4235168404Spjd */ 4236168404Spjdspa_t * 4237168404Spjdspa_inject_addref(char *name) 4238168404Spjd{ 4239168404Spjd spa_t *spa; 4240168404Spjd 4241168404Spjd mutex_enter(&spa_namespace_lock); 4242168404Spjd if ((spa = spa_lookup(name)) == NULL) { 4243168404Spjd mutex_exit(&spa_namespace_lock); 4244168404Spjd return (NULL); 4245168404Spjd } 4246168404Spjd spa->spa_inject_ref++; 4247168404Spjd mutex_exit(&spa_namespace_lock); 4248168404Spjd 4249168404Spjd return (spa); 4250168404Spjd} 4251168404Spjd 4252168404Spjdvoid 4253168404Spjdspa_inject_delref(spa_t *spa) 4254168404Spjd{ 4255168404Spjd mutex_enter(&spa_namespace_lock); 4256168404Spjd spa->spa_inject_ref--; 4257168404Spjd mutex_exit(&spa_namespace_lock); 4258168404Spjd} 4259168404Spjd 4260185029Spjd/* 4261185029Spjd * Add spares device information to the nvlist. 
4262185029Spjd */ 4263168404Spjdstatic void 4264168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 4265168404Spjd{ 4266168404Spjd nvlist_t **spares; 4267168404Spjd uint_t i, nspares; 4268168404Spjd nvlist_t *nvroot; 4269168404Spjd uint64_t guid; 4270168404Spjd vdev_stat_t *vs; 4271168404Spjd uint_t vsc; 4272168404Spjd uint64_t pool; 4273168404Spjd 4274209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 4275209962Smm 4276185029Spjd if (spa->spa_spares.sav_count == 0) 4277168404Spjd return; 4278168404Spjd 4279168404Spjd VERIFY(nvlist_lookup_nvlist(config, 4280168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 4281185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 4282168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 4283168404Spjd if (nspares != 0) { 4284168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 4285168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4286168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 4287168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 4288168404Spjd 4289168404Spjd /* 4290168404Spjd * Go through and find any spares which have since been 4291168404Spjd * repurposed as an active spare. If this is the case, update 4292168404Spjd * their status appropriately. 4293168404Spjd */ 4294168404Spjd for (i = 0; i < nspares; i++) { 4295168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 4296168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 4297185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 4298185029Spjd pool != 0ULL) { 4299168404Spjd VERIFY(nvlist_lookup_uint64_array( 4300219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 4301168404Spjd (uint64_t **)&vs, &vsc) == 0); 4302168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 4303168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 4304168404Spjd } 4305168404Spjd } 4306168404Spjd } 4307168404Spjd} 4308168404Spjd 4309185029Spjd/* 4310185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 
4311185029Spjd */ 4312185029Spjdstatic void 4313185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 4314185029Spjd{ 4315185029Spjd nvlist_t **l2cache; 4316185029Spjd uint_t i, j, nl2cache; 4317185029Spjd nvlist_t *nvroot; 4318185029Spjd uint64_t guid; 4319185029Spjd vdev_t *vd; 4320185029Spjd vdev_stat_t *vs; 4321185029Spjd uint_t vsc; 4322185029Spjd 4323209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 4324209962Smm 4325185029Spjd if (spa->spa_l2cache.sav_count == 0) 4326185029Spjd return; 4327185029Spjd 4328185029Spjd VERIFY(nvlist_lookup_nvlist(config, 4329185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 4330185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 4331185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 4332185029Spjd if (nl2cache != 0) { 4333185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 4334185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4335185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 4336185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 4337185029Spjd 4338185029Spjd /* 4339185029Spjd * Update level 2 cache device stats. 
4340185029Spjd */ 4341185029Spjd 4342185029Spjd for (i = 0; i < nl2cache; i++) { 4343185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 4344185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 4345185029Spjd 4346185029Spjd vd = NULL; 4347185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 4348185029Spjd if (guid == 4349185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 4350185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 4351185029Spjd break; 4352185029Spjd } 4353185029Spjd } 4354185029Spjd ASSERT(vd != NULL); 4355185029Spjd 4356185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 4357219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 4358219089Spjd == 0); 4359185029Spjd vdev_get_stats(vd, vs); 4360185029Spjd } 4361185029Spjd } 4362185029Spjd} 4363185029Spjd 4364236884Smmstatic void 4365236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 4366236884Smm{ 4367236884Smm nvlist_t *features; 4368236884Smm zap_cursor_t zc; 4369236884Smm zap_attribute_t za; 4370236884Smm 4371236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 4372236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4373236884Smm 4374253993Smav /* We may be unable to read features if pool is suspended. 
*/ 4375253993Smav if (spa_suspended(spa)) 4376253993Smav goto out; 4377253993Smav 4378236884Smm if (spa->spa_feat_for_read_obj != 0) { 4379236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 4380236884Smm spa->spa_feat_for_read_obj); 4381236884Smm zap_cursor_retrieve(&zc, &za) == 0; 4382236884Smm zap_cursor_advance(&zc)) { 4383236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 4384236884Smm za.za_num_integers == 1); 4385236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 4386236884Smm za.za_first_integer)); 4387236884Smm } 4388236884Smm zap_cursor_fini(&zc); 4389236884Smm } 4390236884Smm 4391236884Smm if (spa->spa_feat_for_write_obj != 0) { 4392236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 4393236884Smm spa->spa_feat_for_write_obj); 4394236884Smm zap_cursor_retrieve(&zc, &za) == 0; 4395236884Smm zap_cursor_advance(&zc)) { 4396236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 4397236884Smm za.za_num_integers == 1); 4398236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 4399236884Smm za.za_first_integer)); 4400236884Smm } 4401236884Smm zap_cursor_fini(&zc); 4402236884Smm } 4403236884Smm 4404253993Smavout: 4405236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 4406236884Smm features) == 0); 4407236884Smm nvlist_free(features); 4408236884Smm} 4409236884Smm 4410168404Spjdint 4411236884Smmspa_get_stats(const char *name, nvlist_t **config, 4412236884Smm char *altroot, size_t buflen) 4413168404Spjd{ 4414168404Spjd int error; 4415168404Spjd spa_t *spa; 4416168404Spjd 4417168404Spjd *config = NULL; 4418219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 4419168404Spjd 4420209962Smm if (spa != NULL) { 4421209962Smm /* 4422209962Smm * This still leaves a window of inconsistency where the spares 4423209962Smm * or l2cache devices could change and the config would be 4424209962Smm * self-inconsistent. 
4425209962Smm */ 4426209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4427168404Spjd 4428209962Smm if (*config != NULL) { 4429219089Spjd uint64_t loadtimes[2]; 4430219089Spjd 4431219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 4432219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 4433219089Spjd VERIFY(nvlist_add_uint64_array(*config, 4434219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 4435219089Spjd 4436185029Spjd VERIFY(nvlist_add_uint64(*config, 4437209962Smm ZPOOL_CONFIG_ERRCOUNT, 4438209962Smm spa_get_errlog_size(spa)) == 0); 4439185029Spjd 4440209962Smm if (spa_suspended(spa)) 4441209962Smm VERIFY(nvlist_add_uint64(*config, 4442209962Smm ZPOOL_CONFIG_SUSPENDED, 4443209962Smm spa->spa_failmode) == 0); 4444209962Smm 4445209962Smm spa_add_spares(spa, *config); 4446209962Smm spa_add_l2cache(spa, *config); 4447236884Smm spa_add_feature_stats(spa, *config); 4448209962Smm } 4449168404Spjd } 4450168404Spjd 4451168404Spjd /* 4452168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 4453168404Spjd * and call spa_lookup() directly. 4454168404Spjd */ 4455168404Spjd if (altroot) { 4456168404Spjd if (spa == NULL) { 4457168404Spjd mutex_enter(&spa_namespace_lock); 4458168404Spjd spa = spa_lookup(name); 4459168404Spjd if (spa) 4460168404Spjd spa_altroot(spa, altroot, buflen); 4461168404Spjd else 4462168404Spjd altroot[0] = '\0'; 4463168404Spjd spa = NULL; 4464168404Spjd mutex_exit(&spa_namespace_lock); 4465168404Spjd } else { 4466168404Spjd spa_altroot(spa, altroot, buflen); 4467168404Spjd } 4468168404Spjd } 4469168404Spjd 4470209962Smm if (spa != NULL) { 4471209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 4472168404Spjd spa_close(spa, FTAG); 4473209962Smm } 4474168404Spjd 4475168404Spjd return (error); 4476168404Spjd} 4477168404Spjd 4478168404Spjd/* 4479185029Spjd * Validate that the auxiliary device array is well formed. We must have an 4480185029Spjd * array of nvlists, each which describes a valid leaf vdev. 
If this is an 4481185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 4482185029Spjd * specified, as long as they are well-formed. 4483168404Spjd */ 4484168404Spjdstatic int 4485185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 4486185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 4487185029Spjd vdev_labeltype_t label) 4488168404Spjd{ 4489185029Spjd nvlist_t **dev; 4490185029Spjd uint_t i, ndev; 4491168404Spjd vdev_t *vd; 4492168404Spjd int error; 4493168404Spjd 4494185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4495185029Spjd 4496168404Spjd /* 4497185029Spjd * It's acceptable to have no devs specified. 4498168404Spjd */ 4499185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 4500168404Spjd return (0); 4501168404Spjd 4502185029Spjd if (ndev == 0) 4503249195Smm return (SET_ERROR(EINVAL)); 4504168404Spjd 4505168404Spjd /* 4506185029Spjd * Make sure the pool is formatted with a version that supports this 4507185029Spjd * device type. 4508168404Spjd */ 4509185029Spjd if (spa_version(spa) < version) 4510249195Smm return (SET_ERROR(ENOTSUP)); 4511168404Spjd 4512168404Spjd /* 4513185029Spjd * Set the pending device list so we correctly handle device in-use 4514168404Spjd * checking. 4515168404Spjd */ 4516185029Spjd sav->sav_pending = dev; 4517185029Spjd sav->sav_npending = ndev; 4518168404Spjd 4519185029Spjd for (i = 0; i < ndev; i++) { 4520185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 4521168404Spjd mode)) != 0) 4522168404Spjd goto out; 4523168404Spjd 4524168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 4525168404Spjd vdev_free(vd); 4526249195Smm error = SET_ERROR(EINVAL); 4527168404Spjd goto out; 4528168404Spjd } 4529168404Spjd 4530185029Spjd /* 4531185029Spjd * The L2ARC currently only supports disk devices in 4532185029Spjd * kernel context. For user-level testing, we allow it. 
4533185029Spjd */ 4534185029Spjd#ifdef _KERNEL 4535185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 4536185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 4537249195Smm error = SET_ERROR(ENOTBLK); 4538230514Smm vdev_free(vd); 4539185029Spjd goto out; 4540185029Spjd } 4541185029Spjd#endif 4542168404Spjd vd->vdev_top = vd; 4543168404Spjd 4544168404Spjd if ((error = vdev_open(vd)) == 0 && 4545185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 4546185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 4547168404Spjd vd->vdev_guid) == 0); 4548168404Spjd } 4549168404Spjd 4550168404Spjd vdev_free(vd); 4551168404Spjd 4552185029Spjd if (error && 4553185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 4554168404Spjd goto out; 4555168404Spjd else 4556168404Spjd error = 0; 4557168404Spjd } 4558168404Spjd 4559168404Spjdout: 4560185029Spjd sav->sav_pending = NULL; 4561185029Spjd sav->sav_npending = 0; 4562168404Spjd return (error); 4563168404Spjd} 4564168404Spjd 4565185029Spjdstatic int 4566185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 4567185029Spjd{ 4568185029Spjd int error; 4569185029Spjd 4570185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4571185029Spjd 4572185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 4573185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 4574185029Spjd VDEV_LABEL_SPARE)) != 0) { 4575185029Spjd return (error); 4576185029Spjd } 4577185029Spjd 4578185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 4579185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 4580185029Spjd VDEV_LABEL_L2CACHE)); 4581185029Spjd} 4582185029Spjd 4583185029Spjdstatic void 4584185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 4585185029Spjd const char *config) 4586185029Spjd{ 4587185029Spjd int i; 4588185029Spjd 4589185029Spjd if (sav->sav_config != 
NULL) { 4590185029Spjd nvlist_t **olddevs; 4591185029Spjd uint_t oldndevs; 4592185029Spjd nvlist_t **newdevs; 4593185029Spjd 4594185029Spjd /* 4595185029Spjd * Generate new dev list by concatentating with the 4596185029Spjd * current dev list. 4597185029Spjd */ 4598185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 4599185029Spjd &olddevs, &oldndevs) == 0); 4600185029Spjd 4601185029Spjd newdevs = kmem_alloc(sizeof (void *) * 4602185029Spjd (ndevs + oldndevs), KM_SLEEP); 4603185029Spjd for (i = 0; i < oldndevs; i++) 4604185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 4605185029Spjd KM_SLEEP) == 0); 4606185029Spjd for (i = 0; i < ndevs; i++) 4607185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 4608185029Spjd KM_SLEEP) == 0); 4609185029Spjd 4610185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 4611185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 4612185029Spjd 4613185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 4614185029Spjd config, newdevs, ndevs + oldndevs) == 0); 4615185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 4616185029Spjd nvlist_free(newdevs[i]); 4617185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 4618185029Spjd } else { 4619185029Spjd /* 4620185029Spjd * Generate a new dev list. 
4621185029Spjd */ 4622185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 4623185029Spjd KM_SLEEP) == 0); 4624185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 4625185029Spjd devs, ndevs) == 0); 4626185029Spjd } 4627185029Spjd} 4628185029Spjd 4629168404Spjd/* 4630185029Spjd * Stop and drop level 2 ARC devices 4631185029Spjd */ 4632185029Spjdvoid 4633185029Spjdspa_l2cache_drop(spa_t *spa) 4634185029Spjd{ 4635185029Spjd vdev_t *vd; 4636185029Spjd int i; 4637185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 4638185029Spjd 4639185029Spjd for (i = 0; i < sav->sav_count; i++) { 4640185029Spjd uint64_t pool; 4641185029Spjd 4642185029Spjd vd = sav->sav_vdevs[i]; 4643185029Spjd ASSERT(vd != NULL); 4644185029Spjd 4645209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 4646209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 4647185029Spjd l2arc_remove_vdev(vd); 4648185029Spjd } 4649185029Spjd} 4650185029Spjd 4651185029Spjd/* 4652168404Spjd * Pool Creation 4653168404Spjd */ 4654168404Spjdint 4655185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 4656248571Smm nvlist_t *zplprops) 4657168404Spjd{ 4658168404Spjd spa_t *spa; 4659185029Spjd char *altroot = NULL; 4660168404Spjd vdev_t *rvd; 4661168404Spjd dsl_pool_t *dp; 4662168404Spjd dmu_tx_t *tx; 4663219089Spjd int error = 0; 4664168404Spjd uint64_t txg = TXG_INITIAL; 4665185029Spjd nvlist_t **spares, **l2cache; 4666185029Spjd uint_t nspares, nl2cache; 4667219089Spjd uint64_t version, obj; 4668236884Smm boolean_t has_features; 4669333194Savg char *poolname; 4670333194Savg nvlist_t *nvl; 4671168404Spjd 4672333194Savg if (nvlist_lookup_string(props, 4673333194Savg zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) 4674333194Savg poolname = (char *)pool; 4675333194Savg 4676168404Spjd /* 4677168404Spjd * If this pool already exists, return failure. 
4678168404Spjd */ 4679168404Spjd mutex_enter(&spa_namespace_lock); 4680333194Savg if (spa_lookup(poolname) != NULL) { 4681168404Spjd mutex_exit(&spa_namespace_lock); 4682249195Smm return (SET_ERROR(EEXIST)); 4683168404Spjd } 4684168404Spjd 4685168404Spjd /* 4686168404Spjd * Allocate a new spa_t structure. 4687168404Spjd */ 4688333194Savg nvl = fnvlist_alloc(); 4689333194Savg fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 4690185029Spjd (void) nvlist_lookup_string(props, 4691185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4692333194Savg spa = spa_add(poolname, nvl, altroot); 4693333194Savg fnvlist_free(nvl); 4694209962Smm spa_activate(spa, spa_mode_global); 4695168404Spjd 4696185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 4697185029Spjd spa_deactivate(spa); 4698185029Spjd spa_remove(spa); 4699185029Spjd mutex_exit(&spa_namespace_lock); 4700185029Spjd return (error); 4701185029Spjd } 4702185029Spjd 4703333194Savg /* 4704333194Savg * Temporary pool names should never be written to disk. 
4705333194Savg */ 4706333194Savg if (poolname != pool) 4707333194Savg spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 4708333194Savg 4709236884Smm has_features = B_FALSE; 4710236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 4711236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 4712236884Smm if (zpool_prop_feature(nvpair_name(elem))) 4713236884Smm has_features = B_TRUE; 4714236884Smm } 4715236884Smm 4716236884Smm if (has_features || nvlist_lookup_uint64(props, 4717236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 4718185029Spjd version = SPA_VERSION; 4719236884Smm } 4720236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 4721219089Spjd 4722219089Spjd spa->spa_first_txg = txg; 4723219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 4724185029Spjd spa->spa_uberblock.ub_version = version; 4725168404Spjd spa->spa_ubsync = spa->spa_uberblock; 4726307277Smav spa->spa_load_state = SPA_LOAD_CREATE; 4727332525Smav spa->spa_removing_phys.sr_state = DSS_NONE; 4728332525Smav spa->spa_removing_phys.sr_removing_vdev = -1; 4729332525Smav spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 4730338403Smav spa->spa_indirect_vdevs_loaded = B_TRUE; 4731168404Spjd 4732168404Spjd /* 4733209962Smm * Create "The Godfather" zio to hold all async IOs 4734209962Smm */ 4735272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 4736272598Sdelphij KM_SLEEP); 4737272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 4738272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 4739272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 4740272598Sdelphij ZIO_FLAG_GODFATHER); 4741272598Sdelphij } 4742209962Smm 4743209962Smm /* 4744168404Spjd * Create the root vdev. 
4745168404Spjd */ 4746185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4747168404Spjd 4748168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 4749168404Spjd 4750168404Spjd ASSERT(error != 0 || rvd != NULL); 4751168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 4752168404Spjd 4753185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 4754249195Smm error = SET_ERROR(EINVAL); 4755168404Spjd 4756168404Spjd if (error == 0 && 4757168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 4758185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 4759168404Spjd VDEV_ALLOC_ADD)) == 0) { 4760219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 4761254591Sgibbs vdev_ashift_optimize(rvd->vdev_child[c]); 4762219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 4763219089Spjd vdev_expand(rvd->vdev_child[c], txg); 4764219089Spjd } 4765168404Spjd } 4766168404Spjd 4767185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4768168404Spjd 4769168404Spjd if (error != 0) { 4770168404Spjd spa_unload(spa); 4771168404Spjd spa_deactivate(spa); 4772168404Spjd spa_remove(spa); 4773168404Spjd mutex_exit(&spa_namespace_lock); 4774168404Spjd return (error); 4775168404Spjd } 4776168404Spjd 4777168404Spjd /* 4778168404Spjd * Get the list of spares, if specified. 
4779168404Spjd */ 4780168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4781168404Spjd &spares, &nspares) == 0) { 4782185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 4783168404Spjd KM_SLEEP) == 0); 4784185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4785168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4786185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4787168404Spjd spa_load_spares(spa); 4788185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4789185029Spjd spa->spa_spares.sav_sync = B_TRUE; 4790168404Spjd } 4791168404Spjd 4792185029Spjd /* 4793185029Spjd * Get the list of level 2 cache devices, if specified. 4794185029Spjd */ 4795185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4796185029Spjd &l2cache, &nl2cache) == 0) { 4797185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4798185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 4799185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4800185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4801185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4802185029Spjd spa_load_l2cache(spa); 4803185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4804185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 4805185029Spjd } 4806185029Spjd 4807236884Smm spa->spa_is_initializing = B_TRUE; 4808185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 4809168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 4810236884Smm spa->spa_is_initializing = B_FALSE; 4811168404Spjd 4812219089Spjd /* 4813219089Spjd * Create DDTs (dedup tables). 4814219089Spjd */ 4815219089Spjd ddt_create(spa); 4816219089Spjd 4817219089Spjd spa_update_dspace(spa); 4818219089Spjd 4819168404Spjd tx = dmu_tx_create_assigned(dp, txg); 4820168404Spjd 4821168404Spjd /* 4822168404Spjd * Create the pool config object. 
4823168404Spjd */ 4824168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 4825185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 4826168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 4827168404Spjd 4828168404Spjd if (zap_add(spa->spa_meta_objset, 4829168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 4830168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 4831168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 4832168404Spjd } 4833168404Spjd 4834236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 4835236884Smm spa_feature_create_zap_objects(spa, tx); 4836236884Smm 4837219089Spjd if (zap_add(spa->spa_meta_objset, 4838219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 4839219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 4840219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 4841219089Spjd } 4842219089Spjd 4843185029Spjd /* Newly created pools with the right version are always deflated. */ 4844185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 4845185029Spjd spa->spa_deflate = TRUE; 4846185029Spjd if (zap_add(spa->spa_meta_objset, 4847185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 4848185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 4849185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 4850185029Spjd } 4851168404Spjd } 4852168404Spjd 4853168404Spjd /* 4854219089Spjd * Create the deferred-free bpobj. Turn off compression 4855168404Spjd * because sync-to-convergence takes longer if the blocksize 4856168404Spjd * keeps changing. 
4857168404Spjd */ 4858219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 4859219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj, 4860168404Spjd ZIO_COMPRESS_OFF, tx); 4861168404Spjd if (zap_add(spa->spa_meta_objset, 4862219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 4863219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) { 4864219089Spjd cmn_err(CE_PANIC, "failed to add bpobj"); 4865168404Spjd } 4866219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 4867219089Spjd spa->spa_meta_objset, obj)); 4868168404Spjd 4869168404Spjd /* 4870168404Spjd * Create the pool's history object. 4871168404Spjd */ 4872185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 4873185029Spjd spa_history_create_obj(spa, tx); 4874168404Spjd 4875185029Spjd /* 4876289422Smav * Generate some random noise for salted checksums to operate on. 4877289422Smav */ 4878289422Smav (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4879289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes)); 4880289422Smav 4881289422Smav /* 4882185029Spjd * Set pool properties. 4883185029Spjd */ 4884185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 4885185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4886185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 4887219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 4888219089Spjd 4889209962Smm if (props != NULL) { 4890209962Smm spa_configfile_set(spa, props, B_FALSE); 4891248571Smm spa_sync_props(props, tx); 4892209962Smm } 4893185029Spjd 4894168404Spjd dmu_tx_commit(tx); 4895168404Spjd 4896168404Spjd spa->spa_sync_on = B_TRUE; 4897168404Spjd txg_sync_start(spa->spa_dsl_pool); 4898168404Spjd 4899168404Spjd /* 4900168404Spjd * We explicitly wait for the first transaction to complete so that our 4901168404Spjd * bean counters are appropriately updated. 
4902168404Spjd */ 4903168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 4904168404Spjd 4905332537Smav spa_spawn_aux_threads(spa); 4906332537Smav 4907332525Smav spa_write_cachefile(spa, B_FALSE, B_TRUE); 4908331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 4909168404Spjd 4910248571Smm spa_history_log_version(spa, "create"); 4911185029Spjd 4912286575Smav /* 4913286575Smav * Don't count references from objsets that are already closed 4914286575Smav * and are making their way through the eviction process. 4915286575Smav */ 4916286575Smav spa_evicting_os_wait(spa); 4917208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 4918307277Smav spa->spa_load_state = SPA_LOAD_NONE; 4919208442Smm 4920168404Spjd mutex_exit(&spa_namespace_lock); 4921168404Spjd 4922168404Spjd return (0); 4923168404Spjd} 4924168404Spjd 4925241286Savg#ifdef _KERNEL 4926277300Ssmh#ifdef illumos 4927185029Spjd/* 4928219089Spjd * Get the root pool information from the root disk, then import the root pool 4929219089Spjd * during the system boot up time. 4930185029Spjd */ 4931219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 4932219089Spjd 4933219089Spjdstatic nvlist_t * 4934219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 4935185029Spjd{ 4936219089Spjd nvlist_t *config; 4937185029Spjd nvlist_t *nvtop, *nvroot; 4938185029Spjd uint64_t pgid; 4939185029Spjd 4940219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 4941219089Spjd return (NULL); 4942219089Spjd 4943168404Spjd /* 4944185029Spjd * Add this top-level vdev to the child array. 
4945168404Spjd */ 4946219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4947219089Spjd &nvtop) == 0); 4948219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4949219089Spjd &pgid) == 0); 4950219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 4951168404Spjd 4952185029Spjd /* 4953185029Spjd * Put this pool's top-level vdevs into a root vdev. 4954185029Spjd */ 4955185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4956219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 4957219089Spjd VDEV_TYPE_ROOT) == 0); 4958185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 4959185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 4960185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 4961185029Spjd &nvtop, 1) == 0); 4962168404Spjd 4963168404Spjd /* 4964185029Spjd * Replace the existing vdev_tree with the new root vdev in 4965185029Spjd * this pool's configuration (remove the old, add the new). 4966168404Spjd */ 4967185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 4968185029Spjd nvlist_free(nvroot); 4969219089Spjd return (config); 4970185029Spjd} 4971168404Spjd 4972185029Spjd/* 4973219089Spjd * Walk the vdev tree and see if we can find a device with "better" 4974219089Spjd * configuration. A configuration is "better" if the label on that 4975219089Spjd * device has a more recent txg. 
4976185029Spjd */ 4977219089Spjdstatic void 4978219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 4979185029Spjd{ 4980219089Spjd for (int c = 0; c < vd->vdev_children; c++) 4981219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 4982185029Spjd 4983219089Spjd if (vd->vdev_ops->vdev_op_leaf) { 4984219089Spjd nvlist_t *label; 4985219089Spjd uint64_t label_txg; 4986185029Spjd 4987219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 4988219089Spjd &label) != 0) 4989219089Spjd return; 4990185029Spjd 4991219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 4992219089Spjd &label_txg) == 0); 4993168404Spjd 4994219089Spjd /* 4995219089Spjd * Do we have a better boot device? 4996219089Spjd */ 4997219089Spjd if (label_txg > *txg) { 4998219089Spjd *txg = label_txg; 4999219089Spjd *avd = vd; 5000185029Spjd } 5001219089Spjd nvlist_free(label); 5002185029Spjd } 5003185029Spjd} 5004185029Spjd 5005185029Spjd/* 5006185029Spjd * Import a root pool. 5007185029Spjd * 5008185029Spjd * For x86. devpath_list will consist of devid and/or physpath name of 5009185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 5010185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 5011185029Spjd * 5012185029Spjd * For Sparc, devpath_list consists the physpath name of the booting device 5013185029Spjd * no matter the rootpool is a single device pool or a mirrored pool. 5014185029Spjd * e.g. 5015185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 5016185029Spjd */ 5017185029Spjdint 5018185029Spjdspa_import_rootpool(char *devpath, char *devid) 5019185029Spjd{ 5020219089Spjd spa_t *spa; 5021219089Spjd vdev_t *rvd, *bvd, *avd = NULL; 5022219089Spjd nvlist_t *config, *nvtop; 5023219089Spjd uint64_t guid, txg; 5024185029Spjd char *pname; 5025185029Spjd int error; 5026185029Spjd 5027185029Spjd /* 5028219089Spjd * Read the label from the boot device and generate a configuration. 
5029185029Spjd */ 5030219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 5031219089Spjd#if defined(_OBP) && defined(_KERNEL) 5032219089Spjd if (config == NULL) { 5033219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) { 5034219089Spjd /* iscsi boot */ 5035219089Spjd get_iscsi_bootpath_phy(devpath); 5036219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 5037219089Spjd } 5038219089Spjd } 5039219089Spjd#endif 5040219089Spjd if (config == NULL) { 5041236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 5042219089Spjd devpath); 5043249195Smm return (SET_ERROR(EIO)); 5044219089Spjd } 5045185029Spjd 5046219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 5047219089Spjd &pname) == 0); 5048219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 5049185029Spjd 5050209962Smm mutex_enter(&spa_namespace_lock); 5051209962Smm if ((spa = spa_lookup(pname)) != NULL) { 5052209962Smm /* 5053209962Smm * Remove the existing root pool from the namespace so that we 5054209962Smm * can replace it with the correct config we just read in. 5055209962Smm */ 5056209962Smm spa_remove(spa); 5057209962Smm } 5058185029Spjd 5059219089Spjd spa = spa_add(pname, config, NULL); 5060209962Smm spa->spa_is_root = B_TRUE; 5061219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 5062331721Smav if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 5063331721Smav &spa->spa_ubsync.ub_version) != 0) 5064331721Smav spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 5065209962Smm 5066219089Spjd /* 5067219089Spjd * Build up a vdev tree based on the boot device's label config. 
5068219089Spjd */ 5069219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 5070219089Spjd &nvtop) == 0); 5071219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5072219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 5073219089Spjd VDEV_ALLOC_ROOTPOOL); 5074219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 5075219089Spjd if (error) { 5076209962Smm mutex_exit(&spa_namespace_lock); 5077219089Spjd nvlist_free(config); 5078219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 5079219089Spjd pname); 5080219089Spjd return (error); 5081209962Smm } 5082209962Smm 5083219089Spjd /* 5084219089Spjd * Get the boot vdev. 5085219089Spjd */ 5086219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 5087219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 5088219089Spjd (u_longlong_t)guid); 5089249195Smm error = SET_ERROR(ENOENT); 5090219089Spjd goto out; 5091219089Spjd } 5092209962Smm 5093219089Spjd /* 5094219089Spjd * Determine if there is a better boot device. 5095219089Spjd */ 5096219089Spjd avd = bvd; 5097219089Spjd spa_alt_rootvdev(rvd, &avd, &txg); 5098219089Spjd if (avd != bvd) { 5099219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 5100219089Spjd "try booting from '%s'", avd->vdev_path); 5101249195Smm error = SET_ERROR(EINVAL); 5102219089Spjd goto out; 5103219089Spjd } 5104209962Smm 5105219089Spjd /* 5106219089Spjd * If the boot device is part of a spare vdev then ensure that 5107219089Spjd * we're booting off the active spare. 5108219089Spjd */ 5109219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 5110219089Spjd !bvd->vdev_isspare) { 5111219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 5112219089Spjd "try booting from '%s'", 5113219089Spjd bvd->vdev_parent-> 5114219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 5115249195Smm error = SET_ERROR(EINVAL); 5116219089Spjd goto out; 5117219089Spjd } 5118209962Smm 5119219089Spjd error = 0; 5120219089Spjdout: 5121219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5122219089Spjd vdev_free(rvd); 5123219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 5124209962Smm mutex_exit(&spa_namespace_lock); 5125209962Smm 5126219089Spjd nvlist_free(config); 5127219089Spjd return (error); 5128185029Spjd} 5129185029Spjd 5130277300Ssmh#else /* !illumos */ 5131241286Savg 5132243502Savgextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 5133243502Savg uint64_t *count); 5134241286Savg 5135241286Savgstatic nvlist_t * 5136241286Savgspa_generate_rootconf(const char *name) 5137241286Savg{ 5138243502Savg nvlist_t **configs, **tops; 5139241286Savg nvlist_t *config; 5140243502Savg nvlist_t *best_cfg, *nvtop, *nvroot; 5141243502Savg uint64_t *holes; 5142243502Savg uint64_t best_txg; 5143243213Savg uint64_t nchildren; 5144241286Savg uint64_t pgid; 5145243502Savg uint64_t count; 5146243502Savg uint64_t i; 5147243502Savg uint_t nholes; 5148241286Savg 5149243502Savg if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 5150241286Savg return (NULL); 5151241286Savg 5152243502Savg ASSERT3U(count, !=, 0); 5153243502Savg best_txg = 0; 5154243502Savg for (i = 0; i < count; i++) { 5155243502Savg uint64_t txg; 5156243502Savg 5157243502Savg VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 5158243502Savg &txg) == 0); 5159243502Savg if (txg > best_txg) { 5160243502Savg best_txg = txg; 5161243502Savg best_cfg = configs[i]; 5162243502Savg } 5163243502Savg } 5164243502Savg 5165245945Savg nchildren = 1; 5166245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 5167243502Savg holes = NULL; 5168243502Savg nvlist_lookup_uint64_array(best_cfg, 
ZPOOL_CONFIG_HOLE_ARRAY, 5169243502Savg &holes, &nholes); 5170243502Savg 5171244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 5172243502Savg for (i = 0; i < nchildren; i++) { 5173243502Savg if (i >= count) 5174243502Savg break; 5175243502Savg if (configs[i] == NULL) 5176243502Savg continue; 5177243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 5178243502Savg &nvtop) == 0); 5179243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP); 5180243213Savg } 5181243502Savg for (i = 0; holes != NULL && i < nholes; i++) { 5182243502Savg if (i >= nchildren) 5183243502Savg continue; 5184243502Savg if (tops[holes[i]] != NULL) 5185243502Savg continue; 5186243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 5187243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 5188243502Savg VDEV_TYPE_HOLE) == 0); 5189243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 5190243502Savg holes[i]) == 0); 5191243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 5192243502Savg 0) == 0); 5193243502Savg } 5194243502Savg for (i = 0; i < nchildren; i++) { 5195243502Savg if (tops[i] != NULL) 5196243502Savg continue; 5197243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 5198243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 5199243502Savg VDEV_TYPE_MISSING) == 0); 5200243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 5201243502Savg i) == 0); 5202243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 5203243502Savg 0) == 0); 5204243502Savg } 5205243213Savg 5206243213Savg /* 5207243502Savg * Create pool config based on the best vdev config. 5208241286Savg */ 5209243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 5210241286Savg 5211241286Savg /* 5212241286Savg * Put this pool's top-level vdevs into a root vdev. 
5213241286Savg */ 5214243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5215243502Savg &pgid) == 0); 5216241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5217241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 5218241286Savg VDEV_TYPE_ROOT) == 0); 5219241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 5220241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 5221241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 5222243502Savg tops, nchildren) == 0); 5223241286Savg 5224241286Savg /* 5225241286Savg * Replace the existing vdev_tree with the new root vdev in 5226241286Savg * this pool's configuration (remove the old, add the new). 5227241286Savg */ 5228241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 5229243502Savg 5230243502Savg /* 5231243502Savg * Drop vdev config elements that should not be present at pool level. 5232243502Savg */ 5233243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 5234243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 5235243502Savg 5236243502Savg for (i = 0; i < count; i++) 5237243502Savg nvlist_free(configs[i]); 5238243502Savg kmem_free(configs, count * sizeof(void *)); 5239243502Savg for (i = 0; i < nchildren; i++) 5240243502Savg nvlist_free(tops[i]); 5241243502Savg kmem_free(tops, nchildren * sizeof(void *)); 5242241286Savg nvlist_free(nvroot); 5243241286Savg return (config); 5244241286Savg} 5245241286Savg 5246241286Savgint 5247241286Savgspa_import_rootpool(const char *name) 5248241286Savg{ 5249241286Savg spa_t *spa; 5250241286Savg vdev_t *rvd, *bvd, *avd = NULL; 5251241286Savg nvlist_t *config, *nvtop; 5252241286Savg uint64_t txg; 5253241286Savg char *pname; 5254241286Savg int error; 5255241286Savg 5256241286Savg /* 5257241286Savg * Read the label from the boot device and generate a configuration. 
5258241286Savg */ 5259241286Savg config = spa_generate_rootconf(name); 5260243213Savg 5261243213Savg mutex_enter(&spa_namespace_lock); 5262243213Savg if (config != NULL) { 5263243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 5264243213Savg &pname) == 0 && strcmp(name, pname) == 0); 5265243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 5266243213Savg == 0); 5267243213Savg 5268243213Savg if ((spa = spa_lookup(pname)) != NULL) { 5269243213Savg /* 5270323746Savg * The pool could already be imported, 5271323746Savg * e.g., after reboot -r. 5272323746Savg */ 5273323746Savg if (spa->spa_state == POOL_STATE_ACTIVE) { 5274323746Savg mutex_exit(&spa_namespace_lock); 5275323746Savg nvlist_free(config); 5276323746Savg return (0); 5277323746Savg } 5278323746Savg 5279323746Savg /* 5280243213Savg * Remove the existing root pool from the namespace so 5281243213Savg * that we can replace it with the correct config 5282243213Savg * we just read in. 5283243213Savg */ 5284243213Savg spa_remove(spa); 5285243213Savg } 5286243213Savg spa = spa_add(pname, config, NULL); 5287243501Savg 5288243501Savg /* 5289243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 5290243501Savg * via spa_version(). 
5291243501Savg */ 5292243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 5293243501Savg &spa->spa_ubsync.ub_version) != 0) 5294243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 5295243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 5296287100Savg mutex_exit(&spa_namespace_lock); 5297287100Savg nvlist_free(config); 5298241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 5299241286Savg name); 5300241286Savg return (EIO); 5301243213Savg } else { 5302243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 5303241286Savg } 5304241286Savg spa->spa_is_root = B_TRUE; 5305241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 5306241286Savg 5307241286Savg /* 5308241286Savg * Build up a vdev tree based on the boot device's label config. 5309241286Savg */ 5310241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 5311241286Savg &nvtop) == 0); 5312241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5313241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 5314241286Savg VDEV_ALLOC_ROOTPOOL); 5315241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 5316241286Savg if (error) { 5317241286Savg mutex_exit(&spa_namespace_lock); 5318241286Savg nvlist_free(config); 5319241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 5320241286Savg pname); 5321241286Savg return (error); 5322241286Savg } 5323241286Savg 5324241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5325241286Savg vdev_free(rvd); 5326241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 5327241286Savg mutex_exit(&spa_namespace_lock); 5328241286Savg 5329243213Savg nvlist_free(config); 5330243213Savg return (0); 5331241286Savg} 5332241286Savg 5333277300Ssmh#endif /* illumos */ 5334277300Ssmh#endif /* _KERNEL */ 5335219089Spjd 5336209962Smm/* 5337209962Smm * Import a non-root pool into the system. 
5338209962Smm */ 5339185029Spjdint 5340219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 5341185029Spjd{ 5342209962Smm spa_t *spa; 5343209962Smm char *altroot = NULL; 5344219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 5345332550Smav zpool_load_policy_t policy; 5346219089Spjd uint64_t mode = spa_mode_global; 5347219089Spjd uint64_t readonly = B_FALSE; 5348209962Smm int error; 5349209962Smm nvlist_t *nvroot; 5350209962Smm nvlist_t **spares, **l2cache; 5351209962Smm uint_t nspares, nl2cache; 5352209962Smm 5353209962Smm /* 5354209962Smm * If a pool with this name exists, return failure. 5355209962Smm */ 5356209962Smm mutex_enter(&spa_namespace_lock); 5357219089Spjd if (spa_lookup(pool) != NULL) { 5358209962Smm mutex_exit(&spa_namespace_lock); 5359249195Smm return (SET_ERROR(EEXIST)); 5360209962Smm } 5361209962Smm 5362209962Smm /* 5363209962Smm * Create and initialize the spa structure. 5364209962Smm */ 5365209962Smm (void) nvlist_lookup_string(props, 5366209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5367219089Spjd (void) nvlist_lookup_uint64(props, 5368219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 5369219089Spjd if (readonly) 5370219089Spjd mode = FREAD; 5371219089Spjd spa = spa_add(pool, config, altroot); 5372219089Spjd spa->spa_import_flags = flags; 5373209962Smm 5374209962Smm /* 5375219089Spjd * Verbatim import - Take a pool and insert it into the namespace 5376219089Spjd * as if it had been loaded at boot. 
5377219089Spjd */ 5378219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 5379219089Spjd if (props != NULL) 5380219089Spjd spa_configfile_set(spa, props, B_FALSE); 5381219089Spjd 5382332525Smav spa_write_cachefile(spa, B_FALSE, B_TRUE); 5383331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 5384332530Smav zfs_dbgmsg("spa_import: verbatim import of %s", pool); 5385219089Spjd mutex_exit(&spa_namespace_lock); 5386219089Spjd return (0); 5387219089Spjd } 5388219089Spjd 5389219089Spjd spa_activate(spa, mode); 5390219089Spjd 5391219089Spjd /* 5392209962Smm * Don't start async tasks until we know everything is healthy. 5393209962Smm */ 5394209962Smm spa_async_suspend(spa); 5395209962Smm 5396332550Smav zpool_get_load_policy(config, &policy); 5397332550Smav if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5398219089Spjd state = SPA_LOAD_RECOVER; 5399219089Spjd 5400332536Smav spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 5401332536Smav 5402332536Smav if (state != SPA_LOAD_RECOVER) { 5403219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5404332536Smav zfs_dbgmsg("spa_import: importing %s", pool); 5405332536Smav } else { 5406332536Smav zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 5407332550Smav "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 5408332536Smav } 5409332550Smav error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 5410209962Smm 5411219089Spjd /* 5412219089Spjd * Propagate anything learned while loading the pool and pass it 5413219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 5414219089Spjd */ 5415219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 5416219089Spjd spa->spa_load_info) == 0); 5417219089Spjd 5418209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5419209962Smm /* 5420209962Smm * Toss any existing sparelist, as it doesn't have any validity 5421209962Smm * anymore, and conflicts with spa_has_spare(). 
5422209962Smm */ 5423209962Smm if (spa->spa_spares.sav_config) { 5424209962Smm nvlist_free(spa->spa_spares.sav_config); 5425209962Smm spa->spa_spares.sav_config = NULL; 5426209962Smm spa_load_spares(spa); 5427209962Smm } 5428209962Smm if (spa->spa_l2cache.sav_config) { 5429209962Smm nvlist_free(spa->spa_l2cache.sav_config); 5430209962Smm spa->spa_l2cache.sav_config = NULL; 5431209962Smm spa_load_l2cache(spa); 5432209962Smm } 5433209962Smm 5434209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 5435209962Smm &nvroot) == 0); 5436209962Smm if (error == 0) 5437209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 5438209962Smm VDEV_ALLOC_SPARE); 5439209962Smm if (error == 0) 5440209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 5441209962Smm VDEV_ALLOC_L2CACHE); 5442209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5443209962Smm 5444209962Smm if (props != NULL) 5445209962Smm spa_configfile_set(spa, props, B_FALSE); 5446209962Smm 5447209962Smm if (error != 0 || (props && spa_writeable(spa) && 5448209962Smm (error = spa_prop_set(spa, props)))) { 5449209962Smm spa_unload(spa); 5450209962Smm spa_deactivate(spa); 5451209962Smm spa_remove(spa); 5452209962Smm mutex_exit(&spa_namespace_lock); 5453209962Smm return (error); 5454209962Smm } 5455209962Smm 5456209962Smm spa_async_resume(spa); 5457209962Smm 5458209962Smm /* 5459209962Smm * Override any spares and level 2 cache devices as specified by 5460209962Smm * the user, as these may have correct device names/devids, etc. 
5461209962Smm */ 5462209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5463209962Smm &spares, &nspares) == 0) { 5464209962Smm if (spa->spa_spares.sav_config) 5465209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 5466209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 5467209962Smm else 5468209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 5469209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 5470209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 5471209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 5472209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5473209962Smm spa_load_spares(spa); 5474209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5475209962Smm spa->spa_spares.sav_sync = B_TRUE; 5476209962Smm } 5477209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5478209962Smm &l2cache, &nl2cache) == 0) { 5479209962Smm if (spa->spa_l2cache.sav_config) 5480209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 5481209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 5482209962Smm else 5483209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 5484209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 5485209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 5486209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 5487209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5488209962Smm spa_load_l2cache(spa); 5489209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5490209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 5491209962Smm } 5492209962Smm 5493219089Spjd /* 5494219089Spjd * Check for any removed devices. 5495219089Spjd */ 5496219089Spjd if (spa->spa_autoreplace) { 5497219089Spjd spa_aux_check_removed(&spa->spa_spares); 5498219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 5499219089Spjd } 5500219089Spjd 5501209962Smm if (spa_writeable(spa)) { 5502209962Smm /* 5503209962Smm * Update the config cache to include the newly-imported pool. 
5504209962Smm */ 5505209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5506209962Smm } 5507209962Smm 5508219089Spjd /* 5509219089Spjd * It's possible that the pool was expanded while it was exported. 5510219089Spjd * We kick off an async task to handle this for us. 5511219089Spjd */ 5512219089Spjd spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 5513219089Spjd 5514248571Smm spa_history_log_version(spa, "import"); 5515209962Smm 5516331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 5517287745Sdelphij 5518287745Sdelphij mutex_exit(&spa_namespace_lock); 5519287745Sdelphij 5520219089Spjd#ifdef __FreeBSD__ 5521219089Spjd#ifdef _KERNEL 5522219089Spjd zvol_create_minors(pool); 5523219089Spjd#endif 5524219089Spjd#endif 5525209962Smm return (0); 5526185029Spjd} 5527185029Spjd 5528168404Spjdnvlist_t * 5529168404Spjdspa_tryimport(nvlist_t *tryconfig) 5530168404Spjd{ 5531168404Spjd nvlist_t *config = NULL; 5532332536Smav char *poolname, *cachefile; 5533168404Spjd spa_t *spa; 5534168404Spjd uint64_t state; 5535208443Smm int error; 5536332550Smav zpool_load_policy_t policy; 5537168404Spjd 5538168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 5539168404Spjd return (NULL); 5540168404Spjd 5541168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 5542168404Spjd return (NULL); 5543168404Spjd 5544168404Spjd /* 5545168404Spjd * Create and initialize the spa structure. 5546168404Spjd */ 5547168404Spjd mutex_enter(&spa_namespace_lock); 5548219089Spjd spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 5549209962Smm spa_activate(spa, FREAD); 5550168404Spjd 5551168404Spjd /* 5552332550Smav * Rewind pool if a max txg was provided. 
5553168404Spjd */ 5554332550Smav zpool_get_load_policy(spa->spa_config, &policy); 5555332550Smav if (policy.zlp_txg != UINT64_MAX) { 5556332550Smav spa->spa_load_max_txg = policy.zlp_txg; 5557332536Smav spa->spa_extreme_rewind = B_TRUE; 5558332536Smav zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 5559332550Smav poolname, (longlong_t)policy.zlp_txg); 5560332536Smav } else { 5561332536Smav zfs_dbgmsg("spa_tryimport: importing %s", poolname); 5562332536Smav } 5563168404Spjd 5564332536Smav if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 5565332536Smav == 0) { 5566332536Smav zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 5567332536Smav spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5568332536Smav } else { 5569332536Smav spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 5570332536Smav } 5571332536Smav 5572332536Smav error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 5573332536Smav 5574168404Spjd /* 5575168404Spjd * If 'tryconfig' was at least parsable, return the current config. 5576168404Spjd */ 5577168404Spjd if (spa->spa_root_vdev != NULL) { 5578168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5579168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 5580168404Spjd poolname) == 0); 5581168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5582168404Spjd state) == 0); 5583168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 5584168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 5585236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 5586236884Smm spa->spa_load_info) == 0); 5587168404Spjd 5588168404Spjd /* 5589185029Spjd * If the bootfs property exists on this pool then we 5590185029Spjd * copy it out so that external consumers can tell which 5591185029Spjd * pools are bootable. 
5592168404Spjd */ 5593208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 5594185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 5595185029Spjd 5596185029Spjd /* 5597185029Spjd * We have to play games with the name since the 5598185029Spjd * pool was opened as TRYIMPORT_NAME. 5599185029Spjd */ 5600185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 5601185029Spjd spa->spa_bootfs, tmpname) == 0) { 5602185029Spjd char *cp; 5603185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 5604185029Spjd 5605185029Spjd cp = strchr(tmpname, '/'); 5606185029Spjd if (cp == NULL) { 5607185029Spjd (void) strlcpy(dsname, tmpname, 5608185029Spjd MAXPATHLEN); 5609185029Spjd } else { 5610185029Spjd (void) snprintf(dsname, MAXPATHLEN, 5611185029Spjd "%s/%s", poolname, ++cp); 5612185029Spjd } 5613185029Spjd VERIFY(nvlist_add_string(config, 5614185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 5615185029Spjd kmem_free(dsname, MAXPATHLEN); 5616185029Spjd } 5617185029Spjd kmem_free(tmpname, MAXPATHLEN); 5618185029Spjd } 5619185029Spjd 5620185029Spjd /* 5621185029Spjd * Add the list of hot spares and level 2 cache devices. 5622185029Spjd */ 5623209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5624168404Spjd spa_add_spares(spa, config); 5625185029Spjd spa_add_l2cache(spa, config); 5626209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 5627168404Spjd } 5628168404Spjd 5629168404Spjd spa_unload(spa); 5630168404Spjd spa_deactivate(spa); 5631168404Spjd spa_remove(spa); 5632168404Spjd mutex_exit(&spa_namespace_lock); 5633168404Spjd 5634168404Spjd return (config); 5635168404Spjd} 5636168404Spjd 5637168404Spjd/* 5638168404Spjd * Pool export/destroy 5639168404Spjd * 5640168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 5641168404Spjd * is no more pending I/O and any references to the pool are gone. 
Then, we 5642168404Spjd * update the pool state and sync all the labels to disk, removing the 5643207670Smm * configuration from the cache afterwards. If the 'hardforce' flag is set, then 5644207670Smm * we don't sync the labels or remove the configuration cache. 5645168404Spjd */ 5646168404Spjdstatic int 5647185029Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 5648207670Smm boolean_t force, boolean_t hardforce) 5649168404Spjd{ 5650168404Spjd spa_t *spa; 5651168404Spjd 5652168404Spjd if (oldconfig) 5653168404Spjd *oldconfig = NULL; 5654168404Spjd 5655209962Smm if (!(spa_mode_global & FWRITE)) 5656249195Smm return (SET_ERROR(EROFS)); 5657168404Spjd 5658168404Spjd mutex_enter(&spa_namespace_lock); 5659168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 5660168404Spjd mutex_exit(&spa_namespace_lock); 5661249195Smm return (SET_ERROR(ENOENT)); 5662168404Spjd } 5663168404Spjd 5664168404Spjd /* 5665168404Spjd * Put a hold on the pool, drop the namespace lock, stop async tasks, 5666168404Spjd * reacquire the namespace lock, and see if we can export. 5667168404Spjd */ 5668168404Spjd spa_open_ref(spa, FTAG); 5669168404Spjd mutex_exit(&spa_namespace_lock); 5670168404Spjd spa_async_suspend(spa); 5671168404Spjd mutex_enter(&spa_namespace_lock); 5672168404Spjd spa_close(spa, FTAG); 5673168404Spjd 5674168404Spjd /* 5675168404Spjd * The pool will be in core if it's openable, 5676168404Spjd * in which case we can modify its state. 5677168404Spjd */ 5678168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 5679168404Spjd /* 5680168404Spjd * Objsets may be open only because they're dirty, so we 5681168404Spjd * have to force it to sync before checking spa_refcnt. 5682168404Spjd */ 5683168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 5684286575Smav spa_evicting_os_wait(spa); 5685168404Spjd 5686168404Spjd /* 5687168404Spjd * A pool cannot be exported or destroyed if there are active 5688168404Spjd * references. 
If we are resetting a pool, allow references by 5689168404Spjd * fault injection handlers. 5690168404Spjd */ 5691168404Spjd if (!spa_refcount_zero(spa) || 5692168404Spjd (spa->spa_inject_ref != 0 && 5693168404Spjd new_state != POOL_STATE_UNINITIALIZED)) { 5694168404Spjd spa_async_resume(spa); 5695168404Spjd mutex_exit(&spa_namespace_lock); 5696249195Smm return (SET_ERROR(EBUSY)); 5697168404Spjd } 5698168404Spjd 5699185029Spjd /* 5700185029Spjd * A pool cannot be exported if it has an active shared spare. 5701185029Spjd * This is to prevent other pools stealing the active spare 5702185029Spjd * from an exported pool. At user's own will, such pool can 5703185029Spjd * be forcedly exported. 5704185029Spjd */ 5705185029Spjd if (!force && new_state == POOL_STATE_EXPORTED && 5706185029Spjd spa_has_active_shared_spare(spa)) { 5707185029Spjd spa_async_resume(spa); 5708185029Spjd mutex_exit(&spa_namespace_lock); 5709249195Smm return (SET_ERROR(EXDEV)); 5710185029Spjd } 5711168404Spjd 5712168404Spjd /* 5713168404Spjd * We want this to be reflected on every label, 5714168404Spjd * so mark them all dirty. spa_unload() will do the 5715168404Spjd * final sync that pushes these changes out. 
5716168404Spjd */ 5717207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 5718185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5719168404Spjd spa->spa_state = new_state; 5720219089Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 5721219089Spjd TXG_DEFER_SIZE + 1; 5722168404Spjd vdev_config_dirty(spa->spa_root_vdev); 5723185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 5724168404Spjd } 5725168404Spjd } 5726168404Spjd 5727331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 5728185029Spjd 5729168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 5730168404Spjd spa_unload(spa); 5731168404Spjd spa_deactivate(spa); 5732168404Spjd } 5733168404Spjd 5734168404Spjd if (oldconfig && spa->spa_config) 5735168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 5736168404Spjd 5737168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 5738207670Smm if (!hardforce) 5739332525Smav spa_write_cachefile(spa, B_TRUE, B_TRUE); 5740168404Spjd spa_remove(spa); 5741168404Spjd } 5742168404Spjd mutex_exit(&spa_namespace_lock); 5743168404Spjd 5744168404Spjd return (0); 5745168404Spjd} 5746168404Spjd 5747168404Spjd/* 5748168404Spjd * Destroy a storage pool. 5749168404Spjd */ 5750168404Spjdint 5751168404Spjdspa_destroy(char *pool) 5752168404Spjd{ 5753207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 5754207670Smm B_FALSE, B_FALSE)); 5755168404Spjd} 5756168404Spjd 5757168404Spjd/* 5758168404Spjd * Export a storage pool. 5759168404Spjd */ 5760168404Spjdint 5761207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 5762207670Smm boolean_t hardforce) 5763168404Spjd{ 5764207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 5765207670Smm force, hardforce)); 5766168404Spjd} 5767168404Spjd 5768168404Spjd/* 5769168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 5770168404Spjd * from the namespace in any way. 
5771168404Spjd */ 5772168404Spjdint 5773168404Spjdspa_reset(char *pool) 5774168404Spjd{ 5775185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 5776207670Smm B_FALSE, B_FALSE)); 5777168404Spjd} 5778168404Spjd 5779168404Spjd/* 5780168404Spjd * ========================================================================== 5781168404Spjd * Device manipulation 5782168404Spjd * ========================================================================== 5783168404Spjd */ 5784168404Spjd 5785168404Spjd/* 5786185029Spjd * Add a device to a storage pool. 5787168404Spjd */ 5788168404Spjdint 5789168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 5790168404Spjd{ 5791219089Spjd uint64_t txg, id; 5792209962Smm int error; 5793168404Spjd vdev_t *rvd = spa->spa_root_vdev; 5794168404Spjd vdev_t *vd, *tvd; 5795185029Spjd nvlist_t **spares, **l2cache; 5796185029Spjd uint_t nspares, nl2cache; 5797168404Spjd 5798219089Spjd ASSERT(spa_writeable(spa)); 5799219089Spjd 5800168404Spjd txg = spa_vdev_enter(spa); 5801168404Spjd 5802168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 5803168404Spjd VDEV_ALLOC_ADD)) != 0) 5804168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5805168404Spjd 5806185029Spjd spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 5807168404Spjd 5808185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 5809185029Spjd &nspares) != 0) 5810168404Spjd nspares = 0; 5811168404Spjd 5812185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 5813185029Spjd &nl2cache) != 0) 5814185029Spjd nl2cache = 0; 5815185029Spjd 5816185029Spjd if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 5817168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5818168404Spjd 5819185029Spjd if (vd->vdev_children != 0 && 5820185029Spjd (error = vdev_create(vd, txg, B_FALSE)) != 0) 5821185029Spjd return (spa_vdev_exit(spa, vd, txg, error)); 5822168404Spjd 5823168404Spjd /* 
5824185029Spjd * We must validate the spares and l2cache devices after checking the 5825185029Spjd * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 5826168404Spjd */ 5827185029Spjd if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 5828168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 5829168404Spjd 5830168404Spjd /* 5831332525Smav * If we are in the middle of a device removal, we can only add 5832332525Smav * devices which match the existing devices in the pool. 5833332525Smav * If we are in the middle of a removal, or have some indirect 5834332525Smav * vdevs, we can not add raidz toplevels. 5835168404Spjd */ 5836332525Smav if (spa->spa_vdev_removal != NULL || 5837332525Smav spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 5838332525Smav for (int c = 0; c < vd->vdev_children; c++) { 5839332525Smav tvd = vd->vdev_child[c]; 5840332525Smav if (spa->spa_vdev_removal != NULL && 5841332525Smav tvd->vdev_ashift != 5842332525Smav spa->spa_vdev_removal->svr_vdev->vdev_ashift) { 5843332525Smav return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5844332525Smav } 5845332525Smav /* Fail if top level vdev is raidz */ 5846332525Smav if (tvd->vdev_ops == &vdev_raidz_ops) { 5847332525Smav return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5848332525Smav } 5849332525Smav /* 5850332525Smav * Need the top level mirror to be 5851332525Smav * a mirror of leaf vdevs only 5852332525Smav */ 5853332525Smav if (tvd->vdev_ops == &vdev_mirror_ops) { 5854332525Smav for (uint64_t cid = 0; 5855332525Smav cid < tvd->vdev_children; cid++) { 5856332525Smav vdev_t *cvd = tvd->vdev_child[cid]; 5857332525Smav if (!cvd->vdev_ops->vdev_op_leaf) { 5858332525Smav return (spa_vdev_exit(spa, vd, 5859332525Smav txg, EINVAL)); 5860332525Smav } 5861332525Smav } 5862332525Smav } 5863332525Smav } 5864332525Smav } 5865332525Smav 5866209962Smm for (int c = 0; c < vd->vdev_children; c++) { 5867219089Spjd 5868219089Spjd /* 5869219089Spjd * Set the vdev id to the first hole, 
if one exists. 5870219089Spjd */ 5871219089Spjd for (id = 0; id < rvd->vdev_children; id++) { 5872219089Spjd if (rvd->vdev_child[id]->vdev_ishole) { 5873219089Spjd vdev_free(rvd->vdev_child[id]); 5874219089Spjd break; 5875219089Spjd } 5876219089Spjd } 5877168404Spjd tvd = vd->vdev_child[c]; 5878168404Spjd vdev_remove_child(vd, tvd); 5879219089Spjd tvd->vdev_id = id; 5880168404Spjd vdev_add_child(rvd, tvd); 5881168404Spjd vdev_config_dirty(tvd); 5882168404Spjd } 5883168404Spjd 5884168404Spjd if (nspares != 0) { 5885185029Spjd spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 5886185029Spjd ZPOOL_CONFIG_SPARES); 5887168404Spjd spa_load_spares(spa); 5888185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5889168404Spjd } 5890168404Spjd 5891185029Spjd if (nl2cache != 0) { 5892185029Spjd spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 5893185029Spjd ZPOOL_CONFIG_L2CACHE); 5894185029Spjd spa_load_l2cache(spa); 5895185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5896185029Spjd } 5897185029Spjd 5898168404Spjd /* 5899168404Spjd * We have to be careful when adding new vdevs to an existing pool. 5900168404Spjd * If other threads start allocating from these vdevs before we 5901168404Spjd * sync the config cache, and we lose power, then upon reboot we may 5902168404Spjd * fail to open the pool because there are DVAs that the config cache 5903168404Spjd * can't translate. Therefore, we first add the vdevs without 5904168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 5905168404Spjd * and then let spa_config_update() initialize the new metaslabs. 5906168404Spjd * 5907168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 5908168404Spjd * if we lose power at any point in this sequence, the remaining 5909168404Spjd * steps will be completed the next time we load the pool. 
5910168404Spjd */ 5911168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 5912168404Spjd 5913168404Spjd mutex_enter(&spa_namespace_lock); 5914168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5915331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 5916168404Spjd mutex_exit(&spa_namespace_lock); 5917168404Spjd 5918168404Spjd return (0); 5919168404Spjd} 5920168404Spjd 5921168404Spjd/* 5922168404Spjd * Attach a device to a mirror. The arguments are the path to any device 5923168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 5924168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 5925168404Spjd * 5926168404Spjd * If 'replacing' is specified, the new device is intended to replace the 5927168404Spjd * existing device; in this case the two devices are made into their own 5928185029Spjd * mirror using the 'replacing' vdev, which is functionally identical to 5929168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 5930168404Spjd * extra rules: you can't attach to it after it's been created, and upon 5931168404Spjd * completion of resilvering, the first disk (the one being replaced) 5932168404Spjd * is automatically detached. 
5933168404Spjd */ 5934168404Spjdint 5935168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 5936168404Spjd{ 5937219089Spjd uint64_t txg, dtl_max_txg; 5938168404Spjd vdev_t *rvd = spa->spa_root_vdev; 5939168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 5940168404Spjd vdev_ops_t *pvops; 5941185029Spjd char *oldvdpath, *newvdpath; 5942185029Spjd int newvd_isspare; 5943185029Spjd int error; 5944168404Spjd 5945219089Spjd ASSERT(spa_writeable(spa)); 5946219089Spjd 5947168404Spjd txg = spa_vdev_enter(spa); 5948168404Spjd 5949185029Spjd oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 5950168404Spjd 5951332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5952332547Smav if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 5953332547Smav error = (spa_has_checkpoint(spa)) ? 5954332547Smav ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 5955332547Smav return (spa_vdev_exit(spa, NULL, txg, error)); 5956332547Smav } 5957332547Smav 5958332525Smav if (spa->spa_vdev_removal != NULL || 5959332525Smav spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 5960332525Smav return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5961332525Smav } 5962332525Smav 5963168404Spjd if (oldvd == NULL) 5964168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 5965168404Spjd 5966168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 5967168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5968168404Spjd 5969168404Spjd pvd = oldvd->vdev_parent; 5970168404Spjd 5971168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 5972230514Smm VDEV_ALLOC_ATTACH)) != 0) 5973185029Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5974185029Spjd 5975185029Spjd if (newrootvd->vdev_children != 1) 5976168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 5977168404Spjd 5978168404Spjd newvd = newrootvd->vdev_child[0]; 5979168404Spjd 5980168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 5981168404Spjd return (spa_vdev_exit(spa, 
newrootvd, txg, EINVAL)); 5982168404Spjd 5983168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 5984168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 5985168404Spjd 5986185029Spjd /* 5987185029Spjd * Spares can't replace logs 5988185029Spjd */ 5989185029Spjd if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 5990185029Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 5991185029Spjd 5992168404Spjd if (!replacing) { 5993168404Spjd /* 5994168404Spjd * For attach, the only allowable parent is a mirror or the root 5995168404Spjd * vdev. 5996168404Spjd */ 5997168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 5998168404Spjd pvd->vdev_ops != &vdev_root_ops) 5999168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6000168404Spjd 6001168404Spjd pvops = &vdev_mirror_ops; 6002168404Spjd } else { 6003168404Spjd /* 6004168404Spjd * Active hot spares can only be replaced by inactive hot 6005168404Spjd * spares. 6006168404Spjd */ 6007168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 6008219089Spjd oldvd->vdev_isspare && 6009168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 6010168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6011168404Spjd 6012168404Spjd /* 6013168404Spjd * If the source is a hot spare, and the parent isn't already a 6014168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 6015168404Spjd * want to create a replacing vdev. The user is not allowed to 6016168404Spjd * attach to a spared vdev child unless the 'isspare' state is 6017168404Spjd * the same (spare replaces spare, non-spare replaces 6018168404Spjd * non-spare). 
6019168404Spjd */ 6020219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && 6021219089Spjd spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 6022168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6023219089Spjd } else if (pvd->vdev_ops == &vdev_spare_ops && 6024219089Spjd newvd->vdev_isspare != oldvd->vdev_isspare) { 6025168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6026219089Spjd } 6027219089Spjd 6028219089Spjd if (newvd->vdev_isspare) 6029168404Spjd pvops = &vdev_spare_ops; 6030168404Spjd else 6031168404Spjd pvops = &vdev_replacing_ops; 6032168404Spjd } 6033168404Spjd 6034168404Spjd /* 6035219089Spjd * Make sure the new device is big enough. 6036168404Spjd */ 6037219089Spjd if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 6038168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 6039168404Spjd 6040168404Spjd /* 6041168404Spjd * The new device cannot have a higher alignment requirement 6042168404Spjd * than the top-level vdev. 6043168404Spjd */ 6044168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 6045168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 6046168404Spjd 6047168404Spjd /* 6048168404Spjd * If this is an in-place replacement, update oldvd's path and devid 6049168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 
6050168404Spjd */ 6051168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 6052168404Spjd spa_strfree(oldvd->vdev_path); 6053168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 6054168404Spjd KM_SLEEP); 6055168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 6056168404Spjd newvd->vdev_path, "old"); 6057168404Spjd if (oldvd->vdev_devid != NULL) { 6058168404Spjd spa_strfree(oldvd->vdev_devid); 6059168404Spjd oldvd->vdev_devid = NULL; 6060168404Spjd } 6061168404Spjd } 6062168404Spjd 6063219089Spjd /* mark the device being resilvered */ 6064254112Sdelphij newvd->vdev_resilver_txg = txg; 6065219089Spjd 6066168404Spjd /* 6067168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 6068168404Spjd * mirror/replacing/spare vdev above oldvd. 6069168404Spjd */ 6070168404Spjd if (pvd->vdev_ops != pvops) 6071168404Spjd pvd = vdev_add_parent(oldvd, pvops); 6072168404Spjd 6073168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 6074168404Spjd ASSERT(pvd->vdev_ops == pvops); 6075168404Spjd ASSERT(oldvd->vdev_parent == pvd); 6076168404Spjd 6077168404Spjd /* 6078168404Spjd * Extract the new device from its root and add it to pvd. 6079168404Spjd */ 6080168404Spjd vdev_remove_child(newrootvd, newvd); 6081168404Spjd newvd->vdev_id = pvd->vdev_children; 6082219089Spjd newvd->vdev_crtxg = oldvd->vdev_crtxg; 6083168404Spjd vdev_add_child(pvd, newvd); 6084168404Spjd 6085168404Spjd tvd = newvd->vdev_top; 6086168404Spjd ASSERT(pvd->vdev_top == tvd); 6087168404Spjd ASSERT(tvd->vdev_parent == rvd); 6088168404Spjd 6089168404Spjd vdev_config_dirty(tvd); 6090168404Spjd 6091168404Spjd /* 6092219089Spjd * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 6093219089Spjd * for any dmu_sync-ed blocks. It will propagate upward when 6094219089Spjd * spa_vdev_exit() calls vdev_dtl_reassess(). 
6095168404Spjd */ 6096219089Spjd dtl_max_txg = txg + TXG_CONCURRENT_STATES; 6097168404Spjd 6098219089Spjd vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 6099219089Spjd dtl_max_txg - TXG_INITIAL); 6100168404Spjd 6101209962Smm if (newvd->vdev_isspare) { 6102168404Spjd spa_spare_activate(newvd); 6103331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 6104209962Smm } 6105209962Smm 6106185029Spjd oldvdpath = spa_strdup(oldvd->vdev_path); 6107185029Spjd newvdpath = spa_strdup(newvd->vdev_path); 6108185029Spjd newvd_isspare = newvd->vdev_isspare; 6109168404Spjd 6110168404Spjd /* 6111168404Spjd * Mark newvd's DTL dirty in this txg. 6112168404Spjd */ 6113168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 6114168404Spjd 6115219089Spjd /* 6116258717Savg * Schedule the resilver to restart in the future. We do this to 6117258717Savg * ensure that dmu_sync-ed blocks have been stitched into the 6118258717Savg * respective datasets. 6119219089Spjd */ 6120219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 6121168404Spjd 6122287745Sdelphij if (spa->spa_bootfs) 6123331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 6124287745Sdelphij 6125331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 6126287745Sdelphij 6127219089Spjd /* 6128219089Spjd * Commit the config 6129219089Spjd */ 6130219089Spjd (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 6131185029Spjd 6132248571Smm spa_history_log_internal(spa, "vdev attach", NULL, 6133219089Spjd "%s vdev=%s %s vdev=%s", 6134219089Spjd replacing && newvd_isspare ? "spare in" : 6135219089Spjd replacing ? "replace" : "attach", newvdpath, 6136219089Spjd replacing ? "for" : "to", oldvdpath); 6137219089Spjd 6138185029Spjd spa_strfree(oldvdpath); 6139185029Spjd spa_strfree(newvdpath); 6140185029Spjd 6141168404Spjd return (0); 6142168404Spjd} 6143168404Spjd 6144168404Spjd/* 6145168404Spjd * Detach a device from a mirror or replacing vdev. 
6146251631Sdelphij * 6147168404Spjd * If 'replace_done' is specified, only detach if the parent 6148168404Spjd * is a replacing vdev. 6149168404Spjd */ 6150168404Spjdint 6151209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 6152168404Spjd{ 6153168404Spjd uint64_t txg; 6154209962Smm int error; 6155168404Spjd vdev_t *rvd = spa->spa_root_vdev; 6156168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 6157168404Spjd boolean_t unspare = B_FALSE; 6158247187Smm uint64_t unspare_guid = 0; 6159219089Spjd char *vdpath; 6160168404Spjd 6161219089Spjd ASSERT(spa_writeable(spa)); 6162219089Spjd 6163168404Spjd txg = spa_vdev_enter(spa); 6164168404Spjd 6165185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 6166168404Spjd 6167332547Smav /* 6168332547Smav * Besides being called directly from the userland through the 6169332547Smav * ioctl interface, spa_vdev_detach() can be potentially called 6170332547Smav * at the end of spa_vdev_resilver_done(). 6171332547Smav * 6172332547Smav * In the regular case, when we have a checkpoint this shouldn't 6173332547Smav * happen as we never empty the DTLs of a vdev during the scrub 6174332547Smav * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() 6175332547Smav * should never get here when we have a checkpoint. 6176332547Smav * 6177332547Smav * That said, even in a case when we checkpoint the pool exactly 6178332547Smav * as spa_vdev_resilver_done() calls this function everything 6179332547Smav * should be fine as the resilver will return right away. 6180332547Smav */ 6181332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6182332547Smav if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6183332547Smav error = (spa_has_checkpoint(spa)) ? 
6184332547Smav ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6185332547Smav return (spa_vdev_exit(spa, NULL, txg, error)); 6186332547Smav } 6187332547Smav 6188168404Spjd if (vd == NULL) 6189168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 6190168404Spjd 6191168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 6192168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6193168404Spjd 6194168404Spjd pvd = vd->vdev_parent; 6195168404Spjd 6196168404Spjd /* 6197209962Smm * If the parent/child relationship is not as expected, don't do it. 6198209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 6199209962Smm * vdev that's replacing B with C. The user's intent in replacing 6200209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 6201209962Smm * the replace by detaching C, the expected behavior is to end up 6202209962Smm * M(A,B). But suppose that right after deciding to detach C, 6203209962Smm * the replacement of B completes. We would have M(A,C), and then 6204209962Smm * ask to detach C, which would leave us with just A -- not what 6205209962Smm * the user wanted. To prevent this, we make sure that the 6206209962Smm * parent/child relationship hasn't changed -- in this example, 6207209962Smm * that C's parent is still the replacing vdev R. 6208209962Smm */ 6209209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 6210209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 6211209962Smm 6212209962Smm /* 6213219089Spjd * Only 'replacing' or 'spare' vdevs can be replaced. 6214168404Spjd */ 6215219089Spjd if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 6216219089Spjd pvd->vdev_ops != &vdev_spare_ops) 6217219089Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6218168404Spjd 6219168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 6220185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 6221168404Spjd 6222168404Spjd /* 6223168404Spjd * Only mirror, replacing, and spare vdevs support detach. 
6224168404Spjd */ 6225168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 6226168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 6227168404Spjd pvd->vdev_ops != &vdev_spare_ops) 6228168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6229168404Spjd 6230168404Spjd /* 6231209962Smm * If this device has the only valid copy of some data, 6232209962Smm * we cannot safely detach it. 6233168404Spjd */ 6234209962Smm if (vdev_dtl_required(vd)) 6235168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 6236168404Spjd 6237209962Smm ASSERT(pvd->vdev_children >= 2); 6238168404Spjd 6239168404Spjd /* 6240185029Spjd * If we are detaching the second disk from a replacing vdev, then 6241185029Spjd * check to see if we changed the original vdev's path to have "/old" 6242185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 6243168404Spjd */ 6244219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 6245219089Spjd vd->vdev_path != NULL) { 6246219089Spjd size_t len = strlen(vd->vdev_path); 6247219089Spjd 6248219089Spjd for (int c = 0; c < pvd->vdev_children; c++) { 6249219089Spjd cvd = pvd->vdev_child[c]; 6250219089Spjd 6251219089Spjd if (cvd == vd || cvd->vdev_path == NULL) 6252219089Spjd continue; 6253219089Spjd 6254219089Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 6255219089Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 6256219089Spjd spa_strfree(cvd->vdev_path); 6257219089Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 6258219089Spjd break; 6259219089Spjd } 6260185029Spjd } 6261185029Spjd } 6262168404Spjd 6263168404Spjd /* 6264168404Spjd * If we are detaching the original disk from a spare, then it implies 6265168404Spjd * that the spare should become a real disk, and be removed from the 6266168404Spjd * active spare list for the pool. 
6267168404Spjd */ 6268168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 6269219089Spjd vd->vdev_id == 0 && 6270219089Spjd pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 6271168404Spjd unspare = B_TRUE; 6272168404Spjd 6273168404Spjd /* 6274168404Spjd * Erase the disk labels so the disk can be used for other things. 6275168404Spjd * This must be done after all other error cases are handled, 6276168404Spjd * but before we disembowel vd (so we can still do I/O to it). 6277168404Spjd * But if we can't do it, don't treat the error as fatal -- 6278168404Spjd * it may be that the unwritability of the disk is the reason 6279168404Spjd * it's being detached! 6280168404Spjd */ 6281168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 6282168404Spjd 6283168404Spjd /* 6284168404Spjd * Remove vd from its parent and compact the parent's children. 6285168404Spjd */ 6286168404Spjd vdev_remove_child(pvd, vd); 6287168404Spjd vdev_compact_children(pvd); 6288168404Spjd 6289168404Spjd /* 6290168404Spjd * Remember one of the remaining children so we can get tvd below. 6291168404Spjd */ 6292219089Spjd cvd = pvd->vdev_child[pvd->vdev_children - 1]; 6293168404Spjd 6294168404Spjd /* 6295168404Spjd * If we need to remove the remaining child from the list of hot spares, 6296209962Smm * do it now, marking the vdev as no longer a spare in the process. 6297209962Smm * We must do this before vdev_remove_parent(), because that can 6298209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 6299209962Smm * reason, we must remove the spare now, in the same txg as the detach; 6300209962Smm * otherwise someone could attach a new sibling, change the GUID, and 6301209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 
6302168404Spjd */ 6303168404Spjd if (unspare) { 6304168404Spjd ASSERT(cvd->vdev_isspare); 6305168404Spjd spa_spare_remove(cvd); 6306168404Spjd unspare_guid = cvd->vdev_guid; 6307209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 6308219089Spjd cvd->vdev_unspare = B_TRUE; 6309168404Spjd } 6310168404Spjd 6311168404Spjd /* 6312168404Spjd * If the parent mirror/replacing vdev only has one child, 6313168404Spjd * the parent is no longer needed. Remove it from the tree. 6314168404Spjd */ 6315219089Spjd if (pvd->vdev_children == 1) { 6316219089Spjd if (pvd->vdev_ops == &vdev_spare_ops) 6317219089Spjd cvd->vdev_unspare = B_FALSE; 6318168404Spjd vdev_remove_parent(cvd); 6319219089Spjd } 6320168404Spjd 6321219089Spjd 6322168404Spjd /* 6323168404Spjd * We don't set tvd until now because the parent we just removed 6324168404Spjd * may have been the previous top-level vdev. 6325168404Spjd */ 6326168404Spjd tvd = cvd->vdev_top; 6327168404Spjd ASSERT(tvd->vdev_parent == rvd); 6328168404Spjd 6329168404Spjd /* 6330168404Spjd * Reevaluate the parent vdev state. 6331168404Spjd */ 6332185029Spjd vdev_propagate_state(cvd); 6333168404Spjd 6334168404Spjd /* 6335219089Spjd * If the 'autoexpand' property is set on the pool then automatically 6336219089Spjd * try to expand the size of the pool. For example if the device we 6337219089Spjd * just detached was smaller than the others, it may be possible to 6338219089Spjd * add metaslabs (i.e. grow the pool). We need to reopen the vdev 6339219089Spjd * first so that we can obtain the updated sizes of the leaf vdevs. 6340168404Spjd */ 6341219089Spjd if (spa->spa_autoexpand) { 6342219089Spjd vdev_reopen(tvd); 6343219089Spjd vdev_expand(tvd, txg); 6344219089Spjd } 6345168404Spjd 6346168404Spjd vdev_config_dirty(tvd); 6347168404Spjd 6348168404Spjd /* 6349168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 6350168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 
6351168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 6352168404Spjd * prevent vd from being accessed after it's freed. 6353168404Spjd */ 6354219089Spjd vdpath = spa_strdup(vd->vdev_path); 6355209962Smm for (int t = 0; t < TXG_SIZE; t++) 6356168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 6357168404Spjd vd->vdev_detached = B_TRUE; 6358168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 6359168404Spjd 6360331397Smav spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 6361185029Spjd 6362219089Spjd /* hang on to the spa before we release the lock */ 6363219089Spjd spa_open_ref(spa, FTAG); 6364219089Spjd 6365168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 6366168404Spjd 6367248571Smm spa_history_log_internal(spa, "detach", NULL, 6368219089Spjd "vdev=%s", vdpath); 6369219089Spjd spa_strfree(vdpath); 6370219089Spjd 6371168404Spjd /* 6372168404Spjd * If this was the removal of the original device in a hot spare vdev, 6373168404Spjd * then we want to go through and remove the device from the hot spare 6374168404Spjd * list of every other pool. 
6375168404Spjd */ 6376168404Spjd if (unspare) { 6377219089Spjd spa_t *altspa = NULL; 6378219089Spjd 6379168404Spjd mutex_enter(&spa_namespace_lock); 6380219089Spjd while ((altspa = spa_next(altspa)) != NULL) { 6381219089Spjd if (altspa->spa_state != POOL_STATE_ACTIVE || 6382219089Spjd altspa == spa) 6383168404Spjd continue; 6384219089Spjd 6385219089Spjd spa_open_ref(altspa, FTAG); 6386185029Spjd mutex_exit(&spa_namespace_lock); 6387219089Spjd (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 6388185029Spjd mutex_enter(&spa_namespace_lock); 6389219089Spjd spa_close(altspa, FTAG); 6390168404Spjd } 6391168404Spjd mutex_exit(&spa_namespace_lock); 6392219089Spjd 6393219089Spjd /* search the rest of the vdevs for spares to remove */ 6394219089Spjd spa_vdev_resilver_done(spa); 6395168404Spjd } 6396168404Spjd 6397219089Spjd /* all done with the spa; OK to release */ 6398219089Spjd mutex_enter(&spa_namespace_lock); 6399219089Spjd spa_close(spa, FTAG); 6400219089Spjd mutex_exit(&spa_namespace_lock); 6401219089Spjd 6402168404Spjd return (error); 6403168404Spjd} 6404168404Spjd 6405219089Spjd/* 6406219089Spjd * Split a set of devices from their mirrors, and create a new pool from them. 
6407219089Spjd */ 6408219089Spjdint 6409219089Spjdspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 6410219089Spjd nvlist_t *props, boolean_t exp) 6411219089Spjd{ 6412219089Spjd int error = 0; 6413219089Spjd uint64_t txg, *glist; 6414219089Spjd spa_t *newspa; 6415219089Spjd uint_t c, children, lastlog; 6416219089Spjd nvlist_t **child, *nvl, *tmp; 6417219089Spjd dmu_tx_t *tx; 6418219089Spjd char *altroot = NULL; 6419219089Spjd vdev_t *rvd, **vml = NULL; /* vdev modify list */ 6420219089Spjd boolean_t activate_slog; 6421219089Spjd 6422219089Spjd ASSERT(spa_writeable(spa)); 6423219089Spjd 6424219089Spjd txg = spa_vdev_enter(spa); 6425219089Spjd 6426332547Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6427332547Smav if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6428332547Smav error = (spa_has_checkpoint(spa)) ? 6429332547Smav ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6430332547Smav return (spa_vdev_exit(spa, NULL, txg, error)); 6431332547Smav } 6432332547Smav 6433219089Spjd /* clear the log and flush everything up to now */ 6434219089Spjd activate_slog = spa_passivate_log(spa); 6435219089Spjd (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 6436332525Smav error = spa_reset_logs(spa); 6437219089Spjd txg = spa_vdev_config_enter(spa); 6438219089Spjd 6439219089Spjd if (activate_slog) 6440219089Spjd spa_activate_log(spa); 6441219089Spjd 6442219089Spjd if (error != 0) 6443219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 6444219089Spjd 6445219089Spjd /* check new spa name before going any further */ 6446219089Spjd if (spa_lookup(newname) != NULL) 6447219089Spjd return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 6448219089Spjd 6449219089Spjd /* 6450219089Spjd * scan through all the children to ensure they're all mirrors 6451219089Spjd */ 6452219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 6453219089Spjd nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 6454219089Spjd 
&children) != 0) 6455219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6456219089Spjd 6457219089Spjd /* first, check to ensure we've got the right child count */ 6458219089Spjd rvd = spa->spa_root_vdev; 6459219089Spjd lastlog = 0; 6460219089Spjd for (c = 0; c < rvd->vdev_children; c++) { 6461219089Spjd vdev_t *vd = rvd->vdev_child[c]; 6462219089Spjd 6463219089Spjd /* don't count the holes & logs as children */ 6464332525Smav if (vd->vdev_islog || !vdev_is_concrete(vd)) { 6465219089Spjd if (lastlog == 0) 6466219089Spjd lastlog = c; 6467219089Spjd continue; 6468219089Spjd } 6469219089Spjd 6470219089Spjd lastlog = 0; 6471219089Spjd } 6472219089Spjd if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 6473219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6474219089Spjd 6475219089Spjd /* next, ensure no spare or cache devices are part of the split */ 6476219089Spjd if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 6477219089Spjd nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 6478219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6479219089Spjd 6480219089Spjd vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 6481219089Spjd glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 6482219089Spjd 6483219089Spjd /* then, loop over each vdev and validate it */ 6484219089Spjd for (c = 0; c < children; c++) { 6485219089Spjd uint64_t is_hole = 0; 6486219089Spjd 6487219089Spjd (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 6488219089Spjd &is_hole); 6489219089Spjd 6490219089Spjd if (is_hole != 0) { 6491219089Spjd if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 6492219089Spjd spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 6493219089Spjd continue; 6494219089Spjd } else { 6495249195Smm error = SET_ERROR(EINVAL); 6496219089Spjd break; 6497219089Spjd } 6498219089Spjd } 6499219089Spjd 6500219089Spjd /* which disk is going to be split? 
*/ 6501219089Spjd if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 6502219089Spjd &glist[c]) != 0) { 6503249195Smm error = SET_ERROR(EINVAL); 6504219089Spjd break; 6505219089Spjd } 6506219089Spjd 6507219089Spjd /* look it up in the spa */ 6508219089Spjd vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 6509219089Spjd if (vml[c] == NULL) { 6510249195Smm error = SET_ERROR(ENODEV); 6511219089Spjd break; 6512219089Spjd } 6513219089Spjd 6514219089Spjd /* make sure there's nothing stopping the split */ 6515219089Spjd if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 6516219089Spjd vml[c]->vdev_islog || 6517332525Smav !vdev_is_concrete(vml[c]) || 6518219089Spjd vml[c]->vdev_isspare || 6519219089Spjd vml[c]->vdev_isl2cache || 6520219089Spjd !vdev_writeable(vml[c]) || 6521219089Spjd vml[c]->vdev_children != 0 || 6522219089Spjd vml[c]->vdev_state != VDEV_STATE_HEALTHY || 6523219089Spjd c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 6524249195Smm error = SET_ERROR(EINVAL); 6525219089Spjd break; 6526219089Spjd } 6527219089Spjd 6528219089Spjd if (vdev_dtl_required(vml[c])) { 6529249195Smm error = SET_ERROR(EBUSY); 6530219089Spjd break; 6531219089Spjd } 6532219089Spjd 6533219089Spjd /* we need certain info from the top level */ 6534219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 6535219089Spjd vml[c]->vdev_top->vdev_ms_array) == 0); 6536219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 6537219089Spjd vml[c]->vdev_top->vdev_ms_shift) == 0); 6538219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 6539219089Spjd vml[c]->vdev_top->vdev_asize) == 0); 6540219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 6541219089Spjd vml[c]->vdev_top->vdev_ashift) == 0); 6542299441Smav 6543299441Smav /* transfer per-vdev ZAPs */ 6544299441Smav ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 6545299441Smav VERIFY0(nvlist_add_uint64(child[c], 6546299441Smav ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 
6547299441Smav 6548299441Smav ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 6549299441Smav VERIFY0(nvlist_add_uint64(child[c], 6550299441Smav ZPOOL_CONFIG_VDEV_TOP_ZAP, 6551299441Smav vml[c]->vdev_parent->vdev_top_zap)); 6552219089Spjd } 6553219089Spjd 6554219089Spjd if (error != 0) { 6555219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 6556219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 6557219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 6558219089Spjd } 6559219089Spjd 6560219089Spjd /* stop writers from using the disks */ 6561219089Spjd for (c = 0; c < children; c++) { 6562219089Spjd if (vml[c] != NULL) 6563219089Spjd vml[c]->vdev_offline = B_TRUE; 6564219089Spjd } 6565219089Spjd vdev_reopen(spa->spa_root_vdev); 6566219089Spjd 6567219089Spjd /* 6568219089Spjd * Temporarily record the splitting vdevs in the spa config. This 6569219089Spjd * will disappear once the config is regenerated. 6570219089Spjd */ 6571219089Spjd VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6572219089Spjd VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 6573219089Spjd glist, children) == 0); 6574219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 6575219089Spjd 6576219089Spjd mutex_enter(&spa->spa_props_lock); 6577219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 6578219089Spjd nvl) == 0); 6579219089Spjd mutex_exit(&spa->spa_props_lock); 6580219089Spjd spa->spa_config_splitting = nvl; 6581219089Spjd vdev_config_dirty(spa->spa_root_vdev); 6582219089Spjd 6583219089Spjd /* configure and create the new pool */ 6584219089Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 6585219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 6586219089Spjd exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 6587219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6588219089Spjd spa_version(spa)) == 0); 6589219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 6590219089Spjd spa->spa_config_txg) == 0); 6591219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 6592219089Spjd spa_generate_guid(NULL)) == 0); 6593299441Smav VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 6594219089Spjd (void) nvlist_lookup_string(props, 6595219089Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6596219089Spjd 6597219089Spjd /* add the new pool to the namespace */ 6598219089Spjd newspa = spa_add(newname, config, altroot); 6599299441Smav newspa->spa_avz_action = AVZ_ACTION_REBUILD; 6600219089Spjd newspa->spa_config_txg = spa->spa_config_txg; 6601219089Spjd spa_set_log_state(newspa, SPA_LOG_CLEAR); 6602219089Spjd 6603219089Spjd /* release the spa config lock, retaining the namespace lock */ 6604219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 6605219089Spjd 6606219089Spjd if (zio_injection_enabled) 6607219089Spjd zio_handle_panic_injection(spa, FTAG, 1); 6608219089Spjd 6609219089Spjd spa_activate(newspa, spa_mode_global); 6610219089Spjd spa_async_suspend(newspa); 6611219089Spjd 6612277300Ssmh#ifndef illumos 6613219089Spjd /* mark that we are creating new spa by splitting */ 6614219089Spjd newspa->spa_splitting_newspa = B_TRUE; 6615219089Spjd#endif 6616332536Smav newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 6617332536Smav 6618219089Spjd /* create the new pool from the disks of the original pool */ 6619332536Smav error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 6620277300Ssmh#ifndef illumos 6621219089Spjd newspa->spa_splitting_newspa = B_FALSE; 6622219089Spjd#endif 6623219089Spjd if (error) 6624219089Spjd goto out; 6625219089Spjd 6626219089Spjd /* if that worked, generate a real config for the new pool */ 6627219089Spjd if (newspa->spa_root_vdev != 
NULL) { 6628219089Spjd VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 6629219089Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 6630219089Spjd VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 6631219089Spjd ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 6632219089Spjd spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 6633219089Spjd B_TRUE)); 6634219089Spjd } 6635219089Spjd 6636219089Spjd /* set the props */ 6637219089Spjd if (props != NULL) { 6638219089Spjd spa_configfile_set(newspa, props, B_FALSE); 6639219089Spjd error = spa_prop_set(newspa, props); 6640219089Spjd if (error) 6641219089Spjd goto out; 6642219089Spjd } 6643219089Spjd 6644219089Spjd /* flush everything */ 6645219089Spjd txg = spa_vdev_config_enter(newspa); 6646219089Spjd vdev_config_dirty(newspa->spa_root_vdev); 6647219089Spjd (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 6648219089Spjd 6649219089Spjd if (zio_injection_enabled) 6650219089Spjd zio_handle_panic_injection(spa, FTAG, 2); 6651219089Spjd 6652219089Spjd spa_async_resume(newspa); 6653219089Spjd 6654219089Spjd /* finally, update the original pool's config */ 6655219089Spjd txg = spa_vdev_config_enter(spa); 6656219089Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 6657219089Spjd error = dmu_tx_assign(tx, TXG_WAIT); 6658219089Spjd if (error != 0) 6659219089Spjd dmu_tx_abort(tx); 6660219089Spjd for (c = 0; c < children; c++) { 6661219089Spjd if (vml[c] != NULL) { 6662219089Spjd vdev_split(vml[c]); 6663219089Spjd if (error == 0) 6664248571Smm spa_history_log_internal(spa, "detach", tx, 6665248571Smm "vdev=%s", vml[c]->vdev_path); 6666299441Smav 6667219089Spjd vdev_free(vml[c]); 6668219089Spjd } 6669219089Spjd } 6670299441Smav spa->spa_avz_action = AVZ_ACTION_REBUILD; 6671219089Spjd vdev_config_dirty(spa->spa_root_vdev); 6672219089Spjd spa->spa_config_splitting = NULL; 6673219089Spjd nvlist_free(nvl); 6674219089Spjd if (error == 0) 6675219089Spjd dmu_tx_commit(tx); 6676219089Spjd (void) spa_vdev_exit(spa, NULL, 
txg, 0); 6677219089Spjd 6678219089Spjd if (zio_injection_enabled) 6679219089Spjd zio_handle_panic_injection(spa, FTAG, 3); 6680219089Spjd 6681219089Spjd /* split is complete; log a history record */ 6682248571Smm spa_history_log_internal(newspa, "split", NULL, 6683248571Smm "from pool %s", spa_name(spa)); 6684219089Spjd 6685219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 6686219089Spjd 6687219089Spjd /* if we're not going to mount the filesystems in userland, export */ 6688219089Spjd if (exp) 6689219089Spjd error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 6690219089Spjd B_FALSE, B_FALSE); 6691219089Spjd 6692219089Spjd return (error); 6693219089Spjd 6694219089Spjdout: 6695219089Spjd spa_unload(newspa); 6696219089Spjd spa_deactivate(newspa); 6697219089Spjd spa_remove(newspa); 6698219089Spjd 6699219089Spjd txg = spa_vdev_config_enter(spa); 6700219089Spjd 6701219089Spjd /* re-online all offlined disks */ 6702219089Spjd for (c = 0; c < children; c++) { 6703219089Spjd if (vml[c] != NULL) 6704219089Spjd vml[c]->vdev_offline = B_FALSE; 6705219089Spjd } 6706219089Spjd vdev_reopen(spa->spa_root_vdev); 6707219089Spjd 6708219089Spjd nvlist_free(spa->spa_config_splitting); 6709219089Spjd spa->spa_config_splitting = NULL; 6710219089Spjd (void) spa_vdev_exit(spa, NULL, txg, error); 6711219089Spjd 6712219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 6713219089Spjd return (error); 6714219089Spjd} 6715219089Spjd 6716168404Spjd/* 6717185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 6718251631Sdelphij * currently spared, so we can detach it. 
6719168404Spjd */ 6720168404Spjdstatic vdev_t * 6721185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 6722168404Spjd{ 6723168404Spjd vdev_t *newvd, *oldvd; 6724168404Spjd 6725219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 6726185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 6727168404Spjd if (oldvd != NULL) 6728168404Spjd return (oldvd); 6729168404Spjd } 6730168404Spjd 6731185029Spjd /* 6732219089Spjd * Check for a completed replacement. We always consider the first 6733219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 6734219089Spjd * the newest (see spa_vdev_attach() for how that works). In 6735219089Spjd * the case where the newest vdev is faulted, we will not automatically 6736219089Spjd * remove it after a resilver completes. This is OK as it will require 6737219089Spjd * user intervention to determine which disk the admin wishes to keep. 6738185029Spjd */ 6739219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 6740219089Spjd ASSERT(vd->vdev_children > 1); 6741219089Spjd 6742219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 6743168404Spjd oldvd = vd->vdev_child[0]; 6744168404Spjd 6745209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 6746219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 6747209962Smm !vdev_dtl_required(oldvd)) 6748168404Spjd return (oldvd); 6749168404Spjd } 6750168404Spjd 6751185029Spjd /* 6752185029Spjd * Check for a completed resilver with the 'unspare' flag set. 
6753185029Spjd */ 6754219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 6755219089Spjd vdev_t *first = vd->vdev_child[0]; 6756219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 6757185029Spjd 6758219089Spjd if (last->vdev_unspare) { 6759219089Spjd oldvd = first; 6760219089Spjd newvd = last; 6761219089Spjd } else if (first->vdev_unspare) { 6762219089Spjd oldvd = last; 6763219089Spjd newvd = first; 6764219089Spjd } else { 6765219089Spjd oldvd = NULL; 6766219089Spjd } 6767219089Spjd 6768219089Spjd if (oldvd != NULL && 6769209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 6770219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 6771219089Spjd !vdev_dtl_required(oldvd)) 6772185029Spjd return (oldvd); 6773219089Spjd 6774219089Spjd /* 6775219089Spjd * If there are more than two spares attached to a disk, 6776219089Spjd * and those spares are not required, then we want to 6777219089Spjd * attempt to free them up now so that they can be used 6778219089Spjd * by other pools. Once we're back down to a single 6779219089Spjd * disk+spare, we stop removing them. 
6780219089Spjd */ 6781219089Spjd if (vd->vdev_children > 2) { 6782219089Spjd newvd = vd->vdev_child[1]; 6783219089Spjd 6784219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 6785219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 6786219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 6787219089Spjd !vdev_dtl_required(newvd)) 6788219089Spjd return (newvd); 6789185029Spjd } 6790185029Spjd } 6791185029Spjd 6792168404Spjd return (NULL); 6793168404Spjd} 6794168404Spjd 6795168404Spjdstatic void 6796185029Spjdspa_vdev_resilver_done(spa_t *spa) 6797168404Spjd{ 6798209962Smm vdev_t *vd, *pvd, *ppvd; 6799209962Smm uint64_t guid, sguid, pguid, ppguid; 6800168404Spjd 6801209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6802168404Spjd 6803185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 6804209962Smm pvd = vd->vdev_parent; 6805209962Smm ppvd = pvd->vdev_parent; 6806168404Spjd guid = vd->vdev_guid; 6807209962Smm pguid = pvd->vdev_guid; 6808209962Smm ppguid = ppvd->vdev_guid; 6809209962Smm sguid = 0; 6810168404Spjd /* 6811168404Spjd * If we have just finished replacing a hot spared device, then 6812168404Spjd * we need to detach the parent's first child (the original hot 6813168404Spjd * spare) as well. 
6814168404Spjd */ 6815219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 6816219089Spjd ppvd->vdev_children == 2) { 6817168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 6818209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 6819168404Spjd } 6820254112Sdelphij ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 6821254112Sdelphij 6822209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 6823209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 6824168404Spjd return; 6825209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 6826168404Spjd return; 6827209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6828168404Spjd } 6829168404Spjd 6830209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 6831168404Spjd} 6832168404Spjd 6833168404Spjd/* 6834219089Spjd * Update the stored path or FRU for this vdev. 6835168404Spjd */ 6836168404Spjdint 6837209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 6838209962Smm boolean_t ispath) 6839168404Spjd{ 6840185029Spjd vdev_t *vd; 6841219089Spjd boolean_t sync = B_FALSE; 6842168404Spjd 6843219089Spjd ASSERT(spa_writeable(spa)); 6844168404Spjd 6845219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 6846219089Spjd 6847209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 6848219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 6849168404Spjd 6850168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 6851219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 6852168404Spjd 6853209962Smm if (ispath) { 6854219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 6855219089Spjd spa_strfree(vd->vdev_path); 6856219089Spjd vd->vdev_path = spa_strdup(value); 6857219089Spjd sync = B_TRUE; 6858219089Spjd } 6859209962Smm } else { 6860219089Spjd if (vd->vdev_fru == NULL) { 6861219089Spjd vd->vdev_fru = spa_strdup(value); 6862219089Spjd sync = B_TRUE; 6863219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 6864209962Smm spa_strfree(vd->vdev_fru); 
6865219089Spjd vd->vdev_fru = spa_strdup(value); 6866219089Spjd sync = B_TRUE; 6867219089Spjd } 6868209962Smm } 6869168404Spjd 6870219089Spjd return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 6871168404Spjd} 6872168404Spjd 6873209962Smmint 6874209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 6875209962Smm{ 6876209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 6877209962Smm} 6878209962Smm 6879209962Smmint 6880209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 6881209962Smm{ 6882209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 6883209962Smm} 6884209962Smm 6885168404Spjd/* 6886168404Spjd * ========================================================================== 6887219089Spjd * SPA Scanning 6888168404Spjd * ========================================================================== 6889168404Spjd */ 6890324010Savgint 6891324010Savgspa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 6892324010Savg{ 6893324010Savg ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6894168404Spjd 6895324010Savg if (dsl_scan_resilvering(spa->spa_dsl_pool)) 6896324010Savg return (SET_ERROR(EBUSY)); 6897324010Savg 6898324010Savg return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 6899324010Savg} 6900324010Savg 6901168404Spjdint 6902219089Spjdspa_scan_stop(spa_t *spa) 6903168404Spjd{ 6904185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6905219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 6906249195Smm return (SET_ERROR(EBUSY)); 6907219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 6908219089Spjd} 6909168404Spjd 6910219089Spjdint 6911219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 6912219089Spjd{ 6913219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6914219089Spjd 6915219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 6916249195Smm return (SET_ERROR(ENOTSUP)); 6917168404Spjd 6918168404Spjd /* 6919185029Spjd * If a 
resilver was requested, but there is no DTL on a 6920185029Spjd * writeable leaf device, we have nothing to do. 6921168404Spjd */ 6922219089Spjd if (func == POOL_SCAN_RESILVER && 6923185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 6924185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 6925168404Spjd return (0); 6926168404Spjd } 6927168404Spjd 6928219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 6929168404Spjd} 6930168404Spjd 6931168404Spjd/* 6932168404Spjd * ========================================================================== 6933168404Spjd * SPA async task processing 6934168404Spjd * ========================================================================== 6935168404Spjd */ 6936168404Spjd 6937168404Spjdstatic void 6938185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 6939168404Spjd{ 6940185029Spjd if (vd->vdev_remove_wanted) { 6941219089Spjd vd->vdev_remove_wanted = B_FALSE; 6942219089Spjd vd->vdev_delayed_close = B_FALSE; 6943185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 6944209962Smm 6945209962Smm /* 6946209962Smm * We want to clear the stats, but we don't want to do a full 6947209962Smm * vdev_clear() as that will cause us to throw away 6948209962Smm * degraded/faulted state as well as attempt to reopen the 6949209962Smm * device, all of which is a waste. 6950209962Smm */ 6951209962Smm vd->vdev_stat.vs_read_errors = 0; 6952209962Smm vd->vdev_stat.vs_write_errors = 0; 6953209962Smm vd->vdev_stat.vs_checksum_errors = 0; 6954209962Smm 6955185029Spjd vdev_state_dirty(vd->vdev_top); 6956294027Sasomers /* Tell userspace that the vdev is gone. 
*/ 6957294027Sasomers zfs_post_remove(spa, vd); 6958185029Spjd } 6959168404Spjd 6960185029Spjd for (int c = 0; c < vd->vdev_children; c++) 6961185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 6962185029Spjd} 6963168404Spjd 6964185029Spjdstatic void 6965185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 6966185029Spjd{ 6967185029Spjd if (vd->vdev_probe_wanted) { 6968219089Spjd vd->vdev_probe_wanted = B_FALSE; 6969185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 6970168404Spjd } 6971168404Spjd 6972185029Spjd for (int c = 0; c < vd->vdev_children; c++) 6973185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 6974168404Spjd} 6975168404Spjd 6976168404Spjdstatic void 6977219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 6978219089Spjd{ 6979219089Spjd sysevent_id_t eid; 6980219089Spjd nvlist_t *attr; 6981219089Spjd char *physpath; 6982219089Spjd 6983219089Spjd if (!spa->spa_autoexpand) 6984219089Spjd return; 6985219089Spjd 6986219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 6987219089Spjd vdev_t *cvd = vd->vdev_child[c]; 6988219089Spjd spa_async_autoexpand(spa, cvd); 6989219089Spjd } 6990219089Spjd 6991219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 6992219089Spjd return; 6993219089Spjd 6994219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 6995219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 6996219089Spjd 6997219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6998219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 6999219089Spjd 7000219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 7001219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 7002219089Spjd 7003219089Spjd nvlist_free(attr); 7004219089Spjd kmem_free(physpath, MAXPATHLEN); 7005219089Spjd} 7006219089Spjd 7007219089Spjdstatic void 7008168404Spjdspa_async_thread(void *arg) 7009168404Spjd{ 7010331399Smav spa_t *spa = (spa_t *)arg; 7011168404Spjd 
int tasks; 7012168404Spjd 7013168404Spjd ASSERT(spa->spa_sync_on); 7014168404Spjd 7015168404Spjd mutex_enter(&spa->spa_async_lock); 7016168404Spjd tasks = spa->spa_async_tasks; 7017253990Smav spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 7018168404Spjd mutex_exit(&spa->spa_async_lock); 7019168404Spjd 7020168404Spjd /* 7021168404Spjd * See if the config needs to be updated. 7022168404Spjd */ 7023168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 7024219089Spjd uint64_t old_space, new_space; 7025219089Spjd 7026168404Spjd mutex_enter(&spa_namespace_lock); 7027219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 7028168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 7029219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 7030168404Spjd mutex_exit(&spa_namespace_lock); 7031219089Spjd 7032219089Spjd /* 7033219089Spjd * If the pool grew as a result of the config update, 7034219089Spjd * then log an internal history event. 7035219089Spjd */ 7036219089Spjd if (new_space != old_space) { 7037248571Smm spa_history_log_internal(spa, "vdev online", NULL, 7038219089Spjd "pool '%s' size: %llu(+%llu)", 7039219089Spjd spa_name(spa), new_space, new_space - old_space); 7040219089Spjd } 7041168404Spjd } 7042168404Spjd 7043219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 7044219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 7045219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 7046219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 7047219089Spjd } 7048219089Spjd 7049168404Spjd /* 7050185029Spjd * See if any devices need to be probed. 7051168404Spjd */ 7052185029Spjd if (tasks & SPA_ASYNC_PROBE) { 7053219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 7054185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 7055185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 7056185029Spjd } 7057168404Spjd 7058168404Spjd /* 7059185029Spjd * If any devices are done replacing, detach them. 
7060168404Spjd */ 7061185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 7062185029Spjd spa_vdev_resilver_done(spa); 7063168404Spjd 7064168404Spjd /* 7065168404Spjd * Kick off a resilver. 7066168404Spjd */ 7067168404Spjd if (tasks & SPA_ASYNC_RESILVER) 7068219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 7069168404Spjd 7070168404Spjd /* 7071168404Spjd * Let the world know that we're done. 7072168404Spjd */ 7073168404Spjd mutex_enter(&spa->spa_async_lock); 7074168404Spjd spa->spa_async_thread = NULL; 7075168404Spjd cv_broadcast(&spa->spa_async_cv); 7076168404Spjd mutex_exit(&spa->spa_async_lock); 7077168404Spjd thread_exit(); 7078168404Spjd} 7079168404Spjd 7080253990Smavstatic void 7081253990Smavspa_async_thread_vd(void *arg) 7082253990Smav{ 7083253990Smav spa_t *spa = arg; 7084253990Smav int tasks; 7085253990Smav 7086253990Smav mutex_enter(&spa->spa_async_lock); 7087253990Smav tasks = spa->spa_async_tasks; 7088253990Smavretry: 7089253990Smav spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 7090253990Smav mutex_exit(&spa->spa_async_lock); 7091253990Smav 7092253990Smav /* 7093253990Smav * See if any devices need to be marked REMOVED. 7094253990Smav */ 7095253990Smav if (tasks & SPA_ASYNC_REMOVE) { 7096253990Smav spa_vdev_state_enter(spa, SCL_NONE); 7097253990Smav spa_async_remove(spa, spa->spa_root_vdev); 7098253990Smav for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 7099253990Smav spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 7100253990Smav for (int i = 0; i < spa->spa_spares.sav_count; i++) 7101253990Smav spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 7102253990Smav (void) spa_vdev_state_exit(spa, NULL, 0); 7103253990Smav } 7104253990Smav 7105253990Smav /* 7106253990Smav * Let the world know that we're done. 
7107253990Smav */ 7108253990Smav mutex_enter(&spa->spa_async_lock); 7109253990Smav tasks = spa->spa_async_tasks; 7110253990Smav if ((tasks & SPA_ASYNC_REMOVE) != 0) 7111253990Smav goto retry; 7112253990Smav spa->spa_async_thread_vd = NULL; 7113253990Smav cv_broadcast(&spa->spa_async_cv); 7114253990Smav mutex_exit(&spa->spa_async_lock); 7115253990Smav thread_exit(); 7116253990Smav} 7117253990Smav 7118168404Spjdvoid 7119168404Spjdspa_async_suspend(spa_t *spa) 7120168404Spjd{ 7121168404Spjd mutex_enter(&spa->spa_async_lock); 7122168404Spjd spa->spa_async_suspended++; 7123332525Smav while (spa->spa_async_thread != NULL || 7124332537Smav spa->spa_async_thread_vd != NULL) 7125168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 7126168404Spjd mutex_exit(&spa->spa_async_lock); 7127332525Smav 7128332525Smav spa_vdev_remove_suspend(spa); 7129332537Smav 7130332537Smav zthr_t *condense_thread = spa->spa_condense_zthr; 7131332537Smav if (condense_thread != NULL && zthr_isrunning(condense_thread)) 7132332537Smav VERIFY0(zthr_cancel(condense_thread)); 7133332547Smav 7134332547Smav zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 7135332547Smav if (discard_thread != NULL && zthr_isrunning(discard_thread)) 7136332547Smav VERIFY0(zthr_cancel(discard_thread)); 7137168404Spjd} 7138168404Spjd 7139168404Spjdvoid 7140168404Spjdspa_async_resume(spa_t *spa) 7141168404Spjd{ 7142168404Spjd mutex_enter(&spa->spa_async_lock); 7143168404Spjd ASSERT(spa->spa_async_suspended != 0); 7144168404Spjd spa->spa_async_suspended--; 7145168404Spjd mutex_exit(&spa->spa_async_lock); 7146332525Smav spa_restart_removal(spa); 7147332537Smav 7148332537Smav zthr_t *condense_thread = spa->spa_condense_zthr; 7149332537Smav if (condense_thread != NULL && !zthr_isrunning(condense_thread)) 7150332537Smav zthr_resume(condense_thread); 7151332547Smav 7152332547Smav zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 7153332547Smav if (discard_thread != NULL && !zthr_isrunning(discard_thread)) 
7154332547Smav zthr_resume(discard_thread); 7155168404Spjd} 7156168404Spjd 7157251636Sdelphijstatic boolean_t 7158251636Sdelphijspa_async_tasks_pending(spa_t *spa) 7159251636Sdelphij{ 7160251636Sdelphij uint_t non_config_tasks; 7161251636Sdelphij uint_t config_task; 7162251636Sdelphij boolean_t config_task_suspended; 7163251636Sdelphij 7164253990Smav non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 7165253990Smav SPA_ASYNC_REMOVE); 7166251636Sdelphij config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 7167251636Sdelphij if (spa->spa_ccw_fail_time == 0) { 7168251636Sdelphij config_task_suspended = B_FALSE; 7169251636Sdelphij } else { 7170251636Sdelphij config_task_suspended = 7171251636Sdelphij (gethrtime() - spa->spa_ccw_fail_time) < 7172251636Sdelphij (zfs_ccw_retry_interval * NANOSEC); 7173251636Sdelphij } 7174251636Sdelphij 7175251636Sdelphij return (non_config_tasks || (config_task && !config_task_suspended)); 7176251636Sdelphij} 7177251636Sdelphij 7178168404Spjdstatic void 7179168404Spjdspa_async_dispatch(spa_t *spa) 7180168404Spjd{ 7181168404Spjd mutex_enter(&spa->spa_async_lock); 7182251636Sdelphij if (spa_async_tasks_pending(spa) && 7183251636Sdelphij !spa->spa_async_suspended && 7184168404Spjd spa->spa_async_thread == NULL && 7185251636Sdelphij rootdir != NULL) 7186168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 7187168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 7188168404Spjd mutex_exit(&spa->spa_async_lock); 7189168404Spjd} 7190168404Spjd 7191253990Smavstatic void 7192253990Smavspa_async_dispatch_vd(spa_t *spa) 7193253990Smav{ 7194253990Smav mutex_enter(&spa->spa_async_lock); 7195253990Smav if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 7196253990Smav !spa->spa_async_suspended && 7197253990Smav spa->spa_async_thread_vd == NULL && 7198253990Smav rootdir != NULL) 7199253990Smav spa->spa_async_thread_vd = thread_create(NULL, 0, 7200253990Smav spa_async_thread_vd, spa, 0, &p0, TS_RUN, 
maxclsyspri); 7201253990Smav mutex_exit(&spa->spa_async_lock); 7202253990Smav} 7203253990Smav 7204168404Spjdvoid 7205168404Spjdspa_async_request(spa_t *spa, int task) 7206168404Spjd{ 7207219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 7208168404Spjd mutex_enter(&spa->spa_async_lock); 7209168404Spjd spa->spa_async_tasks |= task; 7210168404Spjd mutex_exit(&spa->spa_async_lock); 7211253990Smav spa_async_dispatch_vd(spa); 7212168404Spjd} 7213168404Spjd 7214168404Spjd/* 7215168404Spjd * ========================================================================== 7216168404Spjd * SPA syncing routines 7217168404Spjd * ========================================================================== 7218168404Spjd */ 7219168404Spjd 7220219089Spjdstatic int 7221219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 7222168404Spjd{ 7223219089Spjd bpobj_t *bpo = arg; 7224219089Spjd bpobj_enqueue(bpo, bp, tx); 7225219089Spjd return (0); 7226219089Spjd} 7227168404Spjd 7228219089Spjdstatic int 7229219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 7230219089Spjd{ 7231219089Spjd zio_t *zio = arg; 7232168404Spjd 7233219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 7234240868Spjd BP_GET_PSIZE(bp), zio->io_flags)); 7235219089Spjd return (0); 7236168404Spjd} 7237168404Spjd 7238258632Savg/* 7239258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 7240258632Savg * amount of time spent syncing frees. 
7241258632Savg */ 7242168404Spjdstatic void 7243258632Savgspa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 7244258632Savg{ 7245258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 7246258632Savg bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 7247258632Savg VERIFY(zio_wait(zio) == 0); 7248258632Savg} 7249258632Savg 7250258632Savg/* 7251258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 7252258632Savg * amount of time spent syncing deferred frees. 7253258632Savg */ 7254258632Savgstatic void 7255258632Savgspa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 7256258632Savg{ 7257258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 7258258632Savg VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 7259258632Savg spa_free_sync_cb, zio, tx), ==, 0); 7260258632Savg VERIFY0(zio_wait(zio)); 7261258632Savg} 7262258632Savg 7263258632Savg 7264258632Savgstatic void 7265168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 7266168404Spjd{ 7267168404Spjd char *packed = NULL; 7268185029Spjd size_t bufsize; 7269168404Spjd size_t nvsize = 0; 7270168404Spjd dmu_buf_t *db; 7271168404Spjd 7272168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 7273168404Spjd 7274185029Spjd /* 7275185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 7276260150Sdelphij * information. This avoids the dmu_buf_will_dirty() path and 7277185029Spjd * saves us a pre-read to get data we don't actually care about. 
7278185029Spjd */ 7279236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 7280185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 7281168404Spjd 7282168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 7283168404Spjd KM_SLEEP) == 0); 7284185029Spjd bzero(packed + nvsize, bufsize - nvsize); 7285168404Spjd 7286185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 7287168404Spjd 7288185029Spjd kmem_free(packed, bufsize); 7289168404Spjd 7290168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 7291168404Spjd dmu_buf_will_dirty(db, tx); 7292168404Spjd *(uint64_t *)db->db_data = nvsize; 7293168404Spjd dmu_buf_rele(db, FTAG); 7294168404Spjd} 7295168404Spjd 7296168404Spjdstatic void 7297185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 7298185029Spjd const char *config, const char *entry) 7299168404Spjd{ 7300168404Spjd nvlist_t *nvroot; 7301185029Spjd nvlist_t **list; 7302168404Spjd int i; 7303168404Spjd 7304185029Spjd if (!sav->sav_sync) 7305168404Spjd return; 7306168404Spjd 7307168404Spjd /* 7308185029Spjd * Update the MOS nvlist describing the list of available devices. 7309185029Spjd * spa_validate_aux() will have already made sure this nvlist is 7310185029Spjd * valid and the vdevs are labeled appropriately. 
7311168404Spjd */ 7312185029Spjd if (sav->sav_object == 0) { 7313185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 7314185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 7315185029Spjd sizeof (uint64_t), tx); 7316168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 7317185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 7318185029Spjd &sav->sav_object, tx) == 0); 7319168404Spjd } 7320168404Spjd 7321168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 7322185029Spjd if (sav->sav_count == 0) { 7323185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 7324168404Spjd } else { 7325185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 7326185029Spjd for (i = 0; i < sav->sav_count; i++) 7327185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 7328219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 7329185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 7330185029Spjd sav->sav_count) == 0); 7331185029Spjd for (i = 0; i < sav->sav_count; i++) 7332185029Spjd nvlist_free(list[i]); 7333185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 7334168404Spjd } 7335168404Spjd 7336185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 7337168404Spjd nvlist_free(nvroot); 7338168404Spjd 7339185029Spjd sav->sav_sync = B_FALSE; 7340168404Spjd} 7341168404Spjd 7342299441Smav/* 7343299441Smav * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 7344299441Smav * The all-vdev ZAP must be empty. 
7345299441Smav */ 7346168404Spjdstatic void 7347299441Smavspa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 7348299441Smav{ 7349299441Smav spa_t *spa = vd->vdev_spa; 7350299441Smav if (vd->vdev_top_zap != 0) { 7351299441Smav VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 7352299441Smav vd->vdev_top_zap, tx)); 7353299441Smav } 7354299441Smav if (vd->vdev_leaf_zap != 0) { 7355299441Smav VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 7356299441Smav vd->vdev_leaf_zap, tx)); 7357299441Smav } 7358299441Smav for (uint64_t i = 0; i < vd->vdev_children; i++) { 7359299441Smav spa_avz_build(vd->vdev_child[i], avz, tx); 7360299441Smav } 7361299441Smav} 7362299441Smav 7363299441Smavstatic void 7364168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 7365168404Spjd{ 7366168404Spjd nvlist_t *config; 7367168404Spjd 7368299441Smav /* 7369299441Smav * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 7370299441Smav * its config may not be dirty but we still need to build per-vdev ZAPs. 7371299441Smav * Similarly, if the pool is being assembled (e.g. after a split), we 7372299441Smav * need to rebuild the AVZ although the config may not be dirty. 
7373299441Smav */ 7374299441Smav if (list_is_empty(&spa->spa_config_dirty_list) && 7375299441Smav spa->spa_avz_action == AVZ_ACTION_NONE) 7376168404Spjd return; 7377168404Spjd 7378185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 7379168404Spjd 7380299441Smav ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 7381321540Smav spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 7382299441Smav spa->spa_all_vdev_zaps != 0); 7383299441Smav 7384299441Smav if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 7385299441Smav /* Make and build the new AVZ */ 7386299441Smav uint64_t new_avz = zap_create(spa->spa_meta_objset, 7387299441Smav DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 7388299441Smav spa_avz_build(spa->spa_root_vdev, new_avz, tx); 7389299441Smav 7390299441Smav /* Diff old AVZ with new one */ 7391299441Smav zap_cursor_t zc; 7392299441Smav zap_attribute_t za; 7393299441Smav 7394299441Smav for (zap_cursor_init(&zc, spa->spa_meta_objset, 7395299441Smav spa->spa_all_vdev_zaps); 7396299441Smav zap_cursor_retrieve(&zc, &za) == 0; 7397299441Smav zap_cursor_advance(&zc)) { 7398299441Smav uint64_t vdzap = za.za_first_integer; 7399299441Smav if (zap_lookup_int(spa->spa_meta_objset, new_avz, 7400299441Smav vdzap) == ENOENT) { 7401299441Smav /* 7402299441Smav * ZAP is listed in old AVZ but not in new one; 7403299441Smav * destroy it 7404299441Smav */ 7405299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 7406299441Smav tx)); 7407299441Smav } 7408299441Smav } 7409299441Smav 7410299441Smav zap_cursor_fini(&zc); 7411299441Smav 7412299441Smav /* Destroy the old AVZ */ 7413299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, 7414299441Smav spa->spa_all_vdev_zaps, tx)); 7415299441Smav 7416299441Smav /* Replace the old AVZ in the dir obj with the new one */ 7417299441Smav VERIFY0(zap_update(spa->spa_meta_objset, 7418299441Smav DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 7419299441Smav sizeof (new_avz), 1, &new_avz, tx)); 7420299441Smav 7421299441Smav 
spa->spa_all_vdev_zaps = new_avz; 7422299441Smav } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 7423299441Smav zap_cursor_t zc; 7424299441Smav zap_attribute_t za; 7425299441Smav 7426299441Smav /* Walk through the AVZ and destroy all listed ZAPs */ 7427299441Smav for (zap_cursor_init(&zc, spa->spa_meta_objset, 7428299441Smav spa->spa_all_vdev_zaps); 7429299441Smav zap_cursor_retrieve(&zc, &za) == 0; 7430299441Smav zap_cursor_advance(&zc)) { 7431299441Smav uint64_t zap = za.za_first_integer; 7432299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 7433299441Smav } 7434299441Smav 7435299441Smav zap_cursor_fini(&zc); 7436299441Smav 7437299441Smav /* Destroy and unlink the AVZ itself */ 7438299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, 7439299441Smav spa->spa_all_vdev_zaps, tx)); 7440299441Smav VERIFY0(zap_remove(spa->spa_meta_objset, 7441299441Smav DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 7442299441Smav spa->spa_all_vdev_zaps = 0; 7443299441Smav } 7444299441Smav 7445299441Smav if (spa->spa_all_vdev_zaps == 0) { 7446299441Smav spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 7447299441Smav DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 7448299441Smav DMU_POOL_VDEV_ZAP_MAP, tx); 7449299441Smav } 7450299441Smav spa->spa_avz_action = AVZ_ACTION_NONE; 7451299441Smav 7452299441Smav /* Create ZAPs for vdevs that don't have them. */ 7453299441Smav vdev_construct_zaps(spa->spa_root_vdev, tx); 7454299441Smav 7455185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 7456185029Spjd dmu_tx_get_txg(tx), B_FALSE); 7457185029Spjd 7458243505Smm /* 7459243505Smm * If we're upgrading the spa version then make sure that 7460243505Smm * the config object gets updated with the correct version. 
7461243505Smm */ 7462243505Smm if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 7463243505Smm fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 7464243505Smm spa->spa_uberblock.ub_version); 7465243505Smm 7466185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 7467185029Spjd 7468296528Smav nvlist_free(spa->spa_config_syncing); 7469168404Spjd spa->spa_config_syncing = config; 7470168404Spjd 7471168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 7472168404Spjd} 7473168404Spjd 7474236884Smmstatic void 7475248571Smmspa_sync_version(void *arg, dmu_tx_t *tx) 7476236884Smm{ 7477248571Smm uint64_t *versionp = arg; 7478248571Smm uint64_t version = *versionp; 7479248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 7480236884Smm 7481236884Smm /* 7482236884Smm * Setting the version is special cased when first creating the pool. 7483236884Smm */ 7484236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 7485236884Smm 7486247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 7487236884Smm ASSERT(version >= spa_version(spa)); 7488236884Smm 7489236884Smm spa->spa_uberblock.ub_version = version; 7490236884Smm vdev_config_dirty(spa->spa_root_vdev); 7491248571Smm spa_history_log_internal(spa, "set", tx, "version=%lld", version); 7492236884Smm} 7493236884Smm 7494185029Spjd/* 7495185029Spjd * Set zpool properties. 
7496185029Spjd */ 7497168404Spjdstatic void 7498248571Smmspa_sync_props(void *arg, dmu_tx_t *tx) 7499168404Spjd{ 7500248571Smm nvlist_t *nvp = arg; 7501248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 7502185029Spjd objset_t *mos = spa->spa_meta_objset; 7503236884Smm nvpair_t *elem = NULL; 7504168404Spjd 7505168404Spjd mutex_enter(&spa->spa_props_lock); 7506168404Spjd 7507185029Spjd while ((elem = nvlist_next_nvpair(nvp, elem))) { 7508236884Smm uint64_t intval; 7509236884Smm char *strval, *fname; 7510236884Smm zpool_prop_t prop; 7511236884Smm const char *propname; 7512236884Smm zprop_type_t proptype; 7513259813Sdelphij spa_feature_t fid; 7514236884Smm 7515185029Spjd switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 7516329493Smav case ZPOOL_PROP_INVAL: 7517236884Smm /* 7518236884Smm * We checked this earlier in spa_prop_validate(). 7519236884Smm */ 7520236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 7521236884Smm 7522236884Smm fname = strchr(nvpair_name(elem), '@') + 1; 7523259813Sdelphij VERIFY0(zfeature_lookup_name(fname, &fid)); 7524236884Smm 7525259813Sdelphij spa_feature_enable(spa, fid, tx); 7526248571Smm spa_history_log_internal(spa, "set", tx, 7527248571Smm "%s=enabled", nvpair_name(elem)); 7528236884Smm break; 7529236884Smm 7530185029Spjd case ZPOOL_PROP_VERSION: 7531258717Savg intval = fnvpair_value_uint64(elem); 7532185029Spjd /* 7533236884Smm * The version is synced seperatly before other 7534236884Smm * properties and should be correct by now. 7535185029Spjd */ 7536236884Smm ASSERT3U(spa_version(spa), >=, intval); 7537185029Spjd break; 7538168404Spjd 7539185029Spjd case ZPOOL_PROP_ALTROOT: 7540185029Spjd /* 7541185029Spjd * 'altroot' is a non-persistent property. It should 7542185029Spjd * have been set temporarily at creation or import time. 
7543185029Spjd */ 7544185029Spjd ASSERT(spa->spa_root != NULL); 7545185029Spjd break; 7546168404Spjd 7547219089Spjd case ZPOOL_PROP_READONLY: 7548185029Spjd case ZPOOL_PROP_CACHEFILE: 7549185029Spjd /* 7550219089Spjd * 'readonly' and 'cachefile' are also non-persisitent 7551219089Spjd * properties. 7552185029Spjd */ 7553168404Spjd break; 7554228103Smm case ZPOOL_PROP_COMMENT: 7555258717Savg strval = fnvpair_value_string(elem); 7556228103Smm if (spa->spa_comment != NULL) 7557228103Smm spa_strfree(spa->spa_comment); 7558228103Smm spa->spa_comment = spa_strdup(strval); 7559228103Smm /* 7560228103Smm * We need to dirty the configuration on all the vdevs 7561228103Smm * so that their labels get updated. It's unnecessary 7562228103Smm * to do this for pool creation since the vdev's 7563228103Smm * configuratoin has already been dirtied. 7564228103Smm */ 7565228103Smm if (tx->tx_txg != TXG_INITIAL) 7566228103Smm vdev_config_dirty(spa->spa_root_vdev); 7567248571Smm spa_history_log_internal(spa, "set", tx, 7568248571Smm "%s=%s", nvpair_name(elem), strval); 7569228103Smm break; 7570185029Spjd default: 7571185029Spjd /* 7572185029Spjd * Set pool property values in the poolprops mos object. 
7573185029Spjd */ 7574185029Spjd if (spa->spa_pool_props_object == 0) { 7575236884Smm spa->spa_pool_props_object = 7576236884Smm zap_create_link(mos, DMU_OT_POOL_PROPS, 7577185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 7578236884Smm tx); 7579185029Spjd } 7580185029Spjd 7581185029Spjd /* normalize the property name */ 7582185029Spjd propname = zpool_prop_to_name(prop); 7583185029Spjd proptype = zpool_prop_get_type(prop); 7584185029Spjd 7585185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 7586185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 7587258717Savg strval = fnvpair_value_string(elem); 7588258717Savg VERIFY0(zap_update(mos, 7589185029Spjd spa->spa_pool_props_object, propname, 7590258717Savg 1, strlen(strval) + 1, strval, tx)); 7591248571Smm spa_history_log_internal(spa, "set", tx, 7592248571Smm "%s=%s", nvpair_name(elem), strval); 7593185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 7594258717Savg intval = fnvpair_value_uint64(elem); 7595185029Spjd 7596185029Spjd if (proptype == PROP_TYPE_INDEX) { 7597185029Spjd const char *unused; 7598258717Savg VERIFY0(zpool_prop_index_to_string( 7599258717Savg prop, intval, &unused)); 7600185029Spjd } 7601258717Savg VERIFY0(zap_update(mos, 7602185029Spjd spa->spa_pool_props_object, propname, 7603258717Savg 8, 1, &intval, tx)); 7604248571Smm spa_history_log_internal(spa, "set", tx, 7605248571Smm "%s=%lld", nvpair_name(elem), intval); 7606185029Spjd } else { 7607185029Spjd ASSERT(0); /* not allowed */ 7608185029Spjd } 7609185029Spjd 7610185029Spjd switch (prop) { 7611185029Spjd case ZPOOL_PROP_DELEGATION: 7612185029Spjd spa->spa_delegation = intval; 7613185029Spjd break; 7614185029Spjd case ZPOOL_PROP_BOOTFS: 7615185029Spjd spa->spa_bootfs = intval; 7616185029Spjd break; 7617185029Spjd case ZPOOL_PROP_FAILUREMODE: 7618185029Spjd spa->spa_failmode = intval; 7619185029Spjd break; 7620219089Spjd case ZPOOL_PROP_AUTOEXPAND: 7621219089Spjd spa->spa_autoexpand = intval; 7622219089Spjd if (tx->tx_txg != 
TXG_INITIAL) 7623219089Spjd spa_async_request(spa, 7624219089Spjd SPA_ASYNC_AUTOEXPAND); 7625219089Spjd break; 7626219089Spjd case ZPOOL_PROP_DEDUPDITTO: 7627219089Spjd spa->spa_dedup_ditto = intval; 7628219089Spjd break; 7629185029Spjd default: 7630185029Spjd break; 7631185029Spjd } 7632168404Spjd } 7633185029Spjd 7634168404Spjd } 7635185029Spjd 7636185029Spjd mutex_exit(&spa->spa_props_lock); 7637168404Spjd} 7638168404Spjd 7639168404Spjd/* 7640219089Spjd * Perform one-time upgrade on-disk changes. spa_version() does not 7641219089Spjd * reflect the new version this txg, so there must be no changes this 7642219089Spjd * txg to anything that the upgrade code depends on after it executes. 7643219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync 7644219089Spjd * tasks. 7645219089Spjd */ 7646219089Spjdstatic void 7647219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 7648219089Spjd{ 7649219089Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 7650219089Spjd 7651219089Spjd ASSERT(spa->spa_sync_pass == 1); 7652219089Spjd 7653248571Smm rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 7654248571Smm 7655219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 7656219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 7657219089Spjd dsl_pool_create_origin(dp, tx); 7658219089Spjd 7659219089Spjd /* Keeping the origin open increases spa_minref */ 7660219089Spjd spa->spa_minref += 3; 7661219089Spjd } 7662219089Spjd 7663219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 7664219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 7665219089Spjd dsl_pool_upgrade_clones(dp, tx); 7666219089Spjd } 7667219089Spjd 7668219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 7669219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 7670219089Spjd dsl_pool_upgrade_dir_clones(dp, tx); 7671219089Spjd 7672219089Spjd /* Keeping the freedir open increases spa_minref */ 7673219089Spjd 
spa->spa_minref += 3; 7674219089Spjd } 7675236884Smm 7676236884Smm if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 7677236884Smm spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 7678236884Smm spa_feature_create_zap_objects(spa, tx); 7679236884Smm } 7680268126Sdelphij 7681268126Sdelphij /* 7682268126Sdelphij * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 7683268126Sdelphij * when possibility to use lz4 compression for metadata was added 7684268126Sdelphij * Old pools that have this feature enabled must be upgraded to have 7685268126Sdelphij * this feature active 7686268126Sdelphij */ 7687268126Sdelphij if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 7688268126Sdelphij boolean_t lz4_en = spa_feature_is_enabled(spa, 7689268126Sdelphij SPA_FEATURE_LZ4_COMPRESS); 7690268126Sdelphij boolean_t lz4_ac = spa_feature_is_active(spa, 7691268126Sdelphij SPA_FEATURE_LZ4_COMPRESS); 7692268126Sdelphij 7693268126Sdelphij if (lz4_en && !lz4_ac) 7694268126Sdelphij spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 7695268126Sdelphij } 7696289422Smav 7697289422Smav /* 7698289422Smav * If we haven't written the salt, do so now. Note that the 7699289422Smav * feature may not be activated yet, but that's fine since 7700289422Smav * the presence of this ZAP entry is backwards compatible. 
7701289422Smav */ 7702289422Smav if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 7703289422Smav DMU_POOL_CHECKSUM_SALT) == ENOENT) { 7704289422Smav VERIFY0(zap_add(spa->spa_meta_objset, 7705289422Smav DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 7706289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes), 7707289422Smav spa->spa_cksum_salt.zcs_bytes, tx)); 7708289422Smav } 7709289422Smav 7710248571Smm rrw_exit(&dp->dp_config_rwlock, FTAG); 7711219089Spjd} 7712219089Spjd 7713332525Smavstatic void 7714332525Smavvdev_indirect_state_sync_verify(vdev_t *vd) 7715332525Smav{ 7716332525Smav vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 7717332525Smav vdev_indirect_births_t *vib = vd->vdev_indirect_births; 7718332525Smav 7719332525Smav if (vd->vdev_ops == &vdev_indirect_ops) { 7720332525Smav ASSERT(vim != NULL); 7721332525Smav ASSERT(vib != NULL); 7722332525Smav } 7723332525Smav 7724332525Smav if (vdev_obsolete_sm_object(vd) != 0) { 7725332525Smav ASSERT(vd->vdev_obsolete_sm != NULL); 7726332525Smav ASSERT(vd->vdev_removing || 7727332525Smav vd->vdev_ops == &vdev_indirect_ops); 7728332525Smav ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 7729332525Smav ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 7730332525Smav 7731332525Smav ASSERT3U(vdev_obsolete_sm_object(vd), ==, 7732332525Smav space_map_object(vd->vdev_obsolete_sm)); 7733332525Smav ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 7734332525Smav space_map_allocated(vd->vdev_obsolete_sm)); 7735332525Smav } 7736332525Smav ASSERT(vd->vdev_obsolete_segments != NULL); 7737332525Smav 7738332525Smav /* 7739332525Smav * Since frees / remaps to an indirect vdev can only 7740332525Smav * happen in syncing context, the obsolete segments 7741332525Smav * tree must be empty when we start syncing. 
7742332525Smav */ 7743332525Smav ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 7744332525Smav} 7745332525Smav 7746219089Spjd/* 7747168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 7748168404Spjd * part of the process, so we iterate until it converges. 7749168404Spjd */ 7750168404Spjdvoid 7751168404Spjdspa_sync(spa_t *spa, uint64_t txg) 7752168404Spjd{ 7753168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 7754168404Spjd objset_t *mos = spa->spa_meta_objset; 7755219089Spjd bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 7756168404Spjd vdev_t *rvd = spa->spa_root_vdev; 7757168404Spjd vdev_t *vd; 7758168404Spjd dmu_tx_t *tx; 7759185029Spjd int error; 7760307277Smav uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 7761307277Smav zfs_vdev_queue_depth_pct / 100; 7762168404Spjd 7763219089Spjd VERIFY(spa_writeable(spa)); 7764219089Spjd 7765168404Spjd /* 7766332525Smav * Wait for i/os issued in open context that need to complete 7767332525Smav * before this txg syncs. 7768332525Smav */ 7769332525Smav VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK])); 7770332525Smav spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0); 7771332525Smav 7772332525Smav /* 7773168404Spjd * Lock out configuration changes. 7774168404Spjd */ 7775185029Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 7776168404Spjd 7777168404Spjd spa->spa_syncing_txg = txg; 7778168404Spjd spa->spa_sync_pass = 0; 7779168404Spjd 7780307277Smav mutex_enter(&spa->spa_alloc_lock); 7781307277Smav VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 7782307277Smav mutex_exit(&spa->spa_alloc_lock); 7783307277Smav 7784185029Spjd /* 7785185029Spjd * If there are any pending vdev state changes, convert them 7786185029Spjd * into config changes that go out with this transaction group. 
7787185029Spjd */ 7788185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 7789209962Smm while (list_head(&spa->spa_state_dirty_list) != NULL) { 7790209962Smm /* 7791209962Smm * We need the write lock here because, for aux vdevs, 7792209962Smm * calling vdev_config_dirty() modifies sav_config. 7793209962Smm * This is ugly and will become unnecessary when we 7794209962Smm * eliminate the aux vdev wart by integrating all vdevs 7795209962Smm * into the root vdev tree. 7796209962Smm */ 7797209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7798209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 7799209962Smm while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 7800209962Smm vdev_state_clean(vd); 7801209962Smm vdev_config_dirty(vd); 7802209962Smm } 7803209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7804209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7805185029Spjd } 7806185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 7807185029Spjd 7808168404Spjd tx = dmu_tx_create_assigned(dp, txg); 7809168404Spjd 7810247265Smm spa->spa_sync_starttime = gethrtime(); 7811247265Smm#ifdef illumos 7812247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 7813247265Smm spa->spa_sync_starttime + spa->spa_deadman_synctime)); 7814277300Ssmh#else /* !illumos */ 7815247265Smm#ifdef _KERNEL 7816314665Savg callout_schedule(&spa->spa_deadman_cycid, 7817314665Savg hz * spa->spa_deadman_synctime / NANOSEC); 7818247265Smm#endif 7819277300Ssmh#endif /* illumos */ 7820247265Smm 7821168404Spjd /* 7822185029Spjd * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 7823168404Spjd * set spa_deflate if we have no raid-z vdevs. 
7824168404Spjd */ 7825185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 7826185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 7827168404Spjd int i; 7828168404Spjd 7829168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 7830168404Spjd vd = rvd->vdev_child[i]; 7831168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 7832168404Spjd break; 7833168404Spjd } 7834168404Spjd if (i == rvd->vdev_children) { 7835168404Spjd spa->spa_deflate = TRUE; 7836168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 7837168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 7838168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 7839168404Spjd } 7840168404Spjd } 7841168404Spjd 7842168404Spjd /* 7843307277Smav * Set the top-level vdev's max queue depth. Evaluate each 7844307277Smav * top-level's async write queue depth in case it changed. 7845307277Smav * The max queue depth will not change in the middle of syncing 7846307277Smav * out this txg. 7847307277Smav */ 7848307277Smav uint64_t queue_depth_total = 0; 7849307277Smav for (int c = 0; c < rvd->vdev_children; c++) { 7850307277Smav vdev_t *tvd = rvd->vdev_child[c]; 7851307277Smav metaslab_group_t *mg = tvd->vdev_mg; 7852307277Smav 7853307277Smav if (mg == NULL || mg->mg_class != spa_normal_class(spa) || 7854307277Smav !metaslab_group_initialized(mg)) 7855307277Smav continue; 7856307277Smav 7857307277Smav /* 7858307277Smav * It is safe to do a lock-free check here because only async 7859307277Smav * allocations look at mg_max_alloc_queue_depth, and async 7860307277Smav * allocations all happen from spa_sync(). 
7861307277Smav */ 7862307277Smav ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); 7863307277Smav mg->mg_max_alloc_queue_depth = max_queue_depth; 7864307277Smav queue_depth_total += mg->mg_max_alloc_queue_depth; 7865307277Smav } 7866307277Smav metaslab_class_t *mc = spa_normal_class(spa); 7867307277Smav ASSERT0(refcount_count(&mc->mc_alloc_slots)); 7868307277Smav mc->mc_alloc_max_slots = queue_depth_total; 7869307277Smav mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 7870307277Smav 7871307277Smav ASSERT3U(mc->mc_alloc_max_slots, <=, 7872307277Smav max_queue_depth * rvd->vdev_children); 7873307277Smav 7874332525Smav for (int c = 0; c < rvd->vdev_children; c++) { 7875332525Smav vdev_t *vd = rvd->vdev_child[c]; 7876332525Smav vdev_indirect_state_sync_verify(vd); 7877332525Smav 7878332525Smav if (vdev_indirect_should_condense(vd)) { 7879332525Smav spa_condense_indirect_start_sync(vd, tx); 7880332525Smav break; 7881332525Smav } 7882332525Smav } 7883332525Smav 7884307277Smav /* 7885168404Spjd * Iterate to convergence. 7886168404Spjd */ 7887168404Spjd do { 7888219089Spjd int pass = ++spa->spa_sync_pass; 7889168404Spjd 7890168404Spjd spa_sync_config_object(spa, tx); 7891185029Spjd spa_sync_aux_dev(spa, &spa->spa_spares, tx, 7892185029Spjd ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 7893185029Spjd spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 7894185029Spjd ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 7895168404Spjd spa_errlog_sync(spa, txg); 7896168404Spjd dsl_pool_sync(dp, txg); 7897168404Spjd 7898243503Smm if (pass < zfs_sync_pass_deferred_free) { 7899258632Savg spa_sync_frees(spa, free_bpl, tx); 7900219089Spjd } else { 7901275781Sdelphij /* 7902275781Sdelphij * We can not defer frees in pass 1, because 7903275781Sdelphij * we sync the deferred frees later in pass 1. 
7904275781Sdelphij */ 7905275781Sdelphij ASSERT3U(pass, >, 1); 7906219089Spjd bplist_iterate(free_bpl, bpobj_enqueue_cb, 7907258632Savg &spa->spa_deferred_bpobj, tx); 7908168404Spjd } 7909168404Spjd 7910219089Spjd ddt_sync(spa, txg); 7911219089Spjd dsl_scan_sync(dp, tx); 7912168404Spjd 7913332525Smav if (spa->spa_vdev_removal != NULL) 7914332525Smav svr_sync(spa, tx); 7915332525Smav 7916332525Smav while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 7917332525Smav != NULL) 7918219089Spjd vdev_sync(vd, txg); 7919168404Spjd 7920275781Sdelphij if (pass == 1) { 7921219089Spjd spa_sync_upgrades(spa, tx); 7922275781Sdelphij ASSERT3U(txg, >=, 7923275781Sdelphij spa->spa_uberblock.ub_rootbp.blk_birth); 7924275781Sdelphij /* 7925275781Sdelphij * Note: We need to check if the MOS is dirty 7926275781Sdelphij * because we could have marked the MOS dirty 7927275781Sdelphij * without updating the uberblock (e.g. if we 7928275781Sdelphij * have sync tasks but no dirty user data). We 7929275781Sdelphij * need to check the uberblock's rootbp because 7930275781Sdelphij * it is updated if we have synced out dirty 7931275781Sdelphij * data (though in this case the MOS will most 7932275781Sdelphij * likely also be dirty due to second order 7933275781Sdelphij * effects, we don't want to rely on that here). 7934275781Sdelphij */ 7935275781Sdelphij if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 7936275781Sdelphij !dmu_objset_is_dirty(mos, txg)) { 7937275781Sdelphij /* 7938275781Sdelphij * Nothing changed on the first pass, 7939275781Sdelphij * therefore this TXG is a no-op. Avoid 7940275781Sdelphij * syncing deferred frees, so that we 7941275781Sdelphij * can keep this TXG as a no-op. 
7942275781Sdelphij */ 7943275781Sdelphij ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 7944275781Sdelphij txg)); 7945275781Sdelphij ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 7946275781Sdelphij ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 7947332547Smav ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, 7948332547Smav txg)); 7949275781Sdelphij break; 7950275781Sdelphij } 7951275781Sdelphij spa_sync_deferred_frees(spa, tx); 7952275781Sdelphij } 7953168404Spjd 7954219089Spjd } while (dmu_objset_is_dirty(mos, txg)); 7955219089Spjd 7956299441Smav if (!list_is_empty(&spa->spa_config_dirty_list)) { 7957299441Smav /* 7958299441Smav * Make sure that the number of ZAPs for all the vdevs matches 7959299441Smav * the number of ZAPs in the per-vdev ZAP list. This only gets 7960299441Smav * called if the config is dirty; otherwise there may be 7961299441Smav * outstanding AVZ operations that weren't completed in 7962299441Smav * spa_sync_config_object. 7963299441Smav */ 7964299441Smav uint64_t all_vdev_zap_entry_count; 7965299441Smav ASSERT0(zap_count(spa->spa_meta_objset, 7966299441Smav spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 7967299441Smav ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 7968299441Smav all_vdev_zap_entry_count); 7969299441Smav } 7970299441Smav 7971332525Smav if (spa->spa_vdev_removal != NULL) { 7972332525Smav ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 7973332525Smav } 7974332525Smav 7975168404Spjd /* 7976168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 7977168404Spjd * to commit the transaction group. 7978168404Spjd * 7979185029Spjd * If there are no dirty vdevs, we sync the uberblock to a few 7980185029Spjd * random top-level vdevs that are known to be visible in the 7981185029Spjd * config cache (see spa_vdev_add() for a complete description). 7982185029Spjd * If there *are* dirty vdevs, sync the uberblock to all vdevs. 
7983168404Spjd */ 7984185029Spjd for (;;) { 7985185029Spjd /* 7986185029Spjd * We hold SCL_STATE to prevent vdev open/close/etc. 7987185029Spjd * while we're attempting to write the vdev labels. 7988185029Spjd */ 7989185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 7990168404Spjd 7991185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) { 7992332547Smav vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 7993185029Spjd int svdcount = 0; 7994185029Spjd int children = rvd->vdev_children; 7995185029Spjd int c0 = spa_get_random(children); 7996185029Spjd 7997219089Spjd for (int c = 0; c < children; c++) { 7998185029Spjd vd = rvd->vdev_child[(c0 + c) % children]; 7999332547Smav 8000332547Smav /* Stop when revisiting the first vdev */ 8001332547Smav if (c > 0 && svd[0] == vd) 8002332547Smav break; 8003332547Smav 8004332525Smav if (vd->vdev_ms_array == 0 || vd->vdev_islog || 8005332525Smav !vdev_is_concrete(vd)) 8006185029Spjd continue; 8007332547Smav 8008185029Spjd svd[svdcount++] = vd; 8009332536Smav if (svdcount == SPA_SYNC_MIN_VDEVS) 8010185029Spjd break; 8011185029Spjd } 8012294811Smav error = vdev_config_sync(svd, svdcount, txg); 8013185029Spjd } else { 8014185029Spjd error = vdev_config_sync(rvd->vdev_child, 8015294811Smav rvd->vdev_children, txg); 8016168404Spjd } 8017185029Spjd 8018239620Smm if (error == 0) 8019239620Smm spa->spa_last_synced_guid = rvd->vdev_guid; 8020239620Smm 8021185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 8022185029Spjd 8023185029Spjd if (error == 0) 8024185029Spjd break; 8025185029Spjd zio_suspend(spa, NULL); 8026185029Spjd zio_resume_wait(spa); 8027168404Spjd } 8028168404Spjd dmu_tx_commit(tx); 8029168404Spjd 8030247265Smm#ifdef illumos 8031247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 8032277300Ssmh#else /* !illumos */ 8033247265Smm#ifdef _KERNEL 8034247265Smm callout_drain(&spa->spa_deadman_cycid); 8035247265Smm#endif 8036277300Ssmh#endif /* illumos */ 8037247265Smm 8038168404Spjd /* 8039168404Spjd 
* Clear the dirty config list. 8040168404Spjd */ 8041185029Spjd while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 8042168404Spjd vdev_config_clean(vd); 8043168404Spjd 8044168404Spjd /* 8045168404Spjd * Now that the new config has synced transactionally, 8046168404Spjd * let it become visible to the config cache. 8047168404Spjd */ 8048168404Spjd if (spa->spa_config_syncing != NULL) { 8049168404Spjd spa_config_set(spa, spa->spa_config_syncing); 8050168404Spjd spa->spa_config_txg = txg; 8051168404Spjd spa->spa_config_syncing = NULL; 8052168404Spjd } 8053168404Spjd 8054219089Spjd dsl_pool_sync_done(dp, txg); 8055168404Spjd 8056307277Smav mutex_enter(&spa->spa_alloc_lock); 8057307277Smav VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 8058307277Smav mutex_exit(&spa->spa_alloc_lock); 8059307277Smav 8060168404Spjd /* 8061168404Spjd * Update usable space statistics. 8062168404Spjd */ 8063168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 8064168404Spjd vdev_sync_done(vd, txg); 8065168404Spjd 8066219089Spjd spa_update_dspace(spa); 8067219089Spjd 8068168404Spjd /* 8069168404Spjd * It had better be the case that we didn't dirty anything 8070168404Spjd * since vdev_config_sync(). 8071168404Spjd */ 8072168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 8073168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 8074168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 8075168404Spjd 8076332547Smav while (zfs_pause_spa_sync) 8077332547Smav delay(1); 8078332547Smav 8079219089Spjd spa->spa_sync_pass = 0; 8080219089Spjd 8081310515Savg /* 8082310515Savg * Update the last synced uberblock here. We want to do this at 8083310515Savg * the end of spa_sync() so that consumers of spa_last_synced_txg() 8084310515Savg * will be guaranteed that all the processing associated with 8085310515Savg * that txg has been completed. 
8086310515Savg */ 8087310515Savg spa->spa_ubsync = spa->spa_uberblock; 8088185029Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 8089168404Spjd 8090219089Spjd spa_handle_ignored_writes(spa); 8091219089Spjd 8092168404Spjd /* 8093168404Spjd * If any async tasks have been requested, kick them off. 8094168404Spjd */ 8095168404Spjd spa_async_dispatch(spa); 8096253990Smav spa_async_dispatch_vd(spa); 8097168404Spjd} 8098168404Spjd 8099168404Spjd/* 8100168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 8101168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 8102168404Spjd * sync. 8103168404Spjd */ 8104168404Spjdvoid 8105168404Spjdspa_sync_allpools(void) 8106168404Spjd{ 8107168404Spjd spa_t *spa = NULL; 8108168404Spjd mutex_enter(&spa_namespace_lock); 8109168404Spjd while ((spa = spa_next(spa)) != NULL) { 8110219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 8111219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 8112168404Spjd continue; 8113168404Spjd spa_open_ref(spa, FTAG); 8114168404Spjd mutex_exit(&spa_namespace_lock); 8115168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 8116168404Spjd mutex_enter(&spa_namespace_lock); 8117168404Spjd spa_close(spa, FTAG); 8118168404Spjd } 8119168404Spjd mutex_exit(&spa_namespace_lock); 8120168404Spjd} 8121168404Spjd 8122168404Spjd/* 8123168404Spjd * ========================================================================== 8124168404Spjd * Miscellaneous routines 8125168404Spjd * ========================================================================== 8126168404Spjd */ 8127168404Spjd 8128168404Spjd/* 8129168404Spjd * Remove all pools in the system. 8130168404Spjd */ 8131168404Spjdvoid 8132168404Spjdspa_evict_all(void) 8133168404Spjd{ 8134168404Spjd spa_t *spa; 8135168404Spjd 8136168404Spjd /* 8137168404Spjd * Remove all cached state. All pools should be closed now, 8138168404Spjd * so every spa in the AVL tree should be unreferenced. 
8139168404Spjd */ 8140168404Spjd mutex_enter(&spa_namespace_lock); 8141168404Spjd while ((spa = spa_next(NULL)) != NULL) { 8142168404Spjd /* 8143168404Spjd * Stop async tasks. The async thread may need to detach 8144168404Spjd * a device that's been replaced, which requires grabbing 8145168404Spjd * spa_namespace_lock, so we must drop it here. 8146168404Spjd */ 8147168404Spjd spa_open_ref(spa, FTAG); 8148168404Spjd mutex_exit(&spa_namespace_lock); 8149168404Spjd spa_async_suspend(spa); 8150168404Spjd mutex_enter(&spa_namespace_lock); 8151168404Spjd spa_close(spa, FTAG); 8152168404Spjd 8153168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 8154168404Spjd spa_unload(spa); 8155168404Spjd spa_deactivate(spa); 8156168404Spjd } 8157168404Spjd spa_remove(spa); 8158168404Spjd } 8159168404Spjd mutex_exit(&spa_namespace_lock); 8160168404Spjd} 8161168404Spjd 8162168404Spjdvdev_t * 8163209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 8164168404Spjd{ 8165185029Spjd vdev_t *vd; 8166185029Spjd int i; 8167185029Spjd 8168185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 8169185029Spjd return (vd); 8170185029Spjd 8171209962Smm if (aux) { 8172185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 8173185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 8174185029Spjd if (vd->vdev_guid == guid) 8175185029Spjd return (vd); 8176185029Spjd } 8177209962Smm 8178209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 8179209962Smm vd = spa->spa_spares.sav_vdevs[i]; 8180209962Smm if (vd->vdev_guid == guid) 8181209962Smm return (vd); 8182209962Smm } 8183185029Spjd } 8184185029Spjd 8185185029Spjd return (NULL); 8186168404Spjd} 8187168404Spjd 8188168404Spjdvoid 8189185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 8190168404Spjd{ 8191219089Spjd ASSERT(spa_writeable(spa)); 8192219089Spjd 8193185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8194168404Spjd 8195168404Spjd /* 8196168404Spjd * This should only be called for a 
non-faulted pool, and since a 8197168404Spjd * future version would result in an unopenable pool, this shouldn't be 8198168404Spjd * possible. 8199168404Spjd */ 8200247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 8201268075Sdelphij ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 8202168404Spjd 8203185029Spjd spa->spa_uberblock.ub_version = version; 8204168404Spjd vdev_config_dirty(spa->spa_root_vdev); 8205168404Spjd 8206185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 8207168404Spjd 8208168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 8209168404Spjd} 8210168404Spjd 8211168404Spjdboolean_t 8212168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 8213168404Spjd{ 8214168404Spjd int i; 8215168404Spjd uint64_t spareguid; 8216185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 8217168404Spjd 8218185029Spjd for (i = 0; i < sav->sav_count; i++) 8219185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 8220168404Spjd return (B_TRUE); 8221168404Spjd 8222185029Spjd for (i = 0; i < sav->sav_npending; i++) { 8223185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 8224185029Spjd &spareguid) == 0 && spareguid == guid) 8225168404Spjd return (B_TRUE); 8226168404Spjd } 8227168404Spjd 8228168404Spjd return (B_FALSE); 8229168404Spjd} 8230168404Spjd 8231185029Spjd/* 8232185029Spjd * Check if a pool has an active shared spare device. 
8233185029Spjd * Note: reference count of an active spare is 2, as a spare and as a replace 8234185029Spjd */ 8235185029Spjdstatic boolean_t 8236185029Spjdspa_has_active_shared_spare(spa_t *spa) 8237168404Spjd{ 8238185029Spjd int i, refcnt; 8239185029Spjd uint64_t pool; 8240185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 8241185029Spjd 8242185029Spjd for (i = 0; i < sav->sav_count; i++) { 8243185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 8244185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 8245185029Spjd refcnt > 2) 8246185029Spjd return (B_TRUE); 8247185029Spjd } 8248185029Spjd 8249185029Spjd return (B_FALSE); 8250168404Spjd} 8251168404Spjd 8252332525Smavsysevent_t * 8253331397Smavspa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 8254168404Spjd{ 8255307113Smav sysevent_t *ev = NULL; 8256185029Spjd#ifdef _KERNEL 8257185029Spjd sysevent_attr_list_t *attr = NULL; 8258185029Spjd sysevent_value_t value; 8259168404Spjd 8260185029Spjd ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 8261185029Spjd SE_SLEEP); 8262307113Smav ASSERT(ev != NULL); 8263168404Spjd 8264185029Spjd value.value_type = SE_DATA_TYPE_STRING; 8265185029Spjd value.value.sv_string = spa_name(spa); 8266185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 8267185029Spjd goto done; 8268168404Spjd 8269185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 8270185029Spjd value.value.sv_uint64 = spa_guid(spa); 8271185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 8272185029Spjd goto done; 8273168404Spjd 8274185029Spjd if (vd) { 8275185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 8276185029Spjd value.value.sv_uint64 = vd->vdev_guid; 8277185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 8278185029Spjd SE_SLEEP) != 0) 8279185029Spjd goto done; 8280168404Spjd 8281185029Spjd if (vd->vdev_path) { 8282185029Spjd value.value_type = SE_DATA_TYPE_STRING; 
8283185029Spjd value.value.sv_string = vd->vdev_path; 8284185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 8285185029Spjd &value, SE_SLEEP) != 0) 8286185029Spjd goto done; 8287168404Spjd } 8288168404Spjd } 8289168404Spjd 8290331397Smav if (hist_nvl != NULL) { 8291331397Smav fnvlist_merge((nvlist_t *)attr, hist_nvl); 8292331397Smav } 8293331397Smav 8294185029Spjd if (sysevent_attach_attributes(ev, attr) != 0) 8295185029Spjd goto done; 8296185029Spjd attr = NULL; 8297168404Spjd 8298185029Spjddone: 8299185029Spjd if (attr) 8300185029Spjd sysevent_free_attr(attr); 8301307113Smav 8302307113Smav#endif 8303307113Smav return (ev); 8304307113Smav} 8305307113Smav 8306332525Smavvoid 8307307113Smavspa_event_post(sysevent_t *ev) 8308307113Smav{ 8309307113Smav#ifdef _KERNEL 8310307113Smav sysevent_id_t eid; 8311307113Smav 8312307113Smav (void) log_sysevent(ev, SE_SLEEP, &eid); 8313185029Spjd sysevent_free(ev); 8314185029Spjd#endif 8315168404Spjd} 8316307113Smav 8317332525Smavvoid 8318332525Smavspa_event_discard(sysevent_t *ev) 8319332525Smav{ 8320332525Smav#ifdef _KERNEL 8321332525Smav sysevent_free(ev); 8322332525Smav#endif 8323332525Smav} 8324332525Smav 8325307113Smav/* 8326307113Smav * Post a sysevent corresponding to the given event. The 'name' must be one of 8327307113Smav * the event definitions in sys/sysevent/eventdefs.h. The payload will be 8328331397Smav * filled in from the spa and (optionally) the vdev and history nvl. This 8329331397Smav * doesn't do anything in the userland libzpool, as we don't want consumers to 8330331397Smav * misinterpret ztest or zdb as real changes. 8331307113Smav */ 8332307113Smavvoid 8333331397Smavspa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 8334307113Smav{ 8335331397Smav spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 8336307113Smav} 8337