spa.c revision 332531
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017 Datto Inc.
 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/bpobj.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>
#include <sys/abd.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
int zfs_ccw_retry_interval = 300;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
    "Check hostid on import?");
TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
    &zfs_ccw_retry_interval, 0,
    "Configuration cache file write, retry after failure, interval (seconds)");

typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
#endif

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

/*
 * Report any spa_load_verify errors found, but do not fail spa_load.
 * This is used by zdb to analyze non-idle pools.
 */
boolean_t	spa_load_verify_dryrun = B_FALSE;

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
		 * when opening pools before this version freedir will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPOOL_PROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t propval;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/*
				 * Must be ZPL, and its property settings
				 * must be supported by GRUB (compression
				 * is not gzip, and large blocks are not used).
				 */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &propval)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(propval)) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check.  For this kernel check, we merely
				 * check ASCII apart from DEL.  Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		spa_write_cachefile(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

#ifdef SYSDC
		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
#endif
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly lower priority
			 * than the other taskqs.
			 * FreeBSD notes:
			 * - numerically higher priorities are lower priorities;
			 * - if priorities divided by four (RQ_PPQ) are equal
			 *   then a difference between them is insignificant.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
#ifdef illumos
				pri--;
#else
				pri += 4;
#endif

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
#ifdef SYSDC
		}
#endif

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT0(tqs->stqs_count);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
#ifdef _KERNEL
		tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
#else
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	for (size_t i = 0; i < TXG_SIZE; i++)
		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list, spa,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	spa_evicting_os_wait(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	for (size_t i = 0; i < TXG_SIZE; i++) {
		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
		spa->spa_txg_zio[i] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa_load_note(spa, "UNLOADING");

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Even though vdev_free() also calls vdev_metaslab_fini, we need
	 * to call it earlier, before we wait for async i/o to complete.
	 * This ensures that there is no async metaslab prefetching, by
	 * calling taskq_wait(mg_taskq).
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
			vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	if (spa->spa_vdev_removal != NULL) {
		spa_vdev_removal_destroy(spa->spa_vdev_removal);
		spa->spa_vdev_removal = NULL;
	}

	spa_condense_fini(spa);

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa->spa_indirect_vdevs_loaded = B_FALSE;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
1503168404Spjd */ 1504168404Spjd if (!vdev_is_dead(tvd)) 1505168404Spjd spa_spare_activate(tvd); 1506168404Spjd } 1507168404Spjd 1508185029Spjd vd->vdev_top = vd; 1509209962Smm vd->vdev_aux = &spa->spa_spares; 1510185029Spjd 1511168404Spjd if (vdev_open(vd) != 0) 1512168404Spjd continue; 1513168404Spjd 1514185029Spjd if (vdev_validate_aux(vd) == 0) 1515185029Spjd spa_spare_add(vd); 1516168404Spjd } 1517168404Spjd 1518168404Spjd /* 1519168404Spjd * Recompute the stashed list of spares, with status information 1520168404Spjd * this time. 1521168404Spjd */ 1522185029Spjd VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1523168404Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1524168404Spjd 1525185029Spjd spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1526185029Spjd KM_SLEEP); 1527185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1528185029Spjd spares[i] = vdev_config_generate(spa, 1529219089Spjd spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1530185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1531185029Spjd ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1532185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1533168404Spjd nvlist_free(spares[i]); 1534185029Spjd kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1535168404Spjd} 1536168404Spjd 1537185029Spjd/* 1538185029Spjd * Load (or re-load) the current list of vdevs describing the active l2cache for 1539185029Spjd * this pool. When this is called, we have some form of basic information in 1540185029Spjd * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1541185029Spjd * then re-generate a more complete list including status information. 1542185029Spjd * Devices which are already active have their details maintained, and are 1543185029Spjd * not re-opened. 1544185029Spjd */ 1545332525Smavvoid 1546185029Spjdspa_load_l2cache(spa_t *spa) 1547185029Spjd{ 1548185029Spjd nvlist_t **l2cache; 1549185029Spjd uint_t nl2cache; 1550185029Spjd int i, j, oldnvdevs; 1551219089Spjd uint64_t guid; 1552185029Spjd vdev_t *vd, **oldvdevs, **newvdevs; 1553185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 1554185029Spjd 1555185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1556185029Spjd 1557185029Spjd if (sav->sav_config != NULL) { 1558185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1559185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1560185029Spjd newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1561185029Spjd } else { 1562185029Spjd nl2cache = 0; 1563247187Smm newvdevs = NULL; 1564185029Spjd } 1565185029Spjd 1566185029Spjd oldvdevs = sav->sav_vdevs; 1567185029Spjd oldnvdevs = sav->sav_count; 1568185029Spjd sav->sav_vdevs = NULL; 1569185029Spjd sav->sav_count = 0; 1570185029Spjd 1571185029Spjd /* 1572185029Spjd * Process new nvlist of vdevs. 1573185029Spjd */ 1574185029Spjd for (i = 0; i < nl2cache; i++) { 1575185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1576185029Spjd &guid) == 0); 1577185029Spjd 1578185029Spjd newvdevs[i] = NULL; 1579185029Spjd for (j = 0; j < oldnvdevs; j++) { 1580185029Spjd vd = oldvdevs[j]; 1581185029Spjd if (vd != NULL && guid == vd->vdev_guid) { 1582185029Spjd /* 1583185029Spjd * Retain previous vdev for add/remove ops. 
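 * (A guid match means this cache device survived the config refresh,
 * so its existing vdev_t is kept rather than creating a new one.)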
1584185029Spjd */ 1585185029Spjd newvdevs[i] = vd; 1586185029Spjd oldvdevs[j] = NULL; 1587185029Spjd break; 1588185029Spjd } 1589185029Spjd } 1590185029Spjd 1591185029Spjd if (newvdevs[i] == NULL) { 1592185029Spjd /* 1593185029Spjd * Create new vdev 1594185029Spjd */ 1595185029Spjd VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1596185029Spjd VDEV_ALLOC_L2CACHE) == 0); 1597185029Spjd ASSERT(vd != NULL); 1598185029Spjd newvdevs[i] = vd; 1599185029Spjd 1600185029Spjd /* 1601185029Spjd * Commit this vdev as an l2cache device, 1602185029Spjd * even if it fails to open. 1603185029Spjd */ 1604185029Spjd spa_l2cache_add(vd); 1605185029Spjd 1606185029Spjd vd->vdev_top = vd; 1607185029Spjd vd->vdev_aux = sav; 1608185029Spjd 1609185029Spjd spa_l2cache_activate(vd); 1610185029Spjd 1611185029Spjd if (vdev_open(vd) != 0) 1612185029Spjd continue; 1613185029Spjd 1614185029Spjd (void) vdev_validate_aux(vd); 1615185029Spjd 1616219089Spjd if (!vdev_is_dead(vd)) 1617219089Spjd l2arc_add_vdev(spa, vd); 1618185029Spjd } 1619185029Spjd } 1620185029Spjd 1621185029Spjd /* 1622185029Spjd * Purge vdevs that were dropped 1623185029Spjd */ 1624185029Spjd for (i = 0; i < oldnvdevs; i++) { 1625185029Spjd uint64_t pool; 1626185029Spjd 1627185029Spjd vd = oldvdevs[i]; 1628185029Spjd if (vd != NULL) { 1629230514Smm ASSERT(vd->vdev_isl2cache); 1630230514Smm 1631209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1632209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1633185029Spjd l2arc_remove_vdev(vd); 1634230514Smm vdev_clear_stats(vd); 1635230514Smm vdev_free(vd); 1636185029Spjd } 1637185029Spjd } 1638185029Spjd 1639185029Spjd if (oldvdevs) 1640185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1641185029Spjd 1642185029Spjd if (sav->sav_config == NULL) 1643185029Spjd goto out; 1644185029Spjd 1645185029Spjd sav->sav_vdevs = newvdevs; 1646185029Spjd sav->sav_count = (int)nl2cache; 1647185029Spjd 1648185029Spjd /* 1649185029Spjd * Recompute the stashed list of l2cache devices, with status 1650185029Spjd * information this time. 
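 * This mirrors the tail of spa_load_spares(): each entry is regenerated
 * with vdev_config_generate() and stored back under ZPOOL_CONFIG_L2CACHE
 * in sav_config.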
1651185029Spjd */ 1652185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1653185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1654185029Spjd 1655185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1656185029Spjd for (i = 0; i < sav->sav_count; i++) 1657185029Spjd l2cache[i] = vdev_config_generate(spa, 1658219089Spjd sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1659185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1660185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1661185029Spjdout: 1662185029Spjd for (i = 0; i < sav->sav_count; i++) 1663185029Spjd nvlist_free(l2cache[i]); 1664185029Spjd if (sav->sav_count) 1665185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1666185029Spjd} 1667185029Spjd 1668168404Spjdstatic int 1669168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1670168404Spjd{ 1671168404Spjd dmu_buf_t *db; 1672168404Spjd char *packed = NULL; 1673168404Spjd size_t nvsize = 0; 1674168404Spjd int error; 1675168404Spjd *value = NULL; 1676168404Spjd 1677262676Sdelphij error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 1678262676Sdelphij if (error != 0) 1679262676Sdelphij return (error); 1680287744Sdelphij 1681168404Spjd nvsize = *(uint64_t *)db->db_data; 1682168404Spjd dmu_buf_rele(db, FTAG); 1683168404Spjd 1684168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1685209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1686209962Smm DMU_READ_PREFETCH); 1687168404Spjd if (error == 0) 1688168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1689168404Spjd kmem_free(packed, nvsize); 1690168404Spjd 1691168404Spjd return (error); 1692168404Spjd} 1693168404Spjd 1694168404Spjd/* 1695185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1696185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 
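 * The check recurses over all children; only leaf vdevs that are both
 * dead and concrete generate the ESC_ZFS_VDEV_CHECK event.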
1697185029Spjd */ 1698185029Spjdstatic void 1699185029Spjdspa_check_removed(vdev_t *vd) 1700185029Spjd{ 1701219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1702185029Spjd spa_check_removed(vd->vdev_child[c]); 1703185029Spjd 1704249188Smm if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 1705332525Smav vdev_is_concrete(vd)) { 1706185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1707331397Smav spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 1708185029Spjd } 1709185029Spjd} 1710185029Spjd 1711299441Smavstatic void 1712299441Smavspa_config_valid_zaps(vdev_t *vd, vdev_t *mvd) 1713299441Smav{ 1714299441Smav ASSERT3U(vd->vdev_children, ==, mvd->vdev_children); 1715299441Smav 1716299441Smav vd->vdev_top_zap = mvd->vdev_top_zap; 1717299441Smav vd->vdev_leaf_zap = mvd->vdev_leaf_zap; 1718299441Smav 1719299441Smav for (uint64_t i = 0; i < vd->vdev_children; i++) { 1720299441Smav spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]); 1721299441Smav } 1722299441Smav} 1723299441Smav 1724185029Spjd/* 1725219089Spjd * Validate the current config against the MOS config 1726213197Smm */ 1727219089Spjdstatic boolean_t 1728219089Spjdspa_config_valid(spa_t *spa, nvlist_t *config) 1729213197Smm{ 1730219089Spjd vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1731219089Spjd nvlist_t *nv; 1732213197Smm 1733219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1734213197Smm 1735219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1736219089Spjd VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1737219089Spjd 1738219089Spjd ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1739219089Spjd 1740219089Spjd /* 1741219089Spjd * If we're doing a normal import, then build up any additional 1742219089Spjd * diagnostic information about missing devices in this config. 1743219089Spjd * We'll pass this up to the user for further processing. 1744219089Spjd */ 1745219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1746219089Spjd nvlist_t **child, *nv; 1747219089Spjd uint64_t idx = 0; 1748219089Spjd 1749219089Spjd child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1750219089Spjd KM_SLEEP); 1751219089Spjd VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1752219089Spjd 1753219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1754219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1755219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1756219089Spjd 1757219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1758219089Spjd mtvd->vdev_ops != &vdev_missing_ops && 1759219089Spjd mtvd->vdev_islog) 1760219089Spjd child[idx++] = vdev_config_generate(spa, mtvd, 1761219089Spjd B_FALSE, 0); 1762219089Spjd } 1763219089Spjd 1764219089Spjd if (idx) { 1765219089Spjd VERIFY(nvlist_add_nvlist_array(nv, 1766219089Spjd ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1767219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1768219089Spjd ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1769219089Spjd 1770219089Spjd for (int i = 0; i < idx; i++) 1771219089Spjd nvlist_free(child[i]); 1772219089Spjd } 1773219089Spjd nvlist_free(nv); 1774219089Spjd kmem_free(child, rvd->vdev_children * sizeof (char **)); 1775219089Spjd } 1776219089Spjd 1777219089Spjd /* 1778219089Spjd * Compare the root vdev tree with the information we have 1779219089Spjd * from the MOS config (mrvd). Check each top-level vdev 1780219089Spjd * with the corresponding MOS config top-level (mtvd). 
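 * Missing and indirect top-level vdevs are swapped for their MOS
 * counterparts; otherwise the log-device state and the per-vdev ZAP
 * object numbers are refreshed from the MOS copy. The vdev_removing and
 * vdev_indirect_config fields are always taken from the MOS.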
1781219089Spjd */ 1782219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1783213197Smm vdev_t *tvd = rvd->vdev_child[c]; 1784219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1785213197Smm 1786219089Spjd /* 1787219089Spjd * Resolve any "missing" vdevs in the current configuration. 1788332525Smav * Also trust the MOS config about any "indirect" vdevs. 1789219089Spjd * If we find that the MOS config has more accurate information 1790219089Spjd * about the top-level vdev then use that vdev instead. 1791219089Spjd */ 1792332525Smav if ((tvd->vdev_ops == &vdev_missing_ops && 1793332525Smav mtvd->vdev_ops != &vdev_missing_ops) || 1794332525Smav (mtvd->vdev_ops == &vdev_indirect_ops && 1795332525Smav tvd->vdev_ops != &vdev_indirect_ops)) { 1796219089Spjd 1797219089Spjd /* 1798219089Spjd * Device specific actions. 1799219089Spjd */ 1800219089Spjd if (mtvd->vdev_islog) { 1801332525Smav if (!(spa->spa_import_flags & 1802332525Smav ZFS_IMPORT_MISSING_LOG)) { 1803332525Smav continue; 1804332525Smav } 1805332525Smav 1806219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 1807332525Smav } else if (mtvd->vdev_ops != &vdev_indirect_ops) { 1808219089Spjd continue; 1809219089Spjd } 1810219089Spjd 1811219089Spjd /* 1812219089Spjd * Swap the missing vdev with the data we were 1813219089Spjd * able to obtain from the MOS config. 1814219089Spjd */ 1815219089Spjd vdev_remove_child(rvd, tvd); 1816219089Spjd vdev_remove_child(mrvd, mtvd); 1817219089Spjd 1818219089Spjd vdev_add_child(rvd, mtvd); 1819219089Spjd vdev_add_child(mrvd, tvd); 1820219089Spjd 1821219089Spjd vdev_reopen(rvd); 1822299441Smav } else { 1823299441Smav if (mtvd->vdev_islog) { 1824299441Smav /* 1825299441Smav * Load the slog device's state from the MOS 1826299441Smav * config since it's possible that the label 1827299441Smav * does not contain the most up-to-date 1828299441Smav * information. 1829299441Smav */ 1830299441Smav vdev_load_log_state(tvd, mtvd); 1831299441Smav vdev_reopen(tvd); 1832299441Smav } 1833299441Smav 1834219089Spjd /* 1835299441Smav * Per-vdev ZAP info is stored exclusively in the MOS. 1836219089Spjd */ 1837299441Smav spa_config_valid_zaps(tvd, mtvd); 1838219089Spjd } 1839332525Smav 1840332525Smav /* 1841332525Smav * Never trust this info from userland; always use what's 1842332525Smav * in the MOS. This prevents it from getting out of sync 1843332525Smav * with the rest of the info in the MOS. 1844332525Smav */ 1845332525Smav tvd->vdev_removing = mtvd->vdev_removing; 1846332525Smav tvd->vdev_indirect_config = mtvd->vdev_indirect_config; 1847213197Smm } 1848299441Smav 1849219089Spjd vdev_free(mrvd); 1850219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1851219089Spjd 1852219089Spjd /* 1853219089Spjd * Ensure we were able to validate the config. 
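 * Validation succeeds only if the sum of vdev guids in the tree matches
 * the guid sum recorded in the uberblock.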
1854219089Spjd */ 1855219089Spjd return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1856213197Smm} 1857213197Smm 1858213197Smm/* 1859185029Spjd * Check for missing log devices 1860185029Spjd */ 1861248571Smmstatic boolean_t 1862185029Spjdspa_check_logs(spa_t *spa) 1863185029Spjd{ 1864248571Smm boolean_t rv = B_FALSE; 1865286686Smav dsl_pool_t *dp = spa_get_dsl(spa); 1866248571Smm 1867185029Spjd switch (spa->spa_log_state) { 1868185029Spjd case SPA_LOG_MISSING: 1869185029Spjd /* need to recheck in case slog has been restored */ 1870185029Spjd case SPA_LOG_UNKNOWN: 1871286686Smav rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 1872286686Smav zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 1873248571Smm if (rv) 1874219089Spjd spa_set_log_state(spa, SPA_LOG_MISSING); 1875185029Spjd break; 1876185029Spjd } 1877248571Smm return (rv); 1878185029Spjd} 1879185029Spjd 1880219089Spjdstatic boolean_t 1881219089Spjdspa_passivate_log(spa_t *spa) 1882219089Spjd{ 1883219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1884219089Spjd boolean_t slog_found = B_FALSE; 1885219089Spjd 1886219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1887219089Spjd 1888219089Spjd if (!spa_has_slogs(spa)) 1889219089Spjd return (B_FALSE); 1890219089Spjd 1891219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1892219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1893219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1894219089Spjd 1895219089Spjd if (tvd->vdev_islog) { 1896219089Spjd metaslab_group_passivate(mg); 1897219089Spjd slog_found = B_TRUE; 1898219089Spjd } 1899219089Spjd } 1900219089Spjd 1901219089Spjd return (slog_found); 1902219089Spjd} 1903219089Spjd 1904219089Spjdstatic void 1905219089Spjdspa_activate_log(spa_t *spa) 1906219089Spjd{ 1907219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1908219089Spjd 1909219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1910219089Spjd 1911219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1912219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1913219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1914219089Spjd 1915219089Spjd if (tvd->vdev_islog) 1916219089Spjd metaslab_group_activate(mg); 1917219089Spjd } 1918219089Spjd} 1919219089Spjd 1920219089Spjdint 1921332525Smavspa_reset_logs(spa_t *spa) 1922219089Spjd{ 1923248571Smm int error; 1924219089Spjd 1925332525Smav error = dmu_objset_find(spa_name(spa), zil_reset, 1926248571Smm NULL, DS_FIND_CHILDREN); 1927248571Smm if (error == 0) { 1928219089Spjd /* 1929219089Spjd * We successfully offlined the log device, sync out the 1930219089Spjd * current txg so that the "stubby" block can be removed 1931219089Spjd * by zil_sync(). 
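 * The txg_wait_synced() call below blocks until that sync completes.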
1932219089Spjd */ 1933219089Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1934219089Spjd } 1935219089Spjd return (error); 1936219089Spjd} 1937219089Spjd 1938219089Spjdstatic void 1939219089Spjdspa_aux_check_removed(spa_aux_vdev_t *sav) 1940219089Spjd{ 1941219089Spjd int i; 1942219089Spjd 1943219089Spjd for (i = 0; i < sav->sav_count; i++) 1944219089Spjd spa_check_removed(sav->sav_vdevs[i]); 1945219089Spjd} 1946219089Spjd 1947219089Spjdvoid 1948219089Spjdspa_claim_notify(zio_t *zio) 1949219089Spjd{ 1950219089Spjd spa_t *spa = zio->io_spa; 1951219089Spjd 1952219089Spjd if (zio->io_error) 1953219089Spjd return; 1954219089Spjd 1955219089Spjd mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1956219089Spjd if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1957219089Spjd spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1958219089Spjd mutex_exit(&spa->spa_props_lock); 1959219089Spjd} 1960219089Spjd 1961219089Spjdtypedef struct spa_load_error { 1962219089Spjd uint64_t sle_meta_count; 1963219089Spjd uint64_t sle_data_count; 1964219089Spjd} spa_load_error_t; 1965219089Spjd 1966219089Spjdstatic void 1967219089Spjdspa_load_verify_done(zio_t *zio) 1968219089Spjd{ 1969219089Spjd blkptr_t *bp = zio->io_bp; 1970219089Spjd spa_load_error_t *sle = zio->io_private; 1971219089Spjd dmu_object_type_t type = BP_GET_TYPE(bp); 1972219089Spjd int error = zio->io_error; 1973268720Sdelphij spa_t *spa = zio->io_spa; 1974219089Spjd 1975321610Smav abd_free(zio->io_abd); 1976219089Spjd if (error) { 1977236884Smm if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 1978219089Spjd type != DMU_OT_INTENT_LOG) 1979270247Sdelphij atomic_inc_64(&sle->sle_meta_count); 1980219089Spjd else 1981270247Sdelphij atomic_inc_64(&sle->sle_data_count); 1982219089Spjd } 1983268720Sdelphij 1984268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 1985268720Sdelphij spa->spa_scrub_inflight--; 1986268720Sdelphij cv_broadcast(&spa->spa_scrub_io_cv); 1987268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 1988219089Spjd} 1989219089Spjd 1990268720Sdelphij/* 1991268720Sdelphij * Maximum number of concurrent scrub i/os to create while verifying 1992268720Sdelphij * a pool while importing it. 
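 * The three tunables below are exported as vfs.zfs.* sysctls on FreeBSD
 * via the SYSCTL_INT declarations that follow.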
1993268720Sdelphij */ 1994268720Sdelphijint spa_load_verify_maxinflight = 10000; 1995268720Sdelphijboolean_t spa_load_verify_metadata = B_TRUE; 1996268720Sdelphijboolean_t spa_load_verify_data = B_TRUE; 1997268720Sdelphij 1998268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, 1999268720Sdelphij &spa_load_verify_maxinflight, 0, 2000268720Sdelphij "Maximum number of concurrent scrub I/Os to create while verifying a " 2001268720Sdelphij "pool while importing it"); 2002268720Sdelphij 2003268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, 2004268720Sdelphij &spa_load_verify_metadata, 0, 2005268720Sdelphij "Check metadata on import?"); 2006268720Sdelphij 2007268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, 2008268720Sdelphij &spa_load_verify_data, 0, 2009268720Sdelphij "Check user data on import?"); 2010268720Sdelphij 2011219089Spjd/*ARGSUSED*/ 2012219089Spjdstatic int 2013219089Spjdspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2014268123Sdelphij const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2015219089Spjd{ 2016286705Smav if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 2017268720Sdelphij return (0); 2018268720Sdelphij /* 2019268720Sdelphij * Note: normally this routine will not be called if 2020268720Sdelphij * spa_load_verify_metadata is not set. However, it may be useful 2021268720Sdelphij * to manually set the flag after the traversal has begun. 2022268720Sdelphij */ 2023268720Sdelphij if (!spa_load_verify_metadata) 2024268720Sdelphij return (0); 2025321610Smav if (!BP_IS_METADATA(bp) && !spa_load_verify_data) 2026268720Sdelphij return (0); 2027219089Spjd 2028268720Sdelphij zio_t *rio = arg; 2029268720Sdelphij size_t size = BP_GET_PSIZE(bp); 2030268720Sdelphij 2031268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 2032268720Sdelphij while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) 2033268720Sdelphij cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2034268720Sdelphij spa->spa_scrub_inflight++; 2035268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 2036268720Sdelphij 2037321610Smav zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2038268720Sdelphij spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2039268720Sdelphij ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2040268720Sdelphij ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2041219089Spjd return (0); 2042219089Spjd} 2043219089Spjd 2044307045Smav/* ARGSUSED */ 2045307045Smavint 2046307045Smavverify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2047307045Smav{ 2048307108Smav if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2049307045Smav return (SET_ERROR(ENAMETOOLONG)); 2050307045Smav 2051307045Smav return (0); 2052307045Smav} 2053307045Smav 2054219089Spjdstatic int 2055219089Spjdspa_load_verify(spa_t *spa) 2056219089Spjd{ 2057219089Spjd zio_t *rio; 2058219089Spjd spa_load_error_t sle = { 0 }; 2059219089Spjd zpool_rewind_policy_t policy; 2060219089Spjd boolean_t verify_ok = B_FALSE; 2061268720Sdelphij int error = 0; 2062219089Spjd 2063219089Spjd zpool_get_rewind_policy(spa->spa_config, &policy); 2064219089Spjd 2065219089Spjd if (policy.zrp_request & ZPOOL_NEVER_REWIND) 2066219089Spjd return (0); 2067219089Spjd 2068307045Smav dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2069307045Smav error = dmu_objset_find_dp(spa->spa_dsl_pool, 2070307045Smav spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2071307045Smav 
DS_FIND_CHILDREN); 2072307045Smav dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2073307045Smav if (error != 0) 2074307045Smav return (error); 2075307045Smav 2076219089Spjd rio = zio_root(spa, NULL, &sle, 2077219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2078219089Spjd 2079268720Sdelphij if (spa_load_verify_metadata) { 2080332530Smav if (spa->spa_extreme_rewind) { 2081332530Smav spa_load_note(spa, "performing a complete scan of the " 2082332530Smav "pool since extreme rewind is on. This may take " 2083332530Smav "a very long time.\n (spa_load_verify_data=%u, " 2084332530Smav "spa_load_verify_metadata=%u)", 2085332530Smav spa_load_verify_data, spa_load_verify_metadata); 2086332530Smav } 2087268720Sdelphij error = traverse_pool(spa, spa->spa_verify_min_txg, 2088268720Sdelphij TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, 2089268720Sdelphij spa_load_verify_cb, rio); 2090268720Sdelphij } 2091219089Spjd 2092219089Spjd (void) zio_wait(rio); 2093219089Spjd 2094219089Spjd spa->spa_load_meta_errors = sle.sle_meta_count; 2095219089Spjd spa->spa_load_data_errors = sle.sle_data_count; 2096219089Spjd 2097332531Smav if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2098332531Smav spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2099332531Smav "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2100332531Smav (u_longlong_t)sle.sle_data_count); 2101332531Smav } 2102332531Smav 2103332531Smav if (spa_load_verify_dryrun || 2104332531Smav (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 2105332531Smav sle.sle_data_count <= policy.zrp_maxdata)) { 2106219089Spjd int64_t loss = 0; 2107219089Spjd 2108219089Spjd verify_ok = B_TRUE; 2109219089Spjd spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2110219089Spjd spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2111219089Spjd 2112219089Spjd loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2113219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 2114219089Spjd ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 2115219089Spjd VERIFY(nvlist_add_int64(spa->spa_load_info, 2116219089Spjd ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 2117219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 2118219089Spjd ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 2119219089Spjd } else { 2120219089Spjd spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2121219089Spjd } 2122219089Spjd 2123332531Smav if (spa_load_verify_dryrun) 2124332531Smav return (0); 2125332531Smav 2126219089Spjd if (error) { 2127219089Spjd if (error != ENXIO && error != EIO) 2128249195Smm error = SET_ERROR(EIO); 2129219089Spjd return (error); 2130219089Spjd } 2131219089Spjd 2132219089Spjd return (verify_ok ? 0 : EIO); 2133219089Spjd} 2134219089Spjd 2135185029Spjd/* 2136219089Spjd * Find a value in the pool props object. 2137168404Spjd */ 2138219089Spjdstatic void 2139219089Spjdspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2140219089Spjd{ 2141219089Spjd (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2142219089Spjd zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2143219089Spjd} 2144219089Spjd 2145219089Spjd/* 2146219089Spjd * Find a value in the pool directory object. 
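 * This is a zap_lookup() against DMU_POOL_DIRECTORY_OBJECT; a missing
 * entry (ENOENT) is only logged when the caller passes log_enoent.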
2147219089Spjd */ 2148168404Spjdstatic int 2149332530Smavspa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2150168404Spjd{ 2151332530Smav int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2152332530Smav name, sizeof (uint64_t), 1, val); 2153332530Smav 2154332530Smav if (error != 0 && (error != ENOENT || log_enoent)) { 2155332530Smav spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2156332530Smav "[error=%d]", name, error); 2157332530Smav } 2158332530Smav 2159332530Smav return (error); 2160219089Spjd} 2161168404Spjd 2162219089Spjdstatic int 2163219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2164219089Spjd{ 2165219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2166332525Smav return (SET_ERROR(err)); 2167219089Spjd} 2168219089Spjd 2169219089Spjd/* 2170219089Spjd * Fix up config after a partly-completed split. This is done with the 2171219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2172219089Spjd * pool have that entry in their config, but only the splitting one contains 2173219089Spjd * a list of all the guids of the vdevs that are being split off. 2174219089Spjd * 2175219089Spjd * This function determines what to do with that list: either rejoin 2176219089Spjd * all the disks to the pool, or complete the splitting process. To attempt 2177219089Spjd * the rejoin, each disk that is offlined is marked online again, and 2178219089Spjd * we do a reopen() call. If the vdev label for every disk that was 2179219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2180219089Spjd * then we call vdev_split() on each disk, and complete the split. 2181219089Spjd * 2182219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 2183219089Spjd * the original pool. 2184219089Spjd */ 2185219089Spjdstatic void 2186219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 2187219089Spjd{ 2188219089Spjd uint_t extracted; 2189219089Spjd uint64_t *glist; 2190219089Spjd uint_t i, gcount; 2191219089Spjd nvlist_t *nvl; 2192219089Spjd vdev_t **vd; 2193219089Spjd boolean_t attempt_reopen; 2194219089Spjd 2195219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2196219089Spjd return; 2197219089Spjd 2198219089Spjd /* check that the config is complete */ 2199219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2200219089Spjd &glist, &gcount) != 0) 2201219089Spjd return; 2202219089Spjd 2203219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2204219089Spjd 2205219089Spjd /* attempt to online all the vdevs & validate */ 2206219089Spjd attempt_reopen = B_TRUE; 2207219089Spjd for (i = 0; i < gcount; i++) { 2208219089Spjd if (glist[i] == 0) /* vdev is hole */ 2209219089Spjd continue; 2210219089Spjd 2211219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2212219089Spjd if (vd[i] == NULL) { 2213219089Spjd /* 2214219089Spjd * Don't bother attempting to reopen the disks; 2215219089Spjd * just do the split. 
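 * (A single guid that cannot be resolved disables the reopen attempt
 * for the whole list; we then go straight to the split decision below.)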
2216219089Spjd */ 2217219089Spjd attempt_reopen = B_FALSE; 2218219089Spjd } else { 2219219089Spjd /* attempt to re-online it */ 2220219089Spjd vd[i]->vdev_offline = B_FALSE; 2221219089Spjd } 2222219089Spjd } 2223219089Spjd 2224219089Spjd if (attempt_reopen) { 2225219089Spjd vdev_reopen(spa->spa_root_vdev); 2226219089Spjd 2227219089Spjd /* check each device to see what state it's in */ 2228219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 2229219089Spjd if (vd[i] != NULL && 2230219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2231219089Spjd break; 2232219089Spjd ++extracted; 2233219089Spjd } 2234219089Spjd } 2235219089Spjd 2236209962Smm /* 2237219089Spjd * If every disk has been moved to the new pool, or if we never 2238219089Spjd * even attempted to look at them, then we split them off for 2239219089Spjd * good. 2240209962Smm */ 2241219089Spjd if (!attempt_reopen || gcount == extracted) { 2242219089Spjd for (i = 0; i < gcount; i++) 2243219089Spjd if (vd[i] != NULL) 2244219089Spjd vdev_split(vd[i]); 2245219089Spjd vdev_reopen(spa->spa_root_vdev); 2246219089Spjd } 2247209962Smm 2248219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 2249219089Spjd} 2250185029Spjd 2251219089Spjdstatic int 2252219089Spjdspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2253332529Smav boolean_t trust_config) 2254219089Spjd{ 2255219089Spjd nvlist_t *config = spa->spa_config; 2256219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 2257228103Smm char *comment; 2258219089Spjd int error; 2259219089Spjd uint64_t pool_guid; 2260219089Spjd nvlist_t *nvl; 2261168404Spjd 2262219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2263249195Smm return (SET_ERROR(EINVAL)); 2264168404Spjd 2265228103Smm ASSERT(spa->spa_comment == NULL); 2266228103Smm if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2267228103Smm spa->spa_comment = spa_strdup(comment); 2268228103Smm 2269168404Spjd /* 2270168404Spjd * Versioning wasn't explicitly added to the label until later, so if 2271168404Spjd * it's not present treat it as the initial version. 2272168404Spjd */ 2273219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2274219089Spjd &spa->spa_ubsync.ub_version) != 0) 2275219089Spjd spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2276168404Spjd 2277168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2278168404Spjd &spa->spa_config_txg); 2279168404Spjd 2280168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2281168404Spjd spa_guid_exists(pool_guid, 0)) { 2282249195Smm error = SET_ERROR(EEXIST); 2283219089Spjd } else { 2284228103Smm spa->spa_config_guid = pool_guid; 2285219089Spjd 2286219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2287219089Spjd &nvl) == 0) { 2288219089Spjd VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2289219089Spjd KM_SLEEP) == 0); 2290219089Spjd } 2291219089Spjd 2292236884Smm nvlist_free(spa->spa_load_info); 2293236884Smm spa->spa_load_info = fnvlist_alloc(); 2294236884Smm 2295219089Spjd gethrestime(&spa->spa_loaded_ts); 2296219089Spjd error = spa_load_impl(spa, pool_guid, config, state, type, 2297332529Smav trust_config, &ereport); 2298168404Spjd } 2299168404Spjd 2300286575Smav /* 2301286575Smav * Don't count references from objsets that are already closed 2302286575Smav * and are making their way through the eviction process. 
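 * spa_evicting_os_wait() below blocks until those objsets are gone, so
 * spa_minref only counts long-lived references.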
2303286575Smav */ 2304286575Smav spa_evicting_os_wait(spa); 2305219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 2306219089Spjd if (error) { 2307219089Spjd if (error != EEXIST) { 2308219089Spjd spa->spa_loaded_ts.tv_sec = 0; 2309219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 2310219089Spjd } 2311219089Spjd if (error != EBADF) { 2312219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2313219089Spjd } 2314219089Spjd } 2315219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2316219089Spjd spa->spa_ena = 0; 2317168404Spjd 2318219089Spjd return (error); 2319219089Spjd} 2320219089Spjd 2321219089Spjd/* 2322299441Smav * Count the number of per-vdev ZAPs associated with all of the vdevs in the 2323299441Smav * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 2324299441Smav * spa's per-vdev ZAP list. 2325299441Smav */ 2326299441Smavstatic uint64_t 2327299441Smavvdev_count_verify_zaps(vdev_t *vd) 2328299441Smav{ 2329299441Smav spa_t *spa = vd->vdev_spa; 2330299441Smav uint64_t total = 0; 2331299441Smav if (vd->vdev_top_zap != 0) { 2332299441Smav total++; 2333299441Smav ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2334299441Smav spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 2335299441Smav } 2336299441Smav if (vd->vdev_leaf_zap != 0) { 2337299441Smav total++; 2338299441Smav ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2339299441Smav spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 2340299441Smav } 2341299441Smav 2342299441Smav for (uint64_t i = 0; i < vd->vdev_children; i++) { 2343299441Smav total += vdev_count_verify_zaps(vd->vdev_child[i]); 2344299441Smav } 2345299441Smav 2346299441Smav return (total); 2347299441Smav} 2348299441Smav 2349219089Spjdstatic int 2350332529Smavspa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2351332530Smav spa_import_type_t type) 2352219089Spjd{ 2353219089Spjd int error = 0; 2354332529Smav nvlist_t *nvtree = NULL; 2355332529Smav int parse; 2356219089Spjd vdev_t *rvd; 2357219089Spjd 2358332530Smav if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 2359332530Smav spa_load_failed(spa, "invalid config provided: '%s' missing", 2360332530Smav ZPOOL_CONFIG_VDEV_TREE); 2361249195Smm return (SET_ERROR(EINVAL)); 2362332530Smav } 2363219089Spjd 2364219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 2365219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2366219089Spjd 2367219089Spjd /* 2368209962Smm * Create "The Godfather" zio to hold all async IOs 2369209962Smm */ 2370272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2371272598Sdelphij KM_SLEEP); 2372272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 2373272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2374272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2375272598Sdelphij ZIO_FLAG_GODFATHER); 2376272598Sdelphij } 2377209962Smm 2378209962Smm /* 2379168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2380168404Spjd * value that will be returned by spa_version() since parsing the 2381168404Spjd * configuration requires knowing the version number. 
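 * The parse runs with the SCL_ALL config locks held as writer and
 * produces the in-core vdev tree rooted at spa->spa_root_vdev.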
2382168404Spjd */ 2383185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2384332529Smav error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 2385185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2386168404Spjd 2387332530Smav if (error != 0) { 2388332530Smav spa_load_failed(spa, "unable to parse config [error=%d]", 2389332530Smav error); 2390219089Spjd return (error); 2391332530Smav } 2392168404Spjd 2393168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2394284304Savg ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2395284304Savg ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2396168404Spjd 2397219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2398219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2399219089Spjd } 2400219089Spjd 2401332529Smav return (0); 2402332529Smav} 2403332529Smav 2404332529Smavstatic int 2405332529Smavspa_ld_open_vdevs(spa_t *spa) 2406332529Smav{ 2407332529Smav int error = 0; 2408332529Smav 2409185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2410332529Smav error = vdev_open(spa->spa_root_vdev); 2411185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2412332530Smav if (error != 0) { 2413332530Smav spa_load_failed(spa, "unable to open vdev tree [error=%d]", 2414332530Smav error); 2415332530Smav } 2416168404Spjd 2417332529Smav return (error); 2418332529Smav} 2419332529Smav 2420332529Smavstatic int 2421332529Smavspa_ld_validate_vdevs(spa_t *spa, spa_import_type_t type, 2422332529Smav boolean_t trust_config) 2423332529Smav{ 2424332529Smav int error = 0; 2425332529Smav vdev_t *rvd = spa->spa_root_vdev; 2426332529Smav 2427168404Spjd /* 2428209962Smm * We need to validate the vdev labels against the configuration that 2429332529Smav * we have in hand, which is dependent on the setting of trust_config. 2430332529Smav * If trust_config is true then we're validating the vdev labels based 2431332529Smav * on that config. Otherwise, we're validating against the cached 2432332529Smav * config (zpool.cache) that was read when we loaded the zfs module, and 2433332529Smav * then later we will recursively call spa_load() and validate against 2434209962Smm * the vdev config. 2435219089Spjd * 2436219089Spjd * If we're assembling a new pool that's been split off from an 2437219089Spjd * existing pool, the labels haven't yet been updated so we skip 2438219089Spjd * validation for now. 2439168404Spjd */ 2440219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2441219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2442332525Smav error = vdev_validate(rvd, trust_config); 2443219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2444168404Spjd 2445332530Smav if (error != 0) { 2446332530Smav spa_load_failed(spa, "vdev_validate failed [error=%d]", 2447332530Smav error); 2448219089Spjd return (error); 2449332530Smav } 2450219089Spjd 2451332530Smav if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 2452332530Smav spa_load_failed(spa, "cannot open vdev tree after " 2453332530Smav "invalidating some vdevs"); 2454249195Smm return (SET_ERROR(ENXIO)); 2455332530Smav } 2456168404Spjd } 2457168404Spjd 2458332529Smav return (0); 2459332529Smav} 2460332529Smav 2461332529Smavstatic int 2462332529Smavspa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type, 2463332529Smav boolean_t trust_config) 2464332529Smav{ 2465332529Smav vdev_t *rvd = spa->spa_root_vdev; 2466332529Smav nvlist_t *label; 2467332529Smav uberblock_t *ub = &spa->spa_uberblock; 2468332529Smav uint64_t children; 2469332529Smav 2470168404Spjd /* 2471168404Spjd * Find the best uberblock. 
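 * vdev_uberblock_load() scans the labels of the vdev tree and returns
 * the best (most recent) valid uberblock along with the label config it
 * was found in.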
2472168404Spjd */ 2473236884Smm vdev_uberblock_load(rvd, ub, &label); 2474168404Spjd 2475168404Spjd /* 2476168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2477168404Spjd */ 2478236884Smm if (ub->ub_txg == 0) { 2479236884Smm nvlist_free(label); 2480332530Smav spa_load_failed(spa, "no valid uberblock found"); 2481219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2482236884Smm } 2483168404Spjd 2484332530Smav spa_load_note(spa, "using uberblock with txg=%llu", 2485332530Smav (u_longlong_t)ub->ub_txg); 2486332530Smav 2487168404Spjd /* 2488236884Smm * If the pool has an unsupported version we can't open it. 2489168404Spjd */ 2490236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2491236884Smm nvlist_free(label); 2492332530Smav spa_load_failed(spa, "version %llu is not supported", 2493332530Smav (u_longlong_t)ub->ub_version); 2494219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2495236884Smm } 2496168404Spjd 2497236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2498236884Smm nvlist_t *features; 2499236884Smm 2500236884Smm /* 2501236884Smm * If we weren't able to find what's necessary for reading the 2502236884Smm * MOS in the label, return failure. 2503236884Smm */ 2504332530Smav if (label == NULL) { 2505332530Smav spa_load_failed(spa, "label config unavailable"); 2506332530Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2507332530Smav ENXIO)); 2508332530Smav } 2509332530Smav 2510332530Smav if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 2511332530Smav &features) != 0) { 2512236884Smm nvlist_free(label); 2513332530Smav spa_load_failed(spa, "invalid label: '%s' missing", 2514332530Smav ZPOOL_CONFIG_FEATURES_FOR_READ); 2515236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2516236884Smm ENXIO)); 2517236884Smm } 2518236884Smm 2519236884Smm /* 2520236884Smm * Update our in-core representation with the definitive values 2521236884Smm * from the label. 2522236884Smm */ 2523236884Smm nvlist_free(spa->spa_label_features); 2524236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2525236884Smm } 2526236884Smm 2527236884Smm nvlist_free(label); 2528236884Smm 2529168404Spjd /* 2530236884Smm * Look through entries in the label nvlist's features_for_read. If 2531236884Smm * there is a feature listed there which we don't understand then we 2532236884Smm * cannot open a pool. 
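 * Unsupported feature names are collected into the
 * ZPOOL_CONFIG_UNSUP_FEAT nvlist in spa_load_info so that userland can
 * report exactly which features are missing.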
2533236884Smm */ 2534236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2535236884Smm nvlist_t *unsup_feat; 2536236884Smm 2537236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2538236884Smm 0); 2539236884Smm 2540236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2541236884Smm NULL); nvp != NULL; 2542236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2543236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2544236884Smm VERIFY(nvlist_add_string(unsup_feat, 2545236884Smm nvpair_name(nvp), "") == 0); 2546236884Smm } 2547236884Smm } 2548236884Smm 2549236884Smm if (!nvlist_empty(unsup_feat)) { 2550236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2551236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2552236884Smm nvlist_free(unsup_feat); 2553332530Smav spa_load_failed(spa, "some features are unsupported"); 2554236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2555236884Smm ENOTSUP)); 2556236884Smm } 2557236884Smm 2558236884Smm nvlist_free(unsup_feat); 2559236884Smm } 2560236884Smm 2561236884Smm /* 2562168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 2563219089Spjd * incomplete configuration. We first check to see if the pool 2564219089Spjd * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2565219089Spjd * If it is, defer the vdev_guid_sum check till later so we 2566219089Spjd * can handle missing vdevs. 2567168404Spjd */ 2568219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2569332525Smav &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE && 2570332530Smav rvd->vdev_guid_sum != ub->ub_guid_sum) { 2571332530Smav spa_load_failed(spa, "guid sum in config doesn't match guid " 2572332530Smav "sum in uberblock (%llu != %llu)", 2573332530Smav (u_longlong_t)rvd->vdev_guid_sum, 2574332530Smav (u_longlong_t)ub->ub_guid_sum); 2575219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2576332530Smav } 2577219089Spjd 2578219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2579219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2580219089Spjd spa_try_repair(spa, config); 2581219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2582219089Spjd nvlist_free(spa->spa_config_splitting); 2583219089Spjd spa->spa_config_splitting = NULL; 2584168404Spjd } 2585168404Spjd 2586168404Spjd /* 2587168404Spjd * Initialize internal SPA structures. 2588168404Spjd */ 2589168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 2590168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2591219089Spjd spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2592219089Spjd TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2593219089Spjd spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2594219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2595219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2596219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2597219089Spjd 2598332529Smav return (0); 2599332529Smav} 2600332525Smav 2601332529Smavstatic int 2602332529Smavspa_ld_open_rootbp(spa_t *spa) 2603332529Smav{ 2604332529Smav int error = 0; 2605332529Smav vdev_t *rvd = spa->spa_root_vdev; 2606332529Smav 2607236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2608332530Smav if (error != 0) { 2609332530Smav spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 2610332530Smav "[error=%d]", error); 2611219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2612332530Smav } 2613168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2614168404Spjd 2615332529Smav return (0); 2616332529Smav} 2617332529Smav 2618332529Smavstatic int 2619332529Smavspa_ld_validate_config(spa_t *spa, spa_import_type_t type) 2620332529Smav{ 2621332529Smav vdev_t *rvd = spa->spa_root_vdev; 2622332529Smav 2623332530Smav if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 2624332530Smav != 0) 2625219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2626168404Spjd 2627332525Smav /* 2628332525Smav * Validate the config, using the MOS config to fill in any 2629332525Smav * information which might be missing. If we fail to validate 2630332525Smav * the config then declare the pool unfit for use. If we're 2631332525Smav * assembling a pool from a split, the log is not transferred 2632332525Smav * over. 2633332525Smav */ 2634332525Smav if (type != SPA_IMPORT_ASSEMBLE) { 2635332525Smav nvlist_t *mos_config; 2636332530Smav if (load_nvlist(spa, spa->spa_config_object, &mos_config) 2637332530Smav != 0) { 2638332530Smav spa_load_failed(spa, "unable to retrieve MOS config"); 2639332525Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2640332530Smav } 2641332525Smav 2642332525Smav if (!spa_config_valid(spa, mos_config)) { 2643332525Smav nvlist_free(mos_config); 2644332530Smav spa_load_failed(spa, "mismatch between config provided " 2645332530Smav "and config stored in MOS"); 2646332525Smav return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2647332525Smav ENXIO)); 2648332525Smav } 2649332525Smav nvlist_free(mos_config); 2650332525Smav 2651332525Smav /* 2652332525Smav * Now that we've validated the config, check the state of the 2653332525Smav * root vdev. If it can't be opened, it indicates one or 2654332525Smav * more toplevel vdevs are faulted. 2655332525Smav */ 2656332530Smav if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 2657332530Smav spa_load_failed(spa, "some top vdevs are unavailable"); 2658332525Smav return (SET_ERROR(ENXIO)); 2659332530Smav } 2660332525Smav } 2661332525Smav 2662332529Smav return (0); 2663332529Smav} 2664332529Smav 2665332529Smavstatic int 2666332529Smavspa_ld_open_indirect_vdev_metadata(spa_t *spa) 2667332529Smav{ 2668332529Smav int error = 0; 2669332529Smav vdev_t *rvd = spa->spa_root_vdev; 2670332529Smav 2671332525Smav /* 2672332525Smav * Everything that we read before spa_remove_init() must be stored 2673332525Smav * on concreted vdevs. Therefore we do this as early as possible. 
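 * spa_remove_init() reads the state of any in-progress device removal;
 * spa_condense_init() below then loads what is needed to resume
 * condensing indirect vdev mappings.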
2674332525Smav */ 2675332530Smav error = spa_remove_init(spa); 2676332530Smav if (error != 0) { 2677332530Smav spa_load_failed(spa, "spa_remove_init failed [error=%d]", 2678332530Smav error); 2679332525Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2680332530Smav } 2681332525Smav 2682332529Smav /* 2683332529Smav * Retrieve information needed to condense indirect vdev mappings. 2684332529Smav */ 2685332529Smav error = spa_condense_init(spa); 2686332529Smav if (error != 0) { 2687332530Smav spa_load_failed(spa, "spa_condense_init failed [error=%d]", 2688332530Smav error); 2689332529Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 2690332529Smav } 2691332529Smav 2692332529Smav return (0); 2693332529Smav} 2694332529Smav 2695332529Smavstatic int 2696332530Smavspa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 2697332529Smav{ 2698332529Smav int error = 0; 2699332529Smav vdev_t *rvd = spa->spa_root_vdev; 2700332529Smav 2701236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2702236884Smm boolean_t missing_feat_read = B_FALSE; 2703238926Smm nvlist_t *unsup_feat, *enabled_feat; 2704236884Smm 2705236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2706332530Smav &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 2707236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2708236884Smm } 2709236884Smm 2710236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2711332530Smav &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 2712236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2713236884Smm } 2714236884Smm 2715236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2716332530Smav &spa->spa_feat_desc_obj, B_TRUE) != 0) { 2717236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2718236884Smm } 2719236884Smm 2720238926Smm enabled_feat = fnvlist_alloc(); 2721238926Smm unsup_feat = fnvlist_alloc(); 2722236884Smm 2723259813Sdelphij if (!spa_features_check(spa, B_FALSE, 2724238926Smm unsup_feat, enabled_feat)) 2725236884Smm missing_feat_read = B_TRUE; 2726236884Smm 2727332530Smav if (spa_writeable(spa) || 2728332530Smav spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 2729259813Sdelphij if (!spa_features_check(spa, B_TRUE, 2730238926Smm unsup_feat, enabled_feat)) { 2731332529Smav *missing_feat_writep = B_TRUE; 2732238926Smm } 2733236884Smm } 2734236884Smm 2735238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2736238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2737238926Smm 2738236884Smm if (!nvlist_empty(unsup_feat)) { 2739238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2740238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2741236884Smm } 2742236884Smm 2743238926Smm fnvlist_free(enabled_feat); 2744238926Smm fnvlist_free(unsup_feat); 2745236884Smm 2746236884Smm if (!missing_feat_read) { 2747236884Smm fnvlist_add_boolean(spa->spa_load_info, 2748236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2749236884Smm } 2750236884Smm 2751236884Smm /* 2752236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2753236884Smm * twofold: to determine whether the pool is available for 2754236884Smm * import in read-write mode and (if it is not) whether the 2755236884Smm * pool is available for import in read-only mode. If the pool 2756236884Smm * is available for import in read-write mode, it is displayed 2757236884Smm * as available in userland; if it is not available for import 2758236884Smm * in read-only mode, it is displayed as unavailable in 2759236884Smm * userland. 
If the pool is available for import in read-only 2760236884Smm * mode but not read-write mode, it is displayed as unavailable 2761236884Smm * in userland with a special note that the pool is actually 2762236884Smm * available for open in read-only mode. 2763236884Smm * 2764236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2765236884Smm * missing a feature for write, we must first determine whether 2766236884Smm * the pool can be opened read-only before returning to 2767236884Smm * userland in order to know whether to display the 2768236884Smm * abovementioned note. 2769236884Smm */ 2770332529Smav if (missing_feat_read || (*missing_feat_writep && 2771236884Smm spa_writeable(spa))) { 2772332530Smav spa_load_failed(spa, "pool uses unsupported features"); 2773236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2774236884Smm ENOTSUP)); 2775236884Smm } 2776260150Sdelphij 2777260150Sdelphij /* 2778260150Sdelphij * Load refcounts for ZFS features from disk into an in-memory 2779260150Sdelphij * cache during SPA initialization. 2780260150Sdelphij */ 2781260150Sdelphij for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2782260150Sdelphij uint64_t refcount; 2783260150Sdelphij 2784260150Sdelphij error = feature_get_refcount_from_disk(spa, 2785260150Sdelphij &spa_feature_table[i], &refcount); 2786260150Sdelphij if (error == 0) { 2787260150Sdelphij spa->spa_feat_refcount_cache[i] = refcount; 2788260150Sdelphij } else if (error == ENOTSUP) { 2789260150Sdelphij spa->spa_feat_refcount_cache[i] = 2790260150Sdelphij SPA_FEATURE_DISABLED; 2791260150Sdelphij } else { 2792332530Smav spa_load_failed(spa, "error getting refcount " 2793332530Smav "for feature %s [error=%d]", 2794332530Smav spa_feature_table[i].fi_guid, error); 2795260150Sdelphij return (spa_vdev_err(rvd, 2796260150Sdelphij VDEV_AUX_CORRUPT_DATA, EIO)); 2797260150Sdelphij } 2798260150Sdelphij } 2799236884Smm } 2800236884Smm 2801260150Sdelphij if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2802260150Sdelphij if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2803332530Smav &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 2804260150Sdelphij return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2805260150Sdelphij } 2806260150Sdelphij 2807332529Smav return (0); 2808332529Smav} 2809332529Smav 2810332529Smavstatic int 2811332529Smavspa_ld_load_special_directories(spa_t *spa) 2812332529Smav{ 2813332529Smav int error = 0; 2814332529Smav vdev_t *rvd = spa->spa_root_vdev; 2815332529Smav 2816236884Smm spa->spa_is_initializing = B_TRUE; 2817236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2818236884Smm spa->spa_is_initializing = B_FALSE; 2819332530Smav if (error != 0) { 2820332530Smav spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 2821236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2822332530Smav } 2823236884Smm 2824332529Smav return (0); 2825332529Smav} 2826168404Spjd 2827332529Smavstatic int 2828332529Smavspa_ld_prepare_for_reload(spa_t *spa, int orig_mode) 2829332529Smav{ 2830332529Smav vdev_t *rvd = spa->spa_root_vdev; 2831168404Spjd 2832332529Smav uint64_t hostid; 2833332529Smav nvlist_t *policy = NULL; 2834332529Smav nvlist_t *mos_config; 2835168498Spjd 2836332530Smav if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 2837332530Smav spa_load_failed(spa, "unable to retrieve MOS config"); 2838332529Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2839332530Smav } 2840168498Spjd 2841332529Smav if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 
2842332529Smav ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2843332529Smav char *hostname; 2844332529Smav unsigned long myhostid = 0; 2845332529Smav 2846332529Smav VERIFY(nvlist_lookup_string(mos_config, 2847332529Smav ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2848332529Smav 2849219089Spjd#ifdef _KERNEL 2850332529Smav myhostid = zone_get_hostid(NULL); 2851219089Spjd#else /* _KERNEL */ 2852332529Smav /* 2853332529Smav * We're emulating the system's hostid in userland, so 2854332529Smav * we can't use zone_get_hostid(). 2855332529Smav */ 2856332529Smav (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2857219089Spjd#endif /* _KERNEL */ 2858332529Smav if (check_hostid && hostid != 0 && myhostid != 0 && 2859332529Smav hostid != myhostid) { 2860332529Smav nvlist_free(mos_config); 2861332529Smav cmn_err(CE_WARN, "pool '%s' could not be " 2862332529Smav "loaded as it was last accessed by " 2863332529Smav "another system (host: %s hostid: 0x%lx). " 2864332529Smav "See: http://illumos.org/msg/ZFS-8000-EY", 2865332529Smav spa_name(spa), hostname, 2866332529Smav (unsigned long)hostid); 2867332529Smav return (SET_ERROR(EBADF)); 2868168498Spjd } 2869332529Smav } 2870332529Smav if (nvlist_lookup_nvlist(spa->spa_config, 2871332529Smav ZPOOL_REWIND_POLICY, &policy) == 0) 2872332529Smav VERIFY(nvlist_add_nvlist(mos_config, 2873332529Smav ZPOOL_REWIND_POLICY, policy) == 0); 2874168498Spjd 2875332529Smav spa_config_set(spa, mos_config); 2876332529Smav spa_unload(spa); 2877332529Smav spa_deactivate(spa); 2878332529Smav spa_activate(spa, orig_mode); 2879168404Spjd 2880332529Smav return (0); 2881332529Smav} 2882168404Spjd 2883332529Smavstatic int 2884332529Smavspa_ld_get_props(spa_t *spa) 2885332529Smav{ 2886332529Smav int error = 0; 2887332529Smav uint64_t obj; 2888332529Smav vdev_t *rvd = spa->spa_root_vdev; 2889332529Smav 2890289422Smav /* Grab the secret checksum salt from the MOS. */ 2891289422Smav error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2892289422Smav DMU_POOL_CHECKSUM_SALT, 1, 2893289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes), 2894289422Smav spa->spa_cksum_salt.zcs_bytes); 2895289422Smav if (error == ENOENT) { 2896289422Smav /* Generate a new salt for subsequent use */ 2897289422Smav (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 2898289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes)); 2899289422Smav } else if (error != 0) { 2900332530Smav spa_load_failed(spa, "unable to retrieve checksum salt from " 2901332530Smav "MOS [error=%d]", error); 2902289422Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2903289422Smav } 2904289422Smav 2905332530Smav if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 2906219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2907219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2908332530Smav if (error != 0) { 2909332530Smav spa_load_failed(spa, "error opening deferred-frees bpobj " 2910332530Smav "[error=%d]", error); 2911219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2912332530Smav } 2913168404Spjd 2914168404Spjd /* 2915168404Spjd * Load the bit that tells us to use the new accounting function 2916168404Spjd * (raid-z deflation). If we have an older pool, this will not 2917168404Spjd * be present. 
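 * ENOENT is tolerated here and for the other optional directory entries
 * loaded below; any other error is treated as corrupt data.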
2918168404Spjd */ 2919332530Smav error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 2920219089Spjd if (error != 0 && error != ENOENT) 2921219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2922168404Spjd 2923219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2924332530Smav &spa->spa_creation_version, B_FALSE); 2925219089Spjd if (error != 0 && error != ENOENT) 2926219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2927219089Spjd 2928168404Spjd /* 2929168404Spjd * Load the persistent error log. If we have an older pool, this will 2930168404Spjd * not be present. 2931168404Spjd */ 2932332530Smav error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 2933332530Smav B_FALSE); 2934219089Spjd if (error != 0 && error != ENOENT) 2935219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2936168404Spjd 2937219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2938332530Smav &spa->spa_errlog_scrub, B_FALSE); 2939219089Spjd if (error != 0 && error != ENOENT) 2940219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2941168404Spjd 2942168404Spjd /* 2943168404Spjd * Load the history object. If we have an older pool, this 2944168404Spjd * will not be present. 2945168404Spjd */ 2946332530Smav error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 2947219089Spjd if (error != 0 && error != ENOENT) 2948219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2949168404Spjd 2950168404Spjd /* 2951299441Smav * Load the per-vdev ZAP map. If we have an older pool, this will not 2952299441Smav * be present; in this case, defer its creation to a later time to 2953299441Smav * avoid dirtying the MOS this early / out of sync context. See 2954299441Smav * spa_sync_config_object. 2955299441Smav */ 2956299441Smav 2957299441Smav /* The sentinel is only available in the MOS config. */ 2958299441Smav nvlist_t *mos_config; 2959332530Smav if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 2960332530Smav spa_load_failed(spa, "unable to retrieve MOS config"); 2961299441Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2962332530Smav } 2963299441Smav 2964299441Smav error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 2965332530Smav &spa->spa_all_vdev_zaps, B_FALSE); 2966299441Smav 2967321540Smav if (error == ENOENT) { 2968321540Smav VERIFY(!nvlist_exists(mos_config, 2969321540Smav ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 2970321540Smav spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 2971321540Smav ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2972321540Smav } else if (error != 0) { 2973299441Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2974321540Smav } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 2975299441Smav /* 2976299441Smav * An older version of ZFS overwrote the sentinel value, so 2977299441Smav * we have orphaned per-vdev ZAPs in the MOS. Defer their 2978299441Smav * destruction to later; see spa_sync_config_object. 2979299441Smav */ 2980299441Smav spa->spa_avz_action = AVZ_ACTION_DESTROY; 2981299441Smav /* 2982299441Smav * We're assuming that no vdevs have had their ZAPs created 2983299441Smav * before this. Better be sure of it. 
2984299441Smav */ 2985299441Smav ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2986299441Smav } 2987299441Smav nvlist_free(mos_config); 2988299441Smav 2989332529Smav spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2990332529Smav 2991332530Smav error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 2992332530Smav B_FALSE); 2993332529Smav if (error && error != ENOENT) 2994332529Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2995332529Smav 2996332529Smav if (error == 0) { 2997332529Smav uint64_t autoreplace; 2998332529Smav 2999332529Smav spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 3000332529Smav spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 3001332529Smav spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 3002332529Smav spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 3003332529Smav spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 3004332529Smav spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 3005332529Smav &spa->spa_dedup_ditto); 3006332529Smav 3007332529Smav spa->spa_autoreplace = (autoreplace != 0); 3008332529Smav } 3009332529Smav 3010332529Smav return (0); 3011332529Smav} 3012332529Smav 3013332529Smavstatic int 3014332529Smavspa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 3015332529Smav{ 3016332529Smav int error = 0; 3017332529Smav vdev_t *rvd = spa->spa_root_vdev; 3018332529Smav 3019299441Smav /* 3020219089Spjd * If we're assembling the pool from the split-off vdevs of 3021219089Spjd * an existing pool, we don't want to attach the spares & cache 3022219089Spjd * devices. 3023219089Spjd */ 3024219089Spjd 3025219089Spjd /* 3026168404Spjd * Load any hot spares for this pool. 3027168404Spjd */ 3028332530Smav error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 3029332530Smav B_FALSE); 3030219089Spjd if (error != 0 && error != ENOENT) 3031219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3032219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 3033185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 3034185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 3035332530Smav &spa->spa_spares.sav_config) != 0) { 3036332530Smav spa_load_failed(spa, "error loading spares nvlist"); 3037219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3038332530Smav } 3039168404Spjd 3040185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3041168404Spjd spa_load_spares(spa); 3042185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3043219089Spjd } else if (error == 0) { 3044219089Spjd spa->spa_spares.sav_sync = B_TRUE; 3045168404Spjd } 3046168404Spjd 3047185029Spjd /* 3048185029Spjd * Load any level 2 ARC devices for this pool. 
3049185029Spjd */ 3050219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 3051332530Smav &spa->spa_l2cache.sav_object, B_FALSE); 3052219089Spjd if (error != 0 && error != ENOENT) 3053219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3054219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 3055185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 3056185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 3057332530Smav &spa->spa_l2cache.sav_config) != 0) { 3058332530Smav spa_load_failed(spa, "error loading l2cache nvlist"); 3059219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3060332530Smav } 3061185029Spjd 3062185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3063185029Spjd spa_load_l2cache(spa); 3064185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3065219089Spjd } else if (error == 0) { 3066219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3067185029Spjd } 3068185029Spjd 3069332529Smav return (0); 3070332529Smav} 3071213197Smm 3072332529Smavstatic int 3073332530Smavspa_ld_load_vdev_metadata(spa_t *spa) 3074332529Smav{ 3075332529Smav int error = 0; 3076332529Smav vdev_t *rvd = spa->spa_root_vdev; 3077185029Spjd 3078168404Spjd /* 3079185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 3080185029Spjd * the ZFS DE that it should not issue any faults for unopenable 3081185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 3082185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 3083185029Spjd * over. 3084185029Spjd */ 3085332530Smav if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 3086185029Spjd spa_check_removed(spa->spa_root_vdev); 3087219089Spjd /* 3088219089Spjd * For the import case, this is done in spa_import(), because 3089219089Spjd * at this point we're using the spare definitions from 3090219089Spjd * the MOS config, not necessarily from the userland config. 3091219089Spjd */ 3092332530Smav if (spa->spa_load_state != SPA_LOAD_IMPORT) { 3093219089Spjd spa_aux_check_removed(&spa->spa_spares); 3094219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 3095219089Spjd } 3096219089Spjd } 3097185029Spjd 3098185029Spjd /* 3099332529Smav * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 3100168404Spjd */ 3101332525Smav error = vdev_load(rvd); 3102332525Smav if (error != 0) { 3103332530Smav spa_load_failed(spa, "vdev_load failed [error=%d]", error); 3104332525Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3105332525Smav } 3106168404Spjd 3107168404Spjd /* 3108332529Smav * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
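	 *
	 * vdev_dtl_reassess() recurses from the root vdev, deriving each
	 * interior vdev's DTL from its children, so the resilver check
	 * made near the end of spa_load_impl() sees up-to-date DTLs.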
3109168404Spjd */ 3110185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3111168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 3112185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3113168404Spjd 3114332529Smav return (0); 3115332529Smav} 3116332529Smav 3117332529Smavstatic int 3118332529Smavspa_ld_load_dedup_tables(spa_t *spa) 3119332529Smav{ 3120332529Smav int error = 0; 3121332529Smav vdev_t *rvd = spa->spa_root_vdev; 3122332529Smav 3123219089Spjd error = ddt_load(spa); 3124332530Smav if (error != 0) { 3125332530Smav spa_load_failed(spa, "ddt_load failed [error=%d]", error); 3126219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3127332530Smav } 3128219089Spjd 3129332529Smav return (0); 3130332529Smav} 3131219089Spjd 3132332529Smavstatic int 3133332529Smavspa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) 3134332529Smav{ 3135332529Smav vdev_t *rvd = spa->spa_root_vdev; 3136332529Smav 3137332530Smav if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 3138332530Smav boolean_t missing = spa_check_logs(spa); 3139332530Smav if (missing) { 3140332530Smav *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 3141332530Smav spa_load_failed(spa, "spa_check_logs failed"); 3142332530Smav return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 3143332530Smav } 3144168404Spjd } 3145168404Spjd 3146332529Smav return (0); 3147332529Smav} 3148332529Smav 3149332529Smavstatic int 3150332530Smavspa_ld_verify_pool_data(spa_t *spa) 3151332529Smav{ 3152332529Smav int error = 0; 3153332529Smav vdev_t *rvd = spa->spa_root_vdev; 3154332529Smav 3155332529Smav /* 3156332529Smav * We've successfully opened the pool, verify that we're ready 3157332529Smav * to start pushing transactions. 3158332529Smav */ 3159332530Smav if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 3160332529Smav error = spa_load_verify(spa); 3161332529Smav if (error != 0) { 3162332530Smav spa_load_failed(spa, "spa_load_verify failed " 3163332530Smav "[error=%d]", error); 3164332529Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3165332529Smav error)); 3166332529Smav } 3167332529Smav } 3168332529Smav 3169332529Smav return (0); 3170332529Smav} 3171332529Smav 3172332529Smavstatic void 3173332529Smavspa_ld_claim_log_blocks(spa_t *spa) 3174332529Smav{ 3175332529Smav dmu_tx_t *tx; 3176332529Smav dsl_pool_t *dp = spa_get_dsl(spa); 3177332529Smav 3178332529Smav /* 3179332529Smav * Claim log blocks that haven't been committed yet. 3180332529Smav * This must all happen in a single txg. 3181332529Smav * Note: spa_claim_max_txg is updated by spa_claim_notify(), 3182332529Smav * invoked from zil_claim_log_block()'s i/o done callback. 3183332529Smav * Price of rollback is that we abandon the log. 
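	 *
	 * For example, if power was lost after ZIL records were written
	 * but before their txg synced, the blocks they occupy are not yet
	 * referenced by any synced-out state. Claiming re-anchors those
	 * blocks in this txg so they cannot be reallocated before the log
	 * is replayed when the owning dataset is first used.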
3184332529Smav */ 3185332529Smav spa->spa_claiming = B_TRUE; 3186332529Smav 3187332529Smav tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 3188332529Smav (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 3189332529Smav zil_claim, tx, DS_FIND_CHILDREN); 3190332529Smav dmu_tx_commit(tx); 3191332529Smav 3192332529Smav spa->spa_claiming = B_FALSE; 3193332529Smav 3194332529Smav spa_set_log_state(spa, SPA_LOG_GOOD); 3195332529Smav} 3196332529Smav 3197332529Smavstatic void 3198332530Smavspa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg) 3199332529Smav{ 3200332529Smav vdev_t *rvd = spa->spa_root_vdev; 3201332529Smav int need_update = B_FALSE; 3202332529Smav 3203332529Smav /* 3204332529Smav * If the config cache is stale, or we have uninitialized 3205332529Smav * metaslabs (see spa_vdev_add()), then update the config. 3206332529Smav * 3207332529Smav * If this is a verbatim import, trust the current 3208332529Smav * in-core spa_config and update the disk labels. 3209332529Smav */ 3210332529Smav if (config_cache_txg != spa->spa_config_txg || 3211332530Smav spa->spa_load_state == SPA_LOAD_IMPORT || 3212332530Smav spa->spa_load_state == SPA_LOAD_RECOVER || 3213332529Smav (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 3214332529Smav need_update = B_TRUE; 3215332529Smav 3216332529Smav for (int c = 0; c < rvd->vdev_children; c++) 3217332529Smav if (rvd->vdev_child[c]->vdev_ms_array == 0) 3218332529Smav need_update = B_TRUE; 3219332529Smav 3220332529Smav /* 3221332529Smav * Update the config cache asychronously in case we're the 3222332529Smav * root pool, in which case the config cache isn't writable yet. 3223332529Smav */ 3224332529Smav if (need_update) 3225332529Smav spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 3226332529Smav} 3227332529Smav 3228332529Smav/* 3229332529Smav * Load an existing storage pool, using the config provided. This config 3230332529Smav * describes which vdevs are part of the pool and is later validated against 3231332529Smav * partial configs present in each vdev's label and an entire copy of the 3232332529Smav * config stored in the MOS. 3233332529Smav */ 3234332529Smavstatic int 3235332529Smavspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 3236332529Smav spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, 3237332529Smav char **ereport) 3238332529Smav{ 3239332529Smav int error = 0; 3240332529Smav uint64_t config_cache_txg = spa->spa_config_txg; 3241332529Smav int orig_mode = spa->spa_mode; 3242332529Smav boolean_t missing_feat_write = B_FALSE; 3243332529Smav 3244332530Smav ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3245332530Smav 3246332530Smav spa->spa_load_state = state; 3247332530Smav spa_load_note(spa, "LOADING"); 3248332530Smav 3249332529Smav /* 3250332529Smav * If this is an untrusted config, first access the pool in read-only 3251332529Smav * mode. We will then retrieve a trusted copy of the config from the MOS 3252332529Smav * and use it to reopen the pool in read-write mode. 3253332529Smav */ 3254332529Smav if (!trust_config) 3255332529Smav spa->spa_mode = FREAD; 3256332529Smav 3257332529Smav /* 3258332529Smav * Parse the config provided to create a vdev tree. 3259332529Smav */ 3260332530Smav error = spa_ld_parse_config(spa, pool_guid, config, type); 3261332529Smav if (error != 0) 3262332529Smav return (error); 3263332529Smav 3264332529Smav /* 3265332529Smav * Now that we have the vdev tree, try to open each vdev. 
This involves 3266332529Smav * opening the underlying physical device, retrieving its geometry and 3267332529Smav * probing the vdev with a dummy I/O. The state of each vdev will be set 3268332529Smav * based on the success of those operations. After this we'll be ready 3269332529Smav * to read from the vdevs. 3270332529Smav */ 3271332529Smav error = spa_ld_open_vdevs(spa); 3272332529Smav if (error != 0) 3273332529Smav return (error); 3274332529Smav 3275332529Smav /* 3276332529Smav * Read the label of each vdev and make sure that the GUIDs stored 3277332529Smav * there match the GUIDs in the config provided. 3278332529Smav */ 3279332529Smav error = spa_ld_validate_vdevs(spa, type, trust_config); 3280332529Smav if (error != 0) 3281332529Smav return (error); 3282332529Smav 3283332529Smav /* 3284332529Smav * Read vdev labels to find the best uberblock (i.e. latest, unless 3285332529Smav * spa_load_max_txg is set) and store it in spa_uberblock. We get the 3286332529Smav * list of features required to read blkptrs in the MOS from the vdev 3287332529Smav * label with the best uberblock and verify that our version of zfs 3288332529Smav * supports them all. 3289332529Smav */ 3290332529Smav error = spa_ld_select_uberblock(spa, config, type, trust_config); 3291332529Smav if (error != 0) 3292332529Smav return (error); 3293332529Smav 3294332529Smav /* 3295332529Smav * Pass that uberblock to the dsl_pool layer which will open the root 3296332529Smav * blkptr. This blkptr points to the latest version of the MOS and will 3297332529Smav * allow us to read its contents. 3298332529Smav */ 3299332529Smav error = spa_ld_open_rootbp(spa); 3300332529Smav if (error != 0) 3301332529Smav return (error); 3302332529Smav 3303332529Smav /* 3304332529Smav * Retrieve the config stored in the MOS and use it to validate the 3305332529Smav * config provided. Also extract some information from the MOS config 3306332529Smav * to update our vdev tree. 3307332529Smav */ 3308332529Smav error = spa_ld_validate_config(spa, type); 3309332529Smav if (error != 0) 3310332529Smav return (error); 3311332529Smav 3312332529Smav /* 3313332529Smav * Retrieve the mapping of indirect vdevs. Those vdevs were removed 3314332529Smav * from the pool and their contents were re-mapped to other vdevs. Note 3315332529Smav * that everything that we read before this step must have been 3316332529Smav * rewritten on concrete vdevs after the last device removal was 3317332529Smav * initiated. Otherwise we could be reading from indirect vdevs before 3318332529Smav * we have loaded their mappings. 3319332529Smav */ 3320332529Smav error = spa_ld_open_indirect_vdev_metadata(spa); 3321332529Smav if (error != 0) 3322332529Smav return (error); 3323332529Smav 3324332529Smav /* 3325332529Smav * Retrieve the full list of active features from the MOS and check if 3326332529Smav * they are all supported. 3327332529Smav */ 3328332530Smav error = spa_ld_check_features(spa, &missing_feat_write); 3329332529Smav if (error != 0) 3330332529Smav return (error); 3331332529Smav 3332332529Smav /* 3333332529Smav * Load several special directories from the MOS needed by the dsl_pool 3334332529Smav * layer. 3335332529Smav */ 3336332529Smav error = spa_ld_load_special_directories(spa); 3337332529Smav if (error != 0) 3338332529Smav return (error); 3339332529Smav 3340332529Smav /* 3341332529Smav * If the config provided is not trusted, discard it and use the config 3342332529Smav * from the MOS to reload the pool. 
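	 *
	 * spa_ld_prepare_for_reload() installs the config retrieved from
	 * the MOS as spa_config and reactivates the pool in its original
	 * mode; the spa_load() call below then repeats the entire load
	 * sequence with that trusted config (SPA_IMPORT_EXISTING).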
3343332529Smav */ 3344332529Smav if (!trust_config) { 3345332529Smav error = spa_ld_prepare_for_reload(spa, orig_mode); 3346332529Smav if (error != 0) 3347332529Smav return (error); 3348332530Smav 3349332530Smav spa_load_note(spa, "RELOADING"); 3350332529Smav return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 3351332529Smav } 3352332529Smav 3353332529Smav /* 3354332529Smav * Retrieve pool properties from the MOS. 3355332529Smav */ 3356332529Smav error = spa_ld_get_props(spa); 3357332529Smav if (error != 0) 3358332529Smav return (error); 3359332529Smav 3360332529Smav /* 3361332529Smav * Retrieve the list of auxiliary devices - cache devices and spares - 3362332529Smav * and open them. 3363332529Smav */ 3364332529Smav error = spa_ld_open_aux_vdevs(spa, type); 3365332529Smav if (error != 0) 3366332529Smav return (error); 3367332529Smav 3368332529Smav /* 3369332529Smav * Load the metadata for all vdevs. Also check if unopenable devices 3370332529Smav * should be autoreplaced. 3371332529Smav */ 3372332530Smav error = spa_ld_load_vdev_metadata(spa); 3373332529Smav if (error != 0) 3374332529Smav return (error); 3375332529Smav 3376332529Smav error = spa_ld_load_dedup_tables(spa); 3377332529Smav if (error != 0) 3378332529Smav return (error); 3379332529Smav 3380332529Smav /* 3381332529Smav * Verify the logs now to make sure we don't have any unexpected errors 3382332529Smav * when we claim log blocks later. 3383332529Smav */ 3384332529Smav error = spa_ld_verify_logs(spa, type, ereport); 3385332529Smav if (error != 0) 3386332529Smav return (error); 3387332529Smav 3388236884Smm if (missing_feat_write) { 3389236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 3390236884Smm 3391236884Smm /* 3392236884Smm * At this point, we know that we can open the pool in 3393236884Smm * read-only mode but not read-write mode. We now have enough 3394236884Smm * information and can return to userland. 3395236884Smm */ 3396332529Smav return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 3397332529Smav ENOTSUP)); 3398236884Smm } 3399236884Smm 3400219089Spjd /* 3401332529Smav * Traverse the last txgs to make sure the pool was left off in a safe 3402332529Smav * state. When performing an extreme rewind, we verify the whole pool, 3403332529Smav * which can take a very long time. 3404219089Spjd */ 3405332530Smav error = spa_ld_verify_pool_data(spa); 3406332529Smav if (error != 0) 3407332529Smav return (error); 3408219089Spjd 3409332529Smav /* 3410332529Smav * Calculate the deflated space for the pool. This must be done before 3411332529Smav * we write anything to the pool because we'd need to update the space 3412332529Smav * accounting using the deflated sizes. 3413332529Smav */ 3414332529Smav spa_update_dspace(spa); 3415332529Smav 3416332529Smav /* 3417332529Smav * We have now retrieved all the information we needed to open the 3418332529Smav * pool. If we are importing the pool in read-write mode, a few 3419332529Smav * additional steps must be performed to finish the import. 3420332529Smav */ 3421219089Spjd if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 3422219089Spjd spa->spa_load_max_txg == UINT64_MAX)) { 3423332529Smav ASSERT(state != SPA_LOAD_TRYIMPORT); 3424168404Spjd 3425332525Smav /* 3426332525Smav * We must check this before we start the sync thread, because 3427332525Smav * we only want to start a condense thread for condense 3428332525Smav * operations that were in progress when the pool was 3429332525Smav * imported. 
Once we start syncing, spa_sync() could 3430332525Smav * initiate a condense (and start a thread for it). In 3431332525Smav * that case it would be wrong to start a second 3432332525Smav * condense thread. 3433332525Smav */ 3434332525Smav boolean_t condense_in_progress = 3435332525Smav (spa->spa_condensing_indirect != NULL); 3436332525Smav 3437332529Smav /* 3438332529Smav * Traverse the ZIL and claim all blocks. 3439332529Smav */ 3440332529Smav spa_ld_claim_log_blocks(spa); 3441209962Smm 3442168404Spjd /* 3443332529Smav * Kick-off the syncing thread. 3444168404Spjd */ 3445168404Spjd spa->spa_sync_on = B_TRUE; 3446168404Spjd txg_sync_start(spa->spa_dsl_pool); 3447168404Spjd 3448168404Spjd /* 3449219089Spjd * Wait for all claims to sync. We sync up to the highest 3450219089Spjd * claimed log block birth time so that claimed log blocks 3451219089Spjd * don't appear to be from the future. spa_claim_max_txg 3452332529Smav * will have been set for us by ZIL traversal operations 3453332529Smav * performed above. 3454168404Spjd */ 3455219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 3456168404Spjd 3457168404Spjd /* 3458332529Smav * Check if we need to request an update of the config. On the 3459332529Smav * next sync, we would update the config stored in vdev labels 3460332529Smav * and the cachefile (by default /etc/zfs/zpool.cache). 3461168404Spjd */ 3462332530Smav spa_ld_check_for_config_update(spa, config_cache_txg); 3463168404Spjd 3464168404Spjd /* 3465208683Spjd * Check all DTLs to see if anything needs resilvering. 3466208683Spjd */ 3467219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 3468332529Smav vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) 3469208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 3470219089Spjd 3471219089Spjd /* 3472248571Smm * Log the fact that we booted up (so that we can detect if 3473248571Smm * we rebooted in the middle of an operation). 3474248571Smm */ 3475248571Smm spa_history_log_version(spa, "open"); 3476248571Smm 3477248571Smm /* 3478219089Spjd * Delete any inconsistent datasets. 3479219089Spjd */ 3480219089Spjd (void) dmu_objset_find(spa_name(spa), 3481219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 3482219089Spjd 3483219089Spjd /* 3484219089Spjd * Clean up any stale temporary dataset userrefs. 3485219089Spjd */ 3486219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 3487332525Smav 3488332525Smav /* 3489332525Smav * Note: unlike condensing, we don't need an analogous 3490332525Smav * "removal_in_progress" dance because no other thread 3491332525Smav * can start a removal while we hold the spa_namespace_lock. 
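		 *
		 * spa_restart_removal() is expected to simply return if no
		 * device removal was in progress when the pool was last
		 * written.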
3492332525Smav */ 3493332525Smav spa_restart_removal(spa); 3494332525Smav 3495332525Smav if (condense_in_progress) 3496332525Smav spa_condense_indirect_restart(spa); 3497168404Spjd } 3498168404Spjd 3499332530Smav spa_load_note(spa, "LOADED"); 3500332530Smav 3501219089Spjd return (0); 3502219089Spjd} 3503168404Spjd 3504219089Spjdstatic int 3505332529Smavspa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config) 3506219089Spjd{ 3507219089Spjd int mode = spa->spa_mode; 3508219089Spjd 3509219089Spjd spa_unload(spa); 3510219089Spjd spa_deactivate(spa); 3511219089Spjd 3512268720Sdelphij spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 3513219089Spjd 3514219089Spjd spa_activate(spa, mode); 3515219089Spjd spa_async_suspend(spa); 3516219089Spjd 3517332530Smav spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 3518332530Smav (u_longlong_t)spa->spa_load_max_txg); 3519332530Smav 3520332529Smav return (spa_load(spa, state, SPA_IMPORT_EXISTING, trust_config)); 3521168404Spjd} 3522168404Spjd 3523236884Smm/* 3524236884Smm * If spa_load() fails this function will try loading prior txg's. If 3525236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 3526236884Smm * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 3527236884Smm * function will not rewind the pool and will return the same error as 3528236884Smm * spa_load(). 3529236884Smm */ 3530219089Spjdstatic int 3531332529Smavspa_load_best(spa_t *spa, spa_load_state_t state, int trust_config, 3532219089Spjd uint64_t max_request, int rewind_flags) 3533219089Spjd{ 3534236884Smm nvlist_t *loadinfo = NULL; 3535219089Spjd nvlist_t *config = NULL; 3536219089Spjd int load_error, rewind_error; 3537219089Spjd uint64_t safe_rewind_txg; 3538219089Spjd uint64_t min_txg; 3539219089Spjd 3540219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 3541219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 3542219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 3543219089Spjd } else { 3544219089Spjd spa->spa_load_max_txg = max_request; 3545268720Sdelphij if (max_request != UINT64_MAX) 3546268720Sdelphij spa->spa_extreme_rewind = B_TRUE; 3547219089Spjd } 3548219089Spjd 3549219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 3550332529Smav trust_config); 3551219089Spjd if (load_error == 0) 3552219089Spjd return (0); 3553219089Spjd 3554219089Spjd if (spa->spa_root_vdev != NULL) 3555219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3556219089Spjd 3557219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 3558219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 3559219089Spjd 3560219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 3561219089Spjd nvlist_free(config); 3562219089Spjd return (load_error); 3563219089Spjd } 3564219089Spjd 3565236884Smm if (state == SPA_LOAD_RECOVER) { 3566236884Smm /* Price of rolling back is discarding txgs, including log */ 3567219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 3568236884Smm } else { 3569236884Smm /* 3570236884Smm * If we aren't rolling back save the load info from our first 3571236884Smm * import attempt so that we can restore it after attempting 3572236884Smm * to rewind. 
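		 *
		 * The rewind attempts below generate their own load info;
		 * stashing the original lets us return both the initial
		 * failure info and the rewind results (the latter are
		 * re-attached under ZPOOL_CONFIG_REWIND_INFO at the end of
		 * this function).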
3573236884Smm */ 3574236884Smm loadinfo = spa->spa_load_info; 3575236884Smm spa->spa_load_info = fnvlist_alloc(); 3576236884Smm } 3577219089Spjd 3578219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 3579219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 3580219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 3581219089Spjd TXG_INITIAL : safe_rewind_txg; 3582219089Spjd 3583219089Spjd /* 3584219089Spjd * Continue as long as we're finding errors, we're still within 3585219089Spjd * the acceptable rewind range, and we're still finding uberblocks 3586219089Spjd */ 3587219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 3588219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 3589219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 3590219089Spjd spa->spa_extreme_rewind = B_TRUE; 3591332529Smav rewind_error = spa_load_retry(spa, state, trust_config); 3592219089Spjd } 3593219089Spjd 3594219089Spjd spa->spa_extreme_rewind = B_FALSE; 3595219089Spjd spa->spa_load_max_txg = UINT64_MAX; 3596219089Spjd 3597219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 3598219089Spjd spa_config_set(spa, config); 3599325535Savg else 3600325535Savg nvlist_free(config); 3601219089Spjd 3602236884Smm if (state == SPA_LOAD_RECOVER) { 3603236884Smm ASSERT3P(loadinfo, ==, NULL); 3604236884Smm return (rewind_error); 3605236884Smm } else { 3606236884Smm /* Store the rewind info as part of the initial load info */ 3607236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 3608236884Smm spa->spa_load_info); 3609236884Smm 3610236884Smm /* Restore the initial load info */ 3611236884Smm fnvlist_free(spa->spa_load_info); 3612236884Smm spa->spa_load_info = loadinfo; 3613236884Smm 3614236884Smm return (load_error); 3615236884Smm } 3616219089Spjd} 3617219089Spjd 3618168404Spjd/* 3619168404Spjd * Pool Open/Import 3620168404Spjd * 3621168404Spjd * The import case is identical to an open except that the configuration is sent 3622168404Spjd * down from userland, instead of grabbed from the configuration cache. For the 3623168404Spjd * case of an open, the pool configuration will exist in the 3624185029Spjd * POOL_STATE_UNINITIALIZED state. 3625168404Spjd * 3626168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 3627168404Spjd * the same time open the pool, without having to keep around the spa_t in some 3628168404Spjd * ambiguous state. 3629168404Spjd */ 3630168404Spjdstatic int 3631219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 3632219089Spjd nvlist_t **config) 3633168404Spjd{ 3634168404Spjd spa_t *spa; 3635219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 3636168404Spjd int error; 3637168404Spjd int locked = B_FALSE; 3638219089Spjd int firstopen = B_FALSE; 3639168404Spjd 3640168404Spjd *spapp = NULL; 3641168404Spjd 3642168404Spjd /* 3643168404Spjd * As disgusting as this is, we need to support recursive calls to this 3644168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 3645168404Spjd * up calling spa_open() again. The real fix is to figure out how to 3646168404Spjd * avoid dsl_dir_open() calling this in the first place. 
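	 *
	 * For reference, a typical external caller follows this pattern
	 * (an illustrative sketch, not code taken from this file):
	 *
	 *	spa_t *spa;
	 *	int err = spa_open(name, &spa, FTAG);
	 *	if (err == 0) {
	 *		... use the pool while holding the open ref ...
	 *		spa_close(spa, FTAG);
	 *	}
	 *
	 * The recursion check below only skips taking spa_namespace_lock
	 * when the current thread already owns it.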
3647168404Spjd */ 3648168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 3649168404Spjd mutex_enter(&spa_namespace_lock); 3650168404Spjd locked = B_TRUE; 3651168404Spjd } 3652168404Spjd 3653168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 3654168404Spjd if (locked) 3655168404Spjd mutex_exit(&spa_namespace_lock); 3656249195Smm return (SET_ERROR(ENOENT)); 3657168404Spjd } 3658219089Spjd 3659168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 3660219089Spjd zpool_rewind_policy_t policy; 3661168404Spjd 3662219089Spjd firstopen = B_TRUE; 3663219089Spjd 3664219089Spjd zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 3665219089Spjd &policy); 3666219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 3667219089Spjd state = SPA_LOAD_RECOVER; 3668219089Spjd 3669209962Smm spa_activate(spa, spa_mode_global); 3670168404Spjd 3671219089Spjd if (state != SPA_LOAD_RECOVER) 3672219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3673168404Spjd 3674332530Smav zfs_dbgmsg("spa_open_common: opening %s", pool); 3675219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 3676219089Spjd policy.zrp_request); 3677219089Spjd 3678168404Spjd if (error == EBADF) { 3679168404Spjd /* 3680168404Spjd * If vdev_validate() returns failure (indicated by 3681168404Spjd * EBADF), it indicates that one of the vdevs indicates 3682168404Spjd * that the pool has been exported or destroyed. If 3683168404Spjd * this is the case, the config cache is out of sync and 3684168404Spjd * we should remove the pool from the namespace. 3685168404Spjd */ 3686168404Spjd spa_unload(spa); 3687168404Spjd spa_deactivate(spa); 3688332525Smav spa_write_cachefile(spa, B_TRUE, B_TRUE); 3689168404Spjd spa_remove(spa); 3690168404Spjd if (locked) 3691168404Spjd mutex_exit(&spa_namespace_lock); 3692249195Smm return (SET_ERROR(ENOENT)); 3693168404Spjd } 3694168404Spjd 3695168404Spjd if (error) { 3696168404Spjd /* 3697168404Spjd * We can't open the pool, but we still have useful 3698168404Spjd * information: the state of each vdev after the 3699168404Spjd * attempted vdev_open(). Return this to the user. 3700168404Spjd */ 3701219089Spjd if (config != NULL && spa->spa_config) { 3702219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 3703219089Spjd KM_SLEEP) == 0); 3704219089Spjd VERIFY(nvlist_add_nvlist(*config, 3705219089Spjd ZPOOL_CONFIG_LOAD_INFO, 3706219089Spjd spa->spa_load_info) == 0); 3707219089Spjd } 3708168404Spjd spa_unload(spa); 3709168404Spjd spa_deactivate(spa); 3710219089Spjd spa->spa_last_open_failed = error; 3711168404Spjd if (locked) 3712168404Spjd mutex_exit(&spa_namespace_lock); 3713168404Spjd *spapp = NULL; 3714168404Spjd return (error); 3715168404Spjd } 3716168404Spjd } 3717168404Spjd 3718168404Spjd spa_open_ref(spa, tag); 3719185029Spjd 3720219089Spjd if (config != NULL) 3721219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3722219089Spjd 3723219089Spjd /* 3724219089Spjd * If we've recovered the pool, pass back any information we 3725219089Spjd * gathered while doing the load. 
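	 *
	 * The info lives in spa_load_info and is attached to the returned
	 * config under ZPOOL_CONFIG_LOAD_INFO so userland can report the
	 * outcome of the rewind.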
3726219089Spjd */ 3727219089Spjd if (state == SPA_LOAD_RECOVER) { 3728219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3729219089Spjd spa->spa_load_info) == 0); 3730219089Spjd } 3731219089Spjd 3732219089Spjd if (locked) { 3733219089Spjd spa->spa_last_open_failed = 0; 3734219089Spjd spa->spa_last_ubsync_txg = 0; 3735219089Spjd spa->spa_load_txg = 0; 3736168404Spjd mutex_exit(&spa_namespace_lock); 3737219089Spjd#ifdef __FreeBSD__ 3738219089Spjd#ifdef _KERNEL 3739219089Spjd if (firstopen) 3740249047Savg zvol_create_minors(spa->spa_name); 3741219089Spjd#endif 3742219089Spjd#endif 3743219089Spjd } 3744168404Spjd 3745168404Spjd *spapp = spa; 3746168404Spjd 3747168404Spjd return (0); 3748168404Spjd} 3749168404Spjd 3750168404Spjdint 3751219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3752219089Spjd nvlist_t **config) 3753219089Spjd{ 3754219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 3755219089Spjd} 3756219089Spjd 3757219089Spjdint 3758168404Spjdspa_open(const char *name, spa_t **spapp, void *tag) 3759168404Spjd{ 3760219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 3761168404Spjd} 3762168404Spjd 3763168404Spjd/* 3764168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 3765168404Spjd * preventing it from being exported or destroyed. 3766168404Spjd */ 3767168404Spjdspa_t * 3768168404Spjdspa_inject_addref(char *name) 3769168404Spjd{ 3770168404Spjd spa_t *spa; 3771168404Spjd 3772168404Spjd mutex_enter(&spa_namespace_lock); 3773168404Spjd if ((spa = spa_lookup(name)) == NULL) { 3774168404Spjd mutex_exit(&spa_namespace_lock); 3775168404Spjd return (NULL); 3776168404Spjd } 3777168404Spjd spa->spa_inject_ref++; 3778168404Spjd mutex_exit(&spa_namespace_lock); 3779168404Spjd 3780168404Spjd return (spa); 3781168404Spjd} 3782168404Spjd 3783168404Spjdvoid 3784168404Spjdspa_inject_delref(spa_t *spa) 3785168404Spjd{ 3786168404Spjd mutex_enter(&spa_namespace_lock); 3787168404Spjd spa->spa_inject_ref--; 3788168404Spjd mutex_exit(&spa_namespace_lock); 3789168404Spjd} 3790168404Spjd 3791185029Spjd/* 3792185029Spjd * Add spares device information to the nvlist. 3793185029Spjd */ 3794168404Spjdstatic void 3795168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 3796168404Spjd{ 3797168404Spjd nvlist_t **spares; 3798168404Spjd uint_t i, nspares; 3799168404Spjd nvlist_t *nvroot; 3800168404Spjd uint64_t guid; 3801168404Spjd vdev_stat_t *vs; 3802168404Spjd uint_t vsc; 3803168404Spjd uint64_t pool; 3804168404Spjd 3805209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3806209962Smm 3807185029Spjd if (spa->spa_spares.sav_count == 0) 3808168404Spjd return; 3809168404Spjd 3810168404Spjd VERIFY(nvlist_lookup_nvlist(config, 3811168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3812185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3813168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3814168404Spjd if (nspares != 0) { 3815168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3816168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3817168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3818168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3819168404Spjd 3820168404Spjd /* 3821168404Spjd * Go through and find any spares which have since been 3822168404Spjd * repurposed as an active spare. If this is the case, update 3823168404Spjd * their status appropriately. 
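		 *
		 * Such a spare is reported below with vs_state
		 * VDEV_STATE_CANT_OPEN and vs_aux VDEV_AUX_SPARED, which
		 * userland presents as the spare being in use.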
3824168404Spjd */ 3825168404Spjd for (i = 0; i < nspares; i++) { 3826168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 3827168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3828185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 3829185029Spjd pool != 0ULL) { 3830168404Spjd VERIFY(nvlist_lookup_uint64_array( 3831219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 3832168404Spjd (uint64_t **)&vs, &vsc) == 0); 3833168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 3834168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 3835168404Spjd } 3836168404Spjd } 3837168404Spjd } 3838168404Spjd} 3839168404Spjd 3840185029Spjd/* 3841185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 3842185029Spjd */ 3843185029Spjdstatic void 3844185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 3845185029Spjd{ 3846185029Spjd nvlist_t **l2cache; 3847185029Spjd uint_t i, j, nl2cache; 3848185029Spjd nvlist_t *nvroot; 3849185029Spjd uint64_t guid; 3850185029Spjd vdev_t *vd; 3851185029Spjd vdev_stat_t *vs; 3852185029Spjd uint_t vsc; 3853185029Spjd 3854209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3855209962Smm 3856185029Spjd if (spa->spa_l2cache.sav_count == 0) 3857185029Spjd return; 3858185029Spjd 3859185029Spjd VERIFY(nvlist_lookup_nvlist(config, 3860185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3861185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3862185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3863185029Spjd if (nl2cache != 0) { 3864185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3865185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3866185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3867185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3868185029Spjd 3869185029Spjd /* 3870185029Spjd * Update level 2 cache device stats. 3871185029Spjd */ 3872185029Spjd 3873185029Spjd for (i = 0; i < nl2cache; i++) { 3874185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 3875185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3876185029Spjd 3877185029Spjd vd = NULL; 3878185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3879185029Spjd if (guid == 3880185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3881185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 3882185029Spjd break; 3883185029Spjd } 3884185029Spjd } 3885185029Spjd ASSERT(vd != NULL); 3886185029Spjd 3887185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3888219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3889219089Spjd == 0); 3890185029Spjd vdev_get_stats(vd, vs); 3891185029Spjd } 3892185029Spjd } 3893185029Spjd} 3894185029Spjd 3895236884Smmstatic void 3896236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 3897236884Smm{ 3898236884Smm nvlist_t *features; 3899236884Smm zap_cursor_t zc; 3900236884Smm zap_attribute_t za; 3901236884Smm 3902236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3903236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3904236884Smm 3905253993Smav /* We may be unable to read features if pool is suspended. 
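	 * A suspended pool cannot complete I/O, so the ZAP walks below
	 * could hang; in that case we return an empty feature list.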
*/ 3906253993Smav if (spa_suspended(spa)) 3907253993Smav goto out; 3908253993Smav 3909236884Smm if (spa->spa_feat_for_read_obj != 0) { 3910236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3911236884Smm spa->spa_feat_for_read_obj); 3912236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3913236884Smm zap_cursor_advance(&zc)) { 3914236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3915236884Smm za.za_num_integers == 1); 3916236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3917236884Smm za.za_first_integer)); 3918236884Smm } 3919236884Smm zap_cursor_fini(&zc); 3920236884Smm } 3921236884Smm 3922236884Smm if (spa->spa_feat_for_write_obj != 0) { 3923236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3924236884Smm spa->spa_feat_for_write_obj); 3925236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3926236884Smm zap_cursor_advance(&zc)) { 3927236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3928236884Smm za.za_num_integers == 1); 3929236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3930236884Smm za.za_first_integer)); 3931236884Smm } 3932236884Smm zap_cursor_fini(&zc); 3933236884Smm } 3934236884Smm 3935253993Smavout: 3936236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3937236884Smm features) == 0); 3938236884Smm nvlist_free(features); 3939236884Smm} 3940236884Smm 3941168404Spjdint 3942236884Smmspa_get_stats(const char *name, nvlist_t **config, 3943236884Smm char *altroot, size_t buflen) 3944168404Spjd{ 3945168404Spjd int error; 3946168404Spjd spa_t *spa; 3947168404Spjd 3948168404Spjd *config = NULL; 3949219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 3950168404Spjd 3951209962Smm if (spa != NULL) { 3952209962Smm /* 3953209962Smm * This still leaves a window of inconsistency where the spares 3954209962Smm * or l2cache devices could change and the config would be 3955209962Smm * self-inconsistent. 3956209962Smm */ 3957209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3958168404Spjd 3959209962Smm if (*config != NULL) { 3960219089Spjd uint64_t loadtimes[2]; 3961219089Spjd 3962219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3963219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3964219089Spjd VERIFY(nvlist_add_uint64_array(*config, 3965219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3966219089Spjd 3967185029Spjd VERIFY(nvlist_add_uint64(*config, 3968209962Smm ZPOOL_CONFIG_ERRCOUNT, 3969209962Smm spa_get_errlog_size(spa)) == 0); 3970185029Spjd 3971209962Smm if (spa_suspended(spa)) 3972209962Smm VERIFY(nvlist_add_uint64(*config, 3973209962Smm ZPOOL_CONFIG_SUSPENDED, 3974209962Smm spa->spa_failmode) == 0); 3975209962Smm 3976209962Smm spa_add_spares(spa, *config); 3977209962Smm spa_add_l2cache(spa, *config); 3978236884Smm spa_add_feature_stats(spa, *config); 3979209962Smm } 3980168404Spjd } 3981168404Spjd 3982168404Spjd /* 3983168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 3984168404Spjd * and call spa_lookup() directly. 
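	 *
	 * When the open failed, spa is NULL here, but the spa_t may still
	 * exist in the namespace with a usable altroot; hence the direct
	 * spa_lookup() under spa_namespace_lock below.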
3985168404Spjd */ 3986168404Spjd if (altroot) { 3987168404Spjd if (spa == NULL) { 3988168404Spjd mutex_enter(&spa_namespace_lock); 3989168404Spjd spa = spa_lookup(name); 3990168404Spjd if (spa) 3991168404Spjd spa_altroot(spa, altroot, buflen); 3992168404Spjd else 3993168404Spjd altroot[0] = '\0'; 3994168404Spjd spa = NULL; 3995168404Spjd mutex_exit(&spa_namespace_lock); 3996168404Spjd } else { 3997168404Spjd spa_altroot(spa, altroot, buflen); 3998168404Spjd } 3999168404Spjd } 4000168404Spjd 4001209962Smm if (spa != NULL) { 4002209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 4003168404Spjd spa_close(spa, FTAG); 4004209962Smm } 4005168404Spjd 4006168404Spjd return (error); 4007168404Spjd} 4008168404Spjd 4009168404Spjd/* 4010185029Spjd * Validate that the auxiliary device array is well formed. We must have an 4011185029Spjd * array of nvlists, each which describes a valid leaf vdev. If this is an 4012185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 4013185029Spjd * specified, as long as they are well-formed. 4014168404Spjd */ 4015168404Spjdstatic int 4016185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 4017185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 4018185029Spjd vdev_labeltype_t label) 4019168404Spjd{ 4020185029Spjd nvlist_t **dev; 4021185029Spjd uint_t i, ndev; 4022168404Spjd vdev_t *vd; 4023168404Spjd int error; 4024168404Spjd 4025185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4026185029Spjd 4027168404Spjd /* 4028185029Spjd * It's acceptable to have no devs specified. 4029168404Spjd */ 4030185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 4031168404Spjd return (0); 4032168404Spjd 4033185029Spjd if (ndev == 0) 4034249195Smm return (SET_ERROR(EINVAL)); 4035168404Spjd 4036168404Spjd /* 4037185029Spjd * Make sure the pool is formatted with a version that supports this 4038185029Spjd * device type. 4039168404Spjd */ 4040185029Spjd if (spa_version(spa) < version) 4041249195Smm return (SET_ERROR(ENOTSUP)); 4042168404Spjd 4043168404Spjd /* 4044185029Spjd * Set the pending device list so we correctly handle device in-use 4045168404Spjd * checking. 4046168404Spjd */ 4047185029Spjd sav->sav_pending = dev; 4048185029Spjd sav->sav_npending = ndev; 4049168404Spjd 4050185029Spjd for (i = 0; i < ndev; i++) { 4051185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 4052168404Spjd mode)) != 0) 4053168404Spjd goto out; 4054168404Spjd 4055168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 4056168404Spjd vdev_free(vd); 4057249195Smm error = SET_ERROR(EINVAL); 4058168404Spjd goto out; 4059168404Spjd } 4060168404Spjd 4061185029Spjd /* 4062185029Spjd * The L2ARC currently only supports disk devices in 4063185029Spjd * kernel context. For user-level testing, we allow it. 
4064185029Spjd */ 4065185029Spjd#ifdef _KERNEL 4066185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 4067185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 4068249195Smm error = SET_ERROR(ENOTBLK); 4069230514Smm vdev_free(vd); 4070185029Spjd goto out; 4071185029Spjd } 4072185029Spjd#endif 4073168404Spjd vd->vdev_top = vd; 4074168404Spjd 4075168404Spjd if ((error = vdev_open(vd)) == 0 && 4076185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 4077185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 4078168404Spjd vd->vdev_guid) == 0); 4079168404Spjd } 4080168404Spjd 4081168404Spjd vdev_free(vd); 4082168404Spjd 4083185029Spjd if (error && 4084185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 4085168404Spjd goto out; 4086168404Spjd else 4087168404Spjd error = 0; 4088168404Spjd } 4089168404Spjd 4090168404Spjdout: 4091185029Spjd sav->sav_pending = NULL; 4092185029Spjd sav->sav_npending = 0; 4093168404Spjd return (error); 4094168404Spjd} 4095168404Spjd 4096185029Spjdstatic int 4097185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 4098185029Spjd{ 4099185029Spjd int error; 4100185029Spjd 4101185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4102185029Spjd 4103185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 4104185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 4105185029Spjd VDEV_LABEL_SPARE)) != 0) { 4106185029Spjd return (error); 4107185029Spjd } 4108185029Spjd 4109185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 4110185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 4111185029Spjd VDEV_LABEL_L2CACHE)); 4112185029Spjd} 4113185029Spjd 4114185029Spjdstatic void 4115185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 4116185029Spjd const char *config) 4117185029Spjd{ 4118185029Spjd int i; 4119185029Spjd 4120185029Spjd if (sav->sav_config != NULL) { 4121185029Spjd nvlist_t **olddevs; 4122185029Spjd uint_t oldndevs; 4123185029Spjd nvlist_t **newdevs; 4124185029Spjd 4125185029Spjd /* 4126185029Spjd * Generate new dev list by concatentating with the 4127185029Spjd * current dev list. 4128185029Spjd */ 4129185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 4130185029Spjd &olddevs, &oldndevs) == 0); 4131185029Spjd 4132185029Spjd newdevs = kmem_alloc(sizeof (void *) * 4133185029Spjd (ndevs + oldndevs), KM_SLEEP); 4134185029Spjd for (i = 0; i < oldndevs; i++) 4135185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 4136185029Spjd KM_SLEEP) == 0); 4137185029Spjd for (i = 0; i < ndevs; i++) 4138185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 4139185029Spjd KM_SLEEP) == 0); 4140185029Spjd 4141185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 4142185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 4143185029Spjd 4144185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 4145185029Spjd config, newdevs, ndevs + oldndevs) == 0); 4146185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 4147185029Spjd nvlist_free(newdevs[i]); 4148185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 4149185029Spjd } else { 4150185029Spjd /* 4151185029Spjd * Generate a new dev list. 
4152185029Spjd */ 4153185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 4154185029Spjd KM_SLEEP) == 0); 4155185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 4156185029Spjd devs, ndevs) == 0); 4157185029Spjd } 4158185029Spjd} 4159185029Spjd 4160168404Spjd/* 4161185029Spjd * Stop and drop level 2 ARC devices 4162185029Spjd */ 4163185029Spjdvoid 4164185029Spjdspa_l2cache_drop(spa_t *spa) 4165185029Spjd{ 4166185029Spjd vdev_t *vd; 4167185029Spjd int i; 4168185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 4169185029Spjd 4170185029Spjd for (i = 0; i < sav->sav_count; i++) { 4171185029Spjd uint64_t pool; 4172185029Spjd 4173185029Spjd vd = sav->sav_vdevs[i]; 4174185029Spjd ASSERT(vd != NULL); 4175185029Spjd 4176209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 4177209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 4178185029Spjd l2arc_remove_vdev(vd); 4179185029Spjd } 4180185029Spjd} 4181185029Spjd 4182185029Spjd/* 4183168404Spjd * Pool Creation 4184168404Spjd */ 4185168404Spjdint 4186185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 4187248571Smm nvlist_t *zplprops) 4188168404Spjd{ 4189168404Spjd spa_t *spa; 4190185029Spjd char *altroot = NULL; 4191168404Spjd vdev_t *rvd; 4192168404Spjd dsl_pool_t *dp; 4193168404Spjd dmu_tx_t *tx; 4194219089Spjd int error = 0; 4195168404Spjd uint64_t txg = TXG_INITIAL; 4196185029Spjd nvlist_t **spares, **l2cache; 4197185029Spjd uint_t nspares, nl2cache; 4198219089Spjd uint64_t version, obj; 4199236884Smm boolean_t has_features; 4200168404Spjd 4201168404Spjd /* 4202168404Spjd * If this pool already exists, return failure. 4203168404Spjd */ 4204168404Spjd mutex_enter(&spa_namespace_lock); 4205168404Spjd if (spa_lookup(pool) != NULL) { 4206168404Spjd mutex_exit(&spa_namespace_lock); 4207249195Smm return (SET_ERROR(EEXIST)); 4208168404Spjd } 4209168404Spjd 4210168404Spjd /* 4211168404Spjd * Allocate a new spa_t structure. 
4212168404Spjd */ 4213185029Spjd (void) nvlist_lookup_string(props, 4214185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4215219089Spjd spa = spa_add(pool, NULL, altroot); 4216209962Smm spa_activate(spa, spa_mode_global); 4217168404Spjd 4218185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 4219185029Spjd spa_deactivate(spa); 4220185029Spjd spa_remove(spa); 4221185029Spjd mutex_exit(&spa_namespace_lock); 4222185029Spjd return (error); 4223185029Spjd } 4224185029Spjd 4225236884Smm has_features = B_FALSE; 4226236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 4227236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 4228236884Smm if (zpool_prop_feature(nvpair_name(elem))) 4229236884Smm has_features = B_TRUE; 4230236884Smm } 4231236884Smm 4232236884Smm if (has_features || nvlist_lookup_uint64(props, 4233236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 4234185029Spjd version = SPA_VERSION; 4235236884Smm } 4236236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 4237219089Spjd 4238219089Spjd spa->spa_first_txg = txg; 4239219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 4240185029Spjd spa->spa_uberblock.ub_version = version; 4241168404Spjd spa->spa_ubsync = spa->spa_uberblock; 4242307277Smav spa->spa_load_state = SPA_LOAD_CREATE; 4243332525Smav spa->spa_removing_phys.sr_state = DSS_NONE; 4244332525Smav spa->spa_removing_phys.sr_removing_vdev = -1; 4245332525Smav spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 4246168404Spjd 4247168404Spjd /* 4248209962Smm * Create "The Godfather" zio to hold all async IOs 4249209962Smm */ 4250272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 4251272598Sdelphij KM_SLEEP); 4252272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 4253272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 4254272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 4255272598Sdelphij ZIO_FLAG_GODFATHER); 4256272598Sdelphij } 4257209962Smm 4258209962Smm /* 4259168404Spjd * Create the root vdev. 4260168404Spjd */ 4261185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4262168404Spjd 4263168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 4264168404Spjd 4265168404Spjd ASSERT(error != 0 || rvd != NULL); 4266168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 4267168404Spjd 4268185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 4269249195Smm error = SET_ERROR(EINVAL); 4270168404Spjd 4271168404Spjd if (error == 0 && 4272168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 4273185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 4274168404Spjd VDEV_ALLOC_ADD)) == 0) { 4275219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 4276254591Sgibbs vdev_ashift_optimize(rvd->vdev_child[c]); 4277219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 4278219089Spjd vdev_expand(rvd->vdev_child[c], txg); 4279219089Spjd } 4280168404Spjd } 4281168404Spjd 4282185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4283168404Spjd 4284168404Spjd if (error != 0) { 4285168404Spjd spa_unload(spa); 4286168404Spjd spa_deactivate(spa); 4287168404Spjd spa_remove(spa); 4288168404Spjd mutex_exit(&spa_namespace_lock); 4289168404Spjd return (error); 4290168404Spjd } 4291168404Spjd 4292168404Spjd /* 4293168404Spjd * Get the list of spares, if specified. 
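	 *
	 * For illustration (a hypothetical layout, not taken from this
	 * file), the caller-built nvroot for "zpool create tank da0
	 * spare da1" would look roughly like:
	 *
	 *	vdev_tree:
	 *	    type: "root"
	 *	    children[0]: { type: "disk", path: "/dev/da0", ... }
	 *	    spares[0]:   { type: "disk", path: "/dev/da1", ... }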
4294168404Spjd */ 4295168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4296168404Spjd &spares, &nspares) == 0) { 4297185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 4298168404Spjd KM_SLEEP) == 0); 4299185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4300168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4301185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4302168404Spjd spa_load_spares(spa); 4303185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4304185029Spjd spa->spa_spares.sav_sync = B_TRUE; 4305168404Spjd } 4306168404Spjd 4307185029Spjd /* 4308185029Spjd * Get the list of level 2 cache devices, if specified. 4309185029Spjd */ 4310185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4311185029Spjd &l2cache, &nl2cache) == 0) { 4312185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4313185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 4314185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4315185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4316185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4317185029Spjd spa_load_l2cache(spa); 4318185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4319185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 4320185029Spjd } 4321185029Spjd 4322236884Smm spa->spa_is_initializing = B_TRUE; 4323185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 4324168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 4325236884Smm spa->spa_is_initializing = B_FALSE; 4326168404Spjd 4327219089Spjd /* 4328219089Spjd * Create DDTs (dedup tables). 4329219089Spjd */ 4330219089Spjd ddt_create(spa); 4331219089Spjd 4332219089Spjd spa_update_dspace(spa); 4333219089Spjd 4334168404Spjd tx = dmu_tx_create_assigned(dp, txg); 4335168404Spjd 4336168404Spjd /* 4337168404Spjd * Create the pool config object. 4338168404Spjd */ 4339168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 4340185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 4341168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 4342168404Spjd 4343168404Spjd if (zap_add(spa->spa_meta_objset, 4344168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 4345168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 4346168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 4347168404Spjd } 4348168404Spjd 4349236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 4350236884Smm spa_feature_create_zap_objects(spa, tx); 4351236884Smm 4352219089Spjd if (zap_add(spa->spa_meta_objset, 4353219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 4354219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 4355219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 4356219089Spjd } 4357219089Spjd 4358185029Spjd /* Newly created pools with the right version are always deflated. */ 4359185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 4360185029Spjd spa->spa_deflate = TRUE; 4361185029Spjd if (zap_add(spa->spa_meta_objset, 4362185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 4363185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 4364185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 4365185029Spjd } 4366168404Spjd } 4367168404Spjd 4368168404Spjd /* 4369219089Spjd * Create the deferred-free bpobj. Turn off compression 4370168404Spjd * because sync-to-convergence takes longer if the blocksize 4371168404Spjd * keeps changing. 
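	 *
	 * Roughly: blocks freed in the final passes of spa_sync() are
	 * appended to this bpobj instead of being freed inline, and are
	 * processed at the start of a later sync.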
4372168404Spjd */ 4373219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 4374219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj, 4375168404Spjd ZIO_COMPRESS_OFF, tx); 4376168404Spjd if (zap_add(spa->spa_meta_objset, 4377219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 4378219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) { 4379219089Spjd cmn_err(CE_PANIC, "failed to add bpobj"); 4380168404Spjd } 4381219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 4382219089Spjd spa->spa_meta_objset, obj)); 4383168404Spjd 4384168404Spjd /* 4385168404Spjd * Create the pool's history object. 4386168404Spjd */ 4387185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 4388185029Spjd spa_history_create_obj(spa, tx); 4389168404Spjd 4390185029Spjd /* 4391289422Smav * Generate some random noise for salted checksums to operate on. 4392289422Smav */ 4393289422Smav (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4394289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes)); 4395289422Smav 4396289422Smav /* 4397185029Spjd * Set pool properties. 4398185029Spjd */ 4399185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 4400185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4401185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 4402219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 4403219089Spjd 4404209962Smm if (props != NULL) { 4405209962Smm spa_configfile_set(spa, props, B_FALSE); 4406248571Smm spa_sync_props(props, tx); 4407209962Smm } 4408185029Spjd 4409168404Spjd dmu_tx_commit(tx); 4410168404Spjd 4411168404Spjd spa->spa_sync_on = B_TRUE; 4412168404Spjd txg_sync_start(spa->spa_dsl_pool); 4413168404Spjd 4414168404Spjd /* 4415168404Spjd * We explicitly wait for the first transaction to complete so that our 4416168404Spjd * bean counters are appropriately updated. 4417168404Spjd */ 4418168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 4419168404Spjd 4420332525Smav spa_write_cachefile(spa, B_FALSE, B_TRUE); 4421331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 4422168404Spjd 4423248571Smm spa_history_log_version(spa, "create"); 4424185029Spjd 4425286575Smav /* 4426286575Smav * Don't count references from objsets that are already closed 4427286575Smav * and are making their way through the eviction process. 4428286575Smav */ 4429286575Smav spa_evicting_os_wait(spa); 4430208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 4431307277Smav spa->spa_load_state = SPA_LOAD_NONE; 4432208442Smm 4433168404Spjd mutex_exit(&spa_namespace_lock); 4434168404Spjd 4435168404Spjd return (0); 4436168404Spjd} 4437168404Spjd 4438241286Savg#ifdef _KERNEL 4439277300Ssmh#ifdef illumos 4440185029Spjd/* 4441219089Spjd * Get the root pool information from the root disk, then import the root pool 4442219089Spjd * during the system boot up time. 4443185029Spjd */ 4444219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 4445219089Spjd 4446219089Spjdstatic nvlist_t * 4447219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 4448185029Spjd{ 4449219089Spjd nvlist_t *config; 4450185029Spjd nvlist_t *nvtop, *nvroot; 4451185029Spjd uint64_t pgid; 4452185029Spjd 4453219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 4454219089Spjd return (NULL); 4455219089Spjd 4456168404Spjd /* 4457185029Spjd * Add this top-level vdev to the child array. 
4458168404Spjd */ 4459219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4460219089Spjd &nvtop) == 0); 4461219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4462219089Spjd &pgid) == 0); 4463219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 4464168404Spjd 4465185029Spjd /* 4466185029Spjd * Put this pool's top-level vdevs into a root vdev. 4467185029Spjd */ 4468185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4469219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 4470219089Spjd VDEV_TYPE_ROOT) == 0); 4471185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 4472185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 4473185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 4474185029Spjd &nvtop, 1) == 0); 4475168404Spjd 4476168404Spjd /* 4477185029Spjd * Replace the existing vdev_tree with the new root vdev in 4478185029Spjd * this pool's configuration (remove the old, add the new). 4479168404Spjd */ 4480185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 4481185029Spjd nvlist_free(nvroot); 4482219089Spjd return (config); 4483185029Spjd} 4484168404Spjd 4485185029Spjd/* 4486219089Spjd * Walk the vdev tree and see if we can find a device with "better" 4487219089Spjd * configuration. A configuration is "better" if the label on that 4488219089Spjd * device has a more recent txg. 4489185029Spjd */ 4490219089Spjdstatic void 4491219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 4492185029Spjd{ 4493219089Spjd for (int c = 0; c < vd->vdev_children; c++) 4494219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 4495185029Spjd 4496219089Spjd if (vd->vdev_ops->vdev_op_leaf) { 4497219089Spjd nvlist_t *label; 4498219089Spjd uint64_t label_txg; 4499185029Spjd 4500219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 4501219089Spjd &label) != 0) 4502219089Spjd return; 4503185029Spjd 4504219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 4505219089Spjd &label_txg) == 0); 4506168404Spjd 4507219089Spjd /* 4508219089Spjd * Do we have a better boot device? 4509219089Spjd */ 4510219089Spjd if (label_txg > *txg) { 4511219089Spjd *txg = label_txg; 4512219089Spjd *avd = vd; 4513185029Spjd } 4514219089Spjd nvlist_free(label); 4515185029Spjd } 4516185029Spjd} 4517185029Spjd 4518185029Spjd/* 4519185029Spjd * Import a root pool. 4520185029Spjd * 4521185029Spjd * For x86. devpath_list will consist of devid and/or physpath name of 4522185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 4523185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 4524185029Spjd * 4525185029Spjd * For Sparc, devpath_list consists the physpath name of the booting device 4526185029Spjd * no matter the rootpool is a single device pool or a mirrored pool. 4527185029Spjd * e.g. 4528185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 4529185029Spjd */ 4530185029Spjdint 4531185029Spjdspa_import_rootpool(char *devpath, char *devid) 4532185029Spjd{ 4533219089Spjd spa_t *spa; 4534219089Spjd vdev_t *rvd, *bvd, *avd = NULL; 4535219089Spjd nvlist_t *config, *nvtop; 4536219089Spjd uint64_t guid, txg; 4537185029Spjd char *pname; 4538185029Spjd int error; 4539185029Spjd 4540185029Spjd /* 4541219089Spjd * Read the label from the boot device and generate a configuration. 
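 * spa_generate_rootconf() above does the work: it reads the on-disk label
 * via vdev_disk_read_rootlabel(), wraps the labeled top-level vdev in a
 * synthetic "root" vdev nvlist, and returns the resulting pool config
 * together with the boot vdev's guid.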
4542185029Spjd */ 4543219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 4544219089Spjd#if defined(_OBP) && defined(_KERNEL) 4545219089Spjd if (config == NULL) { 4546219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) { 4547219089Spjd /* iscsi boot */ 4548219089Spjd get_iscsi_bootpath_phy(devpath); 4549219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 4550219089Spjd } 4551219089Spjd } 4552219089Spjd#endif 4553219089Spjd if (config == NULL) { 4554236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 4555219089Spjd devpath); 4556249195Smm return (SET_ERROR(EIO)); 4557219089Spjd } 4558185029Spjd 4559219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4560219089Spjd &pname) == 0); 4561219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 4562185029Spjd 4563209962Smm mutex_enter(&spa_namespace_lock); 4564209962Smm if ((spa = spa_lookup(pname)) != NULL) { 4565209962Smm /* 4566209962Smm * Remove the existing root pool from the namespace so that we 4567209962Smm * can replace it with the correct config we just read in. 4568209962Smm */ 4569209962Smm spa_remove(spa); 4570209962Smm } 4571185029Spjd 4572219089Spjd spa = spa_add(pname, config, NULL); 4573209962Smm spa->spa_is_root = B_TRUE; 4574219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4575331721Smav if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4576331721Smav &spa->spa_ubsync.ub_version) != 0) 4577331721Smav spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4578209962Smm 4579219089Spjd /* 4580219089Spjd * Build up a vdev tree based on the boot device's label config. 4581219089Spjd */ 4582219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4583219089Spjd &nvtop) == 0); 4584219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4585219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4586219089Spjd VDEV_ALLOC_ROOTPOOL); 4587219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4588219089Spjd if (error) { 4589209962Smm mutex_exit(&spa_namespace_lock); 4590219089Spjd nvlist_free(config); 4591219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4592219089Spjd pname); 4593219089Spjd return (error); 4594209962Smm } 4595209962Smm 4596219089Spjd /* 4597219089Spjd * Get the boot vdev. 4598219089Spjd */ 4599219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 4600219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 4601219089Spjd (u_longlong_t)guid); 4602249195Smm error = SET_ERROR(ENOENT); 4603219089Spjd goto out; 4604219089Spjd } 4605209962Smm 4606219089Spjd /* 4607219089Spjd * Determine if there is a better boot device. 4608219089Spjd */ 4609219089Spjd avd = bvd; 4610219089Spjd spa_alt_rootvdev(rvd, &avd, &txg); 4611219089Spjd if (avd != bvd) { 4612219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 4613219089Spjd "try booting from '%s'", avd->vdev_path); 4614249195Smm error = SET_ERROR(EINVAL); 4615219089Spjd goto out; 4616219089Spjd } 4617209962Smm 4618219089Spjd /* 4619219089Spjd * If the boot device is part of a spare vdev then ensure that 4620219089Spjd * we're booting off the active spare. 4621219089Spjd */ 4622219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 4623219089Spjd !bvd->vdev_isspare) { 4624219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 4625219089Spjd "try booting from '%s'", 4626219089Spjd bvd->vdev_parent-> 4627219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 4628249195Smm error = SET_ERROR(EINVAL); 4629219089Spjd goto out; 4630219089Spjd } 4631209962Smm 4632219089Spjd error = 0; 4633219089Spjdout: 4634219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4635219089Spjd vdev_free(rvd); 4636219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4637209962Smm mutex_exit(&spa_namespace_lock); 4638209962Smm 4639219089Spjd nvlist_free(config); 4640219089Spjd return (error); 4641185029Spjd} 4642185029Spjd 4643277300Ssmh#else /* !illumos */ 4644241286Savg 4645243502Savgextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 4646243502Savg uint64_t *count); 4647241286Savg 4648241286Savgstatic nvlist_t * 4649241286Savgspa_generate_rootconf(const char *name) 4650241286Savg{ 4651243502Savg nvlist_t **configs, **tops; 4652241286Savg nvlist_t *config; 4653243502Savg nvlist_t *best_cfg, *nvtop, *nvroot; 4654243502Savg uint64_t *holes; 4655243502Savg uint64_t best_txg; 4656243213Savg uint64_t nchildren; 4657241286Savg uint64_t pgid; 4658243502Savg uint64_t count; 4659243502Savg uint64_t i; 4660243502Savg uint_t nholes; 4661241286Savg 4662243502Savg if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 4663241286Savg return (NULL); 4664241286Savg 4665243502Savg ASSERT3U(count, !=, 0); 4666243502Savg best_txg = 0; 4667243502Savg for (i = 0; i < count; i++) { 4668243502Savg uint64_t txg; 4669243502Savg 4670243502Savg VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 4671243502Savg &txg) == 0); 4672243502Savg if (txg > best_txg) { 4673243502Savg best_txg = txg; 4674243502Savg best_cfg = configs[i]; 4675243502Savg } 4676243502Savg } 4677243502Savg 4678245945Savg nchildren = 1; 4679245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 4680243502Savg holes = NULL; 4681243502Savg nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 4682243502Savg &holes, &nholes); 4683243502Savg 4684244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 4685243502Savg for (i = 0; i < nchildren; i++) { 4686243502Savg if (i >= count) 4687243502Savg break; 4688243502Savg if (configs[i] == NULL) 4689243502Savg continue; 4690243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 4691243502Savg &nvtop) == 0); 4692243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP); 4693243213Savg } 4694243502Savg for (i = 0; holes != NULL && i < nholes; i++) { 4695243502Savg if (i >= nchildren) 4696243502Savg continue; 4697243502Savg if (tops[holes[i]] != NULL) 4698243502Savg continue; 4699243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 4700243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 4701243502Savg VDEV_TYPE_HOLE) == 0); 4702243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 4703243502Savg holes[i]) == 0); 4704243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 4705243502Savg 0) == 0); 4706243502Savg } 4707243502Savg for (i = 0; i < nchildren; i++) { 4708243502Savg if (tops[i] != NULL) 4709243502Savg continue; 4710243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 4711243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 4712243502Savg VDEV_TYPE_MISSING) == 0); 4713243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 4714243502Savg i) == 0); 4715243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 4716243502Savg 0) == 0); 
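		/*
		 * This slot had no readable label, so it is represented
		 * by a synthetic MISSING placeholder; the child array
		 * handed to the root vdev below therefore stays dense
		 * and correctly indexed.
		 */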
4717243502Savg } 4718243213Savg 4719243213Savg /* 4720243502Savg * Create pool config based on the best vdev config. 4721241286Savg */ 4722243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 4723241286Savg 4724241286Savg /* 4725241286Savg * Put this pool's top-level vdevs into a root vdev. 4726241286Savg */ 4727243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4728243502Savg &pgid) == 0); 4729241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4730241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 4731241286Savg VDEV_TYPE_ROOT) == 0); 4732241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 4733241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 4734241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 4735243502Savg tops, nchildren) == 0); 4736241286Savg 4737241286Savg /* 4738241286Savg * Replace the existing vdev_tree with the new root vdev in 4739241286Savg * this pool's configuration (remove the old, add the new). 4740241286Savg */ 4741241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 4742243502Savg 4743243502Savg /* 4744243502Savg * Drop vdev config elements that should not be present at pool level. 4745243502Savg */ 4746243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 4747243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 4748243502Savg 4749243502Savg for (i = 0; i < count; i++) 4750243502Savg nvlist_free(configs[i]); 4751243502Savg kmem_free(configs, count * sizeof(void *)); 4752243502Savg for (i = 0; i < nchildren; i++) 4753243502Savg nvlist_free(tops[i]); 4754243502Savg kmem_free(tops, nchildren * sizeof(void *)); 4755241286Savg nvlist_free(nvroot); 4756241286Savg return (config); 4757241286Savg} 4758241286Savg 4759241286Savgint 4760241286Savgspa_import_rootpool(const char *name) 4761241286Savg{ 4762241286Savg spa_t *spa; 4763241286Savg vdev_t *rvd, *bvd, *avd = NULL; 4764241286Savg nvlist_t *config, *nvtop; 4765241286Savg uint64_t txg; 4766241286Savg char *pname; 4767241286Savg int error; 4768241286Savg 4769241286Savg /* 4770241286Savg * Read the label from the boot device and generate a configuration. 4771241286Savg */ 4772241286Savg config = spa_generate_rootconf(name); 4773243213Savg 4774243213Savg mutex_enter(&spa_namespace_lock); 4775243213Savg if (config != NULL) { 4776243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4777243213Savg &pname) == 0 && strcmp(name, pname) == 0); 4778243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 4779243213Savg == 0); 4780243213Savg 4781243213Savg if ((spa = spa_lookup(pname)) != NULL) { 4782243213Savg /* 4783323746Savg * The pool could already be imported, 4784323746Savg * e.g., after reboot -r. 4785323746Savg */ 4786323746Savg if (spa->spa_state == POOL_STATE_ACTIVE) { 4787323746Savg mutex_exit(&spa_namespace_lock); 4788323746Savg nvlist_free(config); 4789323746Savg return (0); 4790323746Savg } 4791323746Savg 4792323746Savg /* 4793243213Savg * Remove the existing root pool from the namespace so 4794243213Savg * that we can replace it with the correct config 4795243213Savg * we just read in. 4796243213Savg */ 4797243213Savg spa_remove(spa); 4798243213Savg } 4799243213Savg spa = spa_add(pname, config, NULL); 4800243501Savg 4801243501Savg /* 4802243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 4803243501Savg * via spa_version(). 
4804243501Savg */ 4805243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4806243501Savg &spa->spa_ubsync.ub_version) != 0) 4807243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4808243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 4809287100Savg mutex_exit(&spa_namespace_lock); 4810287100Savg nvlist_free(config); 4811241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 4812241286Savg name); 4813241286Savg return (EIO); 4814243213Savg } else { 4815243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 4816241286Savg } 4817241286Savg spa->spa_is_root = B_TRUE; 4818241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4819241286Savg 4820241286Savg /* 4821241286Savg * Build up a vdev tree based on the boot device's label config. 4822241286Savg */ 4823241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4824241286Savg &nvtop) == 0); 4825241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4826241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4827241286Savg VDEV_ALLOC_ROOTPOOL); 4828241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4829241286Savg if (error) { 4830241286Savg mutex_exit(&spa_namespace_lock); 4831241286Savg nvlist_free(config); 4832241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4833241286Savg pname); 4834241286Savg return (error); 4835241286Savg } 4836241286Savg 4837241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4838241286Savg vdev_free(rvd); 4839241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4840241286Savg mutex_exit(&spa_namespace_lock); 4841241286Savg 4842243213Savg nvlist_free(config); 4843243213Savg return (0); 4844241286Savg} 4845241286Savg 4846277300Ssmh#endif /* illumos */ 4847277300Ssmh#endif /* _KERNEL */ 4848219089Spjd 4849209962Smm/* 4850209962Smm * Import a non-root pool into the system. 4851209962Smm */ 4852185029Spjdint 4853219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4854185029Spjd{ 4855209962Smm spa_t *spa; 4856209962Smm char *altroot = NULL; 4857219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 4858219089Spjd zpool_rewind_policy_t policy; 4859219089Spjd uint64_t mode = spa_mode_global; 4860219089Spjd uint64_t readonly = B_FALSE; 4861209962Smm int error; 4862209962Smm nvlist_t *nvroot; 4863209962Smm nvlist_t **spares, **l2cache; 4864209962Smm uint_t nspares, nl2cache; 4865209962Smm 4866209962Smm /* 4867209962Smm * If a pool with this name exists, return failure. 4868209962Smm */ 4869209962Smm mutex_enter(&spa_namespace_lock); 4870219089Spjd if (spa_lookup(pool) != NULL) { 4871209962Smm mutex_exit(&spa_namespace_lock); 4872249195Smm return (SET_ERROR(EEXIST)); 4873209962Smm } 4874209962Smm 4875209962Smm /* 4876209962Smm * Create and initialize the spa structure. 4877209962Smm */ 4878209962Smm (void) nvlist_lookup_string(props, 4879209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4880219089Spjd (void) nvlist_lookup_uint64(props, 4881219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4882219089Spjd if (readonly) 4883219089Spjd mode = FREAD; 4884219089Spjd spa = spa_add(pool, config, altroot); 4885219089Spjd spa->spa_import_flags = flags; 4886209962Smm 4887209962Smm /* 4888219089Spjd * Verbatim import - Take a pool and insert it into the namespace 4889219089Spjd * as if it had been loaded at boot. 
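 * No spa_load() happens on this path: the pool is simply added to the
 * namespace with the caller-supplied config, the cachefile is rewritten,
 * and a pool-import event is posted before returning.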
4890219089Spjd */ 4891219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4892219089Spjd if (props != NULL) 4893219089Spjd spa_configfile_set(spa, props, B_FALSE); 4894219089Spjd 4895332525Smav spa_write_cachefile(spa, B_FALSE, B_TRUE); 4896331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 4897332530Smav zfs_dbgmsg("spa_import: verbatim import of %s", pool); 4898219089Spjd mutex_exit(&spa_namespace_lock); 4899219089Spjd return (0); 4900219089Spjd } 4901219089Spjd 4902219089Spjd spa_activate(spa, mode); 4903219089Spjd 4904219089Spjd /* 4905209962Smm * Don't start async tasks until we know everything is healthy. 4906209962Smm */ 4907209962Smm spa_async_suspend(spa); 4908209962Smm 4909219089Spjd zpool_get_rewind_policy(config, &policy); 4910219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 4911219089Spjd state = SPA_LOAD_RECOVER; 4912219089Spjd 4913209962Smm /* 4914332529Smav * Pass off the heavy lifting to spa_load(). Pass TRUE for trust_config 4915209962Smm * because the user-supplied config is actually the one to trust when 4916209962Smm * doing an import. 4917209962Smm */ 4918219089Spjd if (state != SPA_LOAD_RECOVER) 4919219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4920209962Smm 4921332530Smav zfs_dbgmsg("spa_import: importing %s%s", pool, 4922332530Smav (state == SPA_LOAD_RECOVER) ? " (RECOVERY MODE)" : ""); 4923219089Spjd error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4924219089Spjd policy.zrp_request); 4925219089Spjd 4926219089Spjd /* 4927219089Spjd * Propagate anything learned while loading the pool and pass it 4928219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 4929219089Spjd */ 4930219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4931219089Spjd spa->spa_load_info) == 0); 4932219089Spjd 4933209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4934209962Smm /* 4935209962Smm * Toss any existing sparelist, as it doesn't have any validity 4936209962Smm * anymore, and conflicts with spa_has_spare(). 4937209962Smm */ 4938209962Smm if (spa->spa_spares.sav_config) { 4939209962Smm nvlist_free(spa->spa_spares.sav_config); 4940209962Smm spa->spa_spares.sav_config = NULL; 4941209962Smm spa_load_spares(spa); 4942209962Smm } 4943209962Smm if (spa->spa_l2cache.sav_config) { 4944209962Smm nvlist_free(spa->spa_l2cache.sav_config); 4945209962Smm spa->spa_l2cache.sav_config = NULL; 4946209962Smm spa_load_l2cache(spa); 4947209962Smm } 4948209962Smm 4949209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4950209962Smm &nvroot) == 0); 4951209962Smm if (error == 0) 4952209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4953209962Smm VDEV_ALLOC_SPARE); 4954209962Smm if (error == 0) 4955209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4956209962Smm VDEV_ALLOC_L2CACHE); 4957209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4958209962Smm 4959209962Smm if (props != NULL) 4960209962Smm spa_configfile_set(spa, props, B_FALSE); 4961209962Smm 4962209962Smm if (error != 0 || (props && spa_writeable(spa) && 4963209962Smm (error = spa_prop_set(spa, props)))) { 4964209962Smm spa_unload(spa); 4965209962Smm spa_deactivate(spa); 4966209962Smm spa_remove(spa); 4967209962Smm mutex_exit(&spa_namespace_lock); 4968209962Smm return (error); 4969209962Smm } 4970209962Smm 4971209962Smm spa_async_resume(spa); 4972209962Smm 4973209962Smm /* 4974209962Smm * Override any spares and level 2 cache devices as specified by 4975209962Smm * the user, as these may have correct device names/devids, etc. 
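 * The sav_config nvlists are rebuilt from the caller's nvroot and
 * sav_sync is set, so the refreshed spare and l2cache lists get written
 * out on the next config sync.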
4976209962Smm */ 4977209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4978209962Smm &spares, &nspares) == 0) { 4979209962Smm if (spa->spa_spares.sav_config) 4980209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4981209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4982209962Smm else 4983209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4984209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4985209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4986209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4987209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4988209962Smm spa_load_spares(spa); 4989209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4990209962Smm spa->spa_spares.sav_sync = B_TRUE; 4991209962Smm } 4992209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4993209962Smm &l2cache, &nl2cache) == 0) { 4994209962Smm if (spa->spa_l2cache.sav_config) 4995209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4996209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4997209962Smm else 4998209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4999209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 5000209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 5001209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 5002209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5003209962Smm spa_load_l2cache(spa); 5004209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5005209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 5006209962Smm } 5007209962Smm 5008219089Spjd /* 5009219089Spjd * Check for any removed devices. 5010219089Spjd */ 5011219089Spjd if (spa->spa_autoreplace) { 5012219089Spjd spa_aux_check_removed(&spa->spa_spares); 5013219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 5014219089Spjd } 5015219089Spjd 5016209962Smm if (spa_writeable(spa)) { 5017209962Smm /* 5018209962Smm * Update the config cache to include the newly-imported pool. 5019209962Smm */ 5020209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5021209962Smm } 5022209962Smm 5023219089Spjd /* 5024219089Spjd * It's possible that the pool was expanded while it was exported. 5025219089Spjd * We kick off an async task to handle this for us. 5026219089Spjd */ 5027219089Spjd spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 5028219089Spjd 5029248571Smm spa_history_log_version(spa, "import"); 5030209962Smm 5031331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 5032287745Sdelphij 5033287745Sdelphij mutex_exit(&spa_namespace_lock); 5034287745Sdelphij 5035219089Spjd#ifdef __FreeBSD__ 5036219089Spjd#ifdef _KERNEL 5037219089Spjd zvol_create_minors(pool); 5038219089Spjd#endif 5039219089Spjd#endif 5040209962Smm return (0); 5041185029Spjd} 5042185029Spjd 5043168404Spjdnvlist_t * 5044168404Spjdspa_tryimport(nvlist_t *tryconfig) 5045168404Spjd{ 5046168404Spjd nvlist_t *config = NULL; 5047168404Spjd char *poolname; 5048168404Spjd spa_t *spa; 5049168404Spjd uint64_t state; 5050208443Smm int error; 5051168404Spjd 5052168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 5053168404Spjd return (NULL); 5054168404Spjd 5055168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 5056168404Spjd return (NULL); 5057168404Spjd 5058168404Spjd /* 5059168404Spjd * Create and initialize the spa structure. 
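 * The pool is registered under the reserved TRYIMPORT_NAME and activated
 * read-only (FREAD); it is unloaded and removed again before this
 * function returns, leaving only the generated config behind.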
5060168404Spjd */ 5061168404Spjd mutex_enter(&spa_namespace_lock); 5062219089Spjd spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 5063209962Smm spa_activate(spa, FREAD); 5064168404Spjd 5065332530Smav zfs_dbgmsg("spa_tryimport: importing %s", poolname); 5066332530Smav 5067168404Spjd /* 5068168404Spjd * Pass off the heavy lifting to spa_load(). 5069332529Smav * Pass TRUE for trust_config because the user-supplied config 5070168404Spjd * is actually the one to trust when doing an import. 5071168404Spjd */ 5072219089Spjd error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 5073168404Spjd 5074168404Spjd /* 5075168404Spjd * If 'tryconfig' was at least parsable, return the current config. 5076168404Spjd */ 5077168404Spjd if (spa->spa_root_vdev != NULL) { 5078168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5079168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 5080168404Spjd poolname) == 0); 5081168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5082168404Spjd state) == 0); 5083168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 5084168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 5085236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 5086236884Smm spa->spa_load_info) == 0); 5087168404Spjd 5088168404Spjd /* 5089185029Spjd * If the bootfs property exists on this pool then we 5090185029Spjd * copy it out so that external consumers can tell which 5091185029Spjd * pools are bootable. 5092168404Spjd */ 5093208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 5094185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 5095185029Spjd 5096185029Spjd /* 5097185029Spjd * We have to play games with the name since the 5098185029Spjd * pool was opened as TRYIMPORT_NAME. 5099185029Spjd */ 5100185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 5101185029Spjd spa->spa_bootfs, tmpname) == 0) { 5102185029Spjd char *cp; 5103185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 5104185029Spjd 5105185029Spjd cp = strchr(tmpname, '/'); 5106185029Spjd if (cp == NULL) { 5107185029Spjd (void) strlcpy(dsname, tmpname, 5108185029Spjd MAXPATHLEN); 5109185029Spjd } else { 5110185029Spjd (void) snprintf(dsname, MAXPATHLEN, 5111185029Spjd "%s/%s", poolname, ++cp); 5112185029Spjd } 5113185029Spjd VERIFY(nvlist_add_string(config, 5114185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 5115185029Spjd kmem_free(dsname, MAXPATHLEN); 5116185029Spjd } 5117185029Spjd kmem_free(tmpname, MAXPATHLEN); 5118185029Spjd } 5119185029Spjd 5120185029Spjd /* 5121185029Spjd * Add the list of hot spares and level 2 cache devices. 5122185029Spjd */ 5123209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5124168404Spjd spa_add_spares(spa, config); 5125185029Spjd spa_add_l2cache(spa, config); 5126209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 5127168404Spjd } 5128168404Spjd 5129168404Spjd spa_unload(spa); 5130168404Spjd spa_deactivate(spa); 5131168404Spjd spa_remove(spa); 5132168404Spjd mutex_exit(&spa_namespace_lock); 5133168404Spjd 5134168404Spjd return (config); 5135168404Spjd} 5136168404Spjd 5137168404Spjd/* 5138168404Spjd * Pool export/destroy 5139168404Spjd * 5140168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 5141168404Spjd * is no more pending I/O and any references to the pool are gone. Then, we 5142168404Spjd * update the pool state and sync all the labels to disk, removing the 5143207670Smm * configuration from the cache afterwards. 
If the 'hardforce' flag is set, then 5144207670Smm * we don't sync the labels or remove the configuration cache. 5145168404Spjd */ 5146168404Spjdstatic int 5147185029Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 5148207670Smm boolean_t force, boolean_t hardforce) 5149168404Spjd{ 5150168404Spjd spa_t *spa; 5151168404Spjd 5152168404Spjd if (oldconfig) 5153168404Spjd *oldconfig = NULL; 5154168404Spjd 5155209962Smm if (!(spa_mode_global & FWRITE)) 5156249195Smm return (SET_ERROR(EROFS)); 5157168404Spjd 5158168404Spjd mutex_enter(&spa_namespace_lock); 5159168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 5160168404Spjd mutex_exit(&spa_namespace_lock); 5161249195Smm return (SET_ERROR(ENOENT)); 5162168404Spjd } 5163168404Spjd 5164168404Spjd /* 5165168404Spjd * Put a hold on the pool, drop the namespace lock, stop async tasks, 5166168404Spjd * reacquire the namespace lock, and see if we can export. 5167168404Spjd */ 5168168404Spjd spa_open_ref(spa, FTAG); 5169168404Spjd mutex_exit(&spa_namespace_lock); 5170168404Spjd spa_async_suspend(spa); 5171168404Spjd mutex_enter(&spa_namespace_lock); 5172168404Spjd spa_close(spa, FTAG); 5173168404Spjd 5174168404Spjd /* 5175168404Spjd * The pool will be in core if it's openable, 5176168404Spjd * in which case we can modify its state. 5177168404Spjd */ 5178168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 5179168404Spjd /* 5180168404Spjd * Objsets may be open only because they're dirty, so we 5181168404Spjd * have to force it to sync before checking spa_refcnt. 5182168404Spjd */ 5183168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 5184286575Smav spa_evicting_os_wait(spa); 5185168404Spjd 5186168404Spjd /* 5187168404Spjd * A pool cannot be exported or destroyed if there are active 5188168404Spjd * references. If we are resetting a pool, allow references by 5189168404Spjd * fault injection handlers. 5190168404Spjd */ 5191168404Spjd if (!spa_refcount_zero(spa) || 5192168404Spjd (spa->spa_inject_ref != 0 && 5193168404Spjd new_state != POOL_STATE_UNINITIALIZED)) { 5194168404Spjd spa_async_resume(spa); 5195168404Spjd mutex_exit(&spa_namespace_lock); 5196249195Smm return (SET_ERROR(EBUSY)); 5197168404Spjd } 5198168404Spjd 5199185029Spjd /* 5200185029Spjd * A pool cannot be exported if it has an active shared spare. 5201185029Spjd * This is to prevent other pools stealing the active spare 5202185029Spjd * from an exported pool. At user's own will, such pool can 5203185029Spjd * be forcedly exported. 5204185029Spjd */ 5205185029Spjd if (!force && new_state == POOL_STATE_EXPORTED && 5206185029Spjd spa_has_active_shared_spare(spa)) { 5207185029Spjd spa_async_resume(spa); 5208185029Spjd mutex_exit(&spa_namespace_lock); 5209249195Smm return (SET_ERROR(EXDEV)); 5210185029Spjd } 5211168404Spjd 5212168404Spjd /* 5213168404Spjd * We want this to be reflected on every label, 5214168404Spjd * so mark them all dirty. spa_unload() will do the 5215168404Spjd * final sync that pushes these changes out. 
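 * spa_final_txg is also set a few txgs past the last synced txg below,
 * bounding how much further syncing happens before the pool is unloaded.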
5216168404Spjd */ 5217207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 5218185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5219168404Spjd spa->spa_state = new_state; 5220219089Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 5221219089Spjd TXG_DEFER_SIZE + 1; 5222168404Spjd vdev_config_dirty(spa->spa_root_vdev); 5223185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 5224168404Spjd } 5225168404Spjd } 5226168404Spjd 5227331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 5228185029Spjd 5229168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 5230168404Spjd spa_unload(spa); 5231168404Spjd spa_deactivate(spa); 5232168404Spjd } 5233168404Spjd 5234168404Spjd if (oldconfig && spa->spa_config) 5235168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 5236168404Spjd 5237168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 5238207670Smm if (!hardforce) 5239332525Smav spa_write_cachefile(spa, B_TRUE, B_TRUE); 5240168404Spjd spa_remove(spa); 5241168404Spjd } 5242168404Spjd mutex_exit(&spa_namespace_lock); 5243168404Spjd 5244168404Spjd return (0); 5245168404Spjd} 5246168404Spjd 5247168404Spjd/* 5248168404Spjd * Destroy a storage pool. 5249168404Spjd */ 5250168404Spjdint 5251168404Spjdspa_destroy(char *pool) 5252168404Spjd{ 5253207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 5254207670Smm B_FALSE, B_FALSE)); 5255168404Spjd} 5256168404Spjd 5257168404Spjd/* 5258168404Spjd * Export a storage pool. 5259168404Spjd */ 5260168404Spjdint 5261207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 5262207670Smm boolean_t hardforce) 5263168404Spjd{ 5264207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 5265207670Smm force, hardforce)); 5266168404Spjd} 5267168404Spjd 5268168404Spjd/* 5269168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 5270168404Spjd * from the namespace in any way. 5271168404Spjd */ 5272168404Spjdint 5273168404Spjdspa_reset(char *pool) 5274168404Spjd{ 5275185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 5276207670Smm B_FALSE, B_FALSE)); 5277168404Spjd} 5278168404Spjd 5279168404Spjd/* 5280168404Spjd * ========================================================================== 5281168404Spjd * Device manipulation 5282168404Spjd * ========================================================================== 5283168404Spjd */ 5284168404Spjd 5285168404Spjd/* 5286185029Spjd * Add a device to a storage pool. 
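 * The nvroot tree has the same shape as the one handed to spa_create();
 * 'zpool add' builds it in user space and passes it down through the
 * ioctl layer. New top-level vdevs are grafted onto the root vdev here
 * and the config cache is updated once the change commits.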
5287168404Spjd */ 5288168404Spjdint 5289168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 5290168404Spjd{ 5291219089Spjd uint64_t txg, id; 5292209962Smm int error; 5293168404Spjd vdev_t *rvd = spa->spa_root_vdev; 5294168404Spjd vdev_t *vd, *tvd; 5295185029Spjd nvlist_t **spares, **l2cache; 5296185029Spjd uint_t nspares, nl2cache; 5297168404Spjd 5298219089Spjd ASSERT(spa_writeable(spa)); 5299219089Spjd 5300168404Spjd txg = spa_vdev_enter(spa); 5301168404Spjd 5302168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 5303168404Spjd VDEV_ALLOC_ADD)) != 0) 5304168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5305168404Spjd 5306185029Spjd spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 5307168404Spjd 5308185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 5309185029Spjd &nspares) != 0) 5310168404Spjd nspares = 0; 5311168404Spjd 5312185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 5313185029Spjd &nl2cache) != 0) 5314185029Spjd nl2cache = 0; 5315185029Spjd 5316185029Spjd if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 5317168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5318168404Spjd 5319185029Spjd if (vd->vdev_children != 0 && 5320185029Spjd (error = vdev_create(vd, txg, B_FALSE)) != 0) 5321185029Spjd return (spa_vdev_exit(spa, vd, txg, error)); 5322168404Spjd 5323168404Spjd /* 5324185029Spjd * We must validate the spares and l2cache devices after checking the 5325185029Spjd * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 5326168404Spjd */ 5327185029Spjd if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 5328168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 5329168404Spjd 5330168404Spjd /* 5331332525Smav * If we are in the middle of a device removal, we can only add 5332332525Smav * devices which match the existing devices in the pool. 5333332525Smav * If we are in the middle of a removal, or have some indirect 5334332525Smav * vdevs, we can not add raidz toplevels. 5335168404Spjd */ 5336332525Smav if (spa->spa_vdev_removal != NULL || 5337332525Smav spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 5338332525Smav for (int c = 0; c < vd->vdev_children; c++) { 5339332525Smav tvd = vd->vdev_child[c]; 5340332525Smav if (spa->spa_vdev_removal != NULL && 5341332525Smav tvd->vdev_ashift != 5342332525Smav spa->spa_vdev_removal->svr_vdev->vdev_ashift) { 5343332525Smav return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5344332525Smav } 5345332525Smav /* Fail if top level vdev is raidz */ 5346332525Smav if (tvd->vdev_ops == &vdev_raidz_ops) { 5347332525Smav return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5348332525Smav } 5349332525Smav /* 5350332525Smav * Need the top level mirror to be 5351332525Smav * a mirror of leaf vdevs only 5352332525Smav */ 5353332525Smav if (tvd->vdev_ops == &vdev_mirror_ops) { 5354332525Smav for (uint64_t cid = 0; 5355332525Smav cid < tvd->vdev_children; cid++) { 5356332525Smav vdev_t *cvd = tvd->vdev_child[cid]; 5357332525Smav if (!cvd->vdev_ops->vdev_op_leaf) { 5358332525Smav return (spa_vdev_exit(spa, vd, 5359332525Smav txg, EINVAL)); 5360332525Smav } 5361332525Smav } 5362332525Smav } 5363332525Smav } 5364332525Smav } 5365332525Smav 5366209962Smm for (int c = 0; c < vd->vdev_children; c++) { 5367219089Spjd 5368219089Spjd /* 5369219089Spjd * Set the vdev id to the first hole, if one exists. 
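 * Holes are placeholders left behind by earlier top-level vdev removals;
 * reusing the first free slot keeps the top-level vdev ids compact
 * instead of always appending at rvd->vdev_children.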
5370219089Spjd */ 5371219089Spjd for (id = 0; id < rvd->vdev_children; id++) { 5372219089Spjd if (rvd->vdev_child[id]->vdev_ishole) { 5373219089Spjd vdev_free(rvd->vdev_child[id]); 5374219089Spjd break; 5375219089Spjd } 5376219089Spjd } 5377168404Spjd tvd = vd->vdev_child[c]; 5378168404Spjd vdev_remove_child(vd, tvd); 5379219089Spjd tvd->vdev_id = id; 5380168404Spjd vdev_add_child(rvd, tvd); 5381168404Spjd vdev_config_dirty(tvd); 5382168404Spjd } 5383168404Spjd 5384168404Spjd if (nspares != 0) { 5385185029Spjd spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 5386185029Spjd ZPOOL_CONFIG_SPARES); 5387168404Spjd spa_load_spares(spa); 5388185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5389168404Spjd } 5390168404Spjd 5391185029Spjd if (nl2cache != 0) { 5392185029Spjd spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 5393185029Spjd ZPOOL_CONFIG_L2CACHE); 5394185029Spjd spa_load_l2cache(spa); 5395185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5396185029Spjd } 5397185029Spjd 5398168404Spjd /* 5399168404Spjd * We have to be careful when adding new vdevs to an existing pool. 5400168404Spjd * If other threads start allocating from these vdevs before we 5401168404Spjd * sync the config cache, and we lose power, then upon reboot we may 5402168404Spjd * fail to open the pool because there are DVAs that the config cache 5403168404Spjd * can't translate. Therefore, we first add the vdevs without 5404168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 5405168404Spjd * and then let spa_config_update() initialize the new metaslabs. 5406168404Spjd * 5407168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 5408168404Spjd * if we lose power at any point in this sequence, the remaining 5409168404Spjd * steps will be completed the next time we load the pool. 5410168404Spjd */ 5411168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 5412168404Spjd 5413168404Spjd mutex_enter(&spa_namespace_lock); 5414168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5415331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 5416168404Spjd mutex_exit(&spa_namespace_lock); 5417168404Spjd 5418168404Spjd return (0); 5419168404Spjd} 5420168404Spjd 5421168404Spjd/* 5422168404Spjd * Attach a device to a mirror. The arguments are the path to any device 5423168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 5424168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 5425168404Spjd * 5426168404Spjd * If 'replacing' is specified, the new device is intended to replace the 5427168404Spjd * existing device; in this case the two devices are made into their own 5428185029Spjd * mirror using the 'replacing' vdev, which is functionally identical to 5429168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 5430168404Spjd * extra rules: you can't attach to it after it's been created, and upon 5431168404Spjd * completion of resilvering, the first disk (the one being replaced) 5432168404Spjd * is automatically detached. 
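 *
 * Illustrative call shapes (hypothetical callers; in practice this is
 * reached through the vdev-attach ioctl):
 *
 *	error = spa_vdev_attach(spa, guid, nvroot, B_FALSE);	attach
 *	error = spa_vdev_attach(spa, guid, nvroot, B_TRUE);	replace
 *
 * where nvroot describes exactly one new leaf vdev wrapped in a root
 * vdev (newrootvd->vdev_children must be 1, as checked below).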
5433168404Spjd */ 5434168404Spjdint 5435168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 5436168404Spjd{ 5437219089Spjd uint64_t txg, dtl_max_txg; 5438168404Spjd vdev_t *rvd = spa->spa_root_vdev; 5439168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 5440168404Spjd vdev_ops_t *pvops; 5441185029Spjd char *oldvdpath, *newvdpath; 5442185029Spjd int newvd_isspare; 5443185029Spjd int error; 5444168404Spjd 5445219089Spjd ASSERT(spa_writeable(spa)); 5446219089Spjd 5447168404Spjd txg = spa_vdev_enter(spa); 5448168404Spjd 5449185029Spjd oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 5450168404Spjd 5451332525Smav if (spa->spa_vdev_removal != NULL || 5452332525Smav spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 5453332525Smav return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5454332525Smav } 5455332525Smav 5456168404Spjd if (oldvd == NULL) 5457168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 5458168404Spjd 5459168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 5460168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5461168404Spjd 5462168404Spjd pvd = oldvd->vdev_parent; 5463168404Spjd 5464168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 5465230514Smm VDEV_ALLOC_ATTACH)) != 0) 5466185029Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5467185029Spjd 5468185029Spjd if (newrootvd->vdev_children != 1) 5469168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 5470168404Spjd 5471168404Spjd newvd = newrootvd->vdev_child[0]; 5472168404Spjd 5473168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 5474168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 5475168404Spjd 5476168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 5477168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 5478168404Spjd 5479185029Spjd /* 5480185029Spjd * Spares can't replace logs 5481185029Spjd */ 5482185029Spjd if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 5483185029Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 5484185029Spjd 5485168404Spjd if (!replacing) { 5486168404Spjd /* 5487168404Spjd * For attach, the only allowable parent is a mirror or the root 5488168404Spjd * vdev. 5489168404Spjd */ 5490168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 5491168404Spjd pvd->vdev_ops != &vdev_root_ops) 5492168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 5493168404Spjd 5494168404Spjd pvops = &vdev_mirror_ops; 5495168404Spjd } else { 5496168404Spjd /* 5497168404Spjd * Active hot spares can only be replaced by inactive hot 5498168404Spjd * spares. 5499168404Spjd */ 5500168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 5501219089Spjd oldvd->vdev_isspare && 5502168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 5503168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 5504168404Spjd 5505168404Spjd /* 5506168404Spjd * If the source is a hot spare, and the parent isn't already a 5507168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 5508168404Spjd * want to create a replacing vdev. The user is not allowed to 5509168404Spjd * attach to a spared vdev child unless the 'isspare' state is 5510168404Spjd * the same (spare replaces spare, non-spare replaces 5511168404Spjd * non-spare). 
5512168404Spjd */ 5513219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && 5514219089Spjd spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 5515168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 5516219089Spjd } else if (pvd->vdev_ops == &vdev_spare_ops && 5517219089Spjd newvd->vdev_isspare != oldvd->vdev_isspare) { 5518168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 5519219089Spjd } 5520219089Spjd 5521219089Spjd if (newvd->vdev_isspare) 5522168404Spjd pvops = &vdev_spare_ops; 5523168404Spjd else 5524168404Spjd pvops = &vdev_replacing_ops; 5525168404Spjd } 5526168404Spjd 5527168404Spjd /* 5528219089Spjd * Make sure the new device is big enough. 5529168404Spjd */ 5530219089Spjd if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 5531168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 5532168404Spjd 5533168404Spjd /* 5534168404Spjd * The new device cannot have a higher alignment requirement 5535168404Spjd * than the top-level vdev. 5536168404Spjd */ 5537168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 5538168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 5539168404Spjd 5540168404Spjd /* 5541168404Spjd * If this is an in-place replacement, update oldvd's path and devid 5542168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 5543168404Spjd */ 5544168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 5545168404Spjd spa_strfree(oldvd->vdev_path); 5546168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 5547168404Spjd KM_SLEEP); 5548168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 5549168404Spjd newvd->vdev_path, "old"); 5550168404Spjd if (oldvd->vdev_devid != NULL) { 5551168404Spjd spa_strfree(oldvd->vdev_devid); 5552168404Spjd oldvd->vdev_devid = NULL; 5553168404Spjd } 5554168404Spjd } 5555168404Spjd 5556219089Spjd /* mark the device being resilvered */ 5557254112Sdelphij newvd->vdev_resilver_txg = txg; 5558219089Spjd 5559168404Spjd /* 5560168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 5561168404Spjd * mirror/replacing/spare vdev above oldvd. 5562168404Spjd */ 5563168404Spjd if (pvd->vdev_ops != pvops) 5564168404Spjd pvd = vdev_add_parent(oldvd, pvops); 5565168404Spjd 5566168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 5567168404Spjd ASSERT(pvd->vdev_ops == pvops); 5568168404Spjd ASSERT(oldvd->vdev_parent == pvd); 5569168404Spjd 5570168404Spjd /* 5571168404Spjd * Extract the new device from its root and add it to pvd. 5572168404Spjd */ 5573168404Spjd vdev_remove_child(newrootvd, newvd); 5574168404Spjd newvd->vdev_id = pvd->vdev_children; 5575219089Spjd newvd->vdev_crtxg = oldvd->vdev_crtxg; 5576168404Spjd vdev_add_child(pvd, newvd); 5577168404Spjd 5578168404Spjd tvd = newvd->vdev_top; 5579168404Spjd ASSERT(pvd->vdev_top == tvd); 5580168404Spjd ASSERT(tvd->vdev_parent == rvd); 5581168404Spjd 5582168404Spjd vdev_config_dirty(tvd); 5583168404Spjd 5584168404Spjd /* 5585219089Spjd * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 5586219089Spjd * for any dmu_sync-ed blocks. It will propagate upward when 5587219089Spjd * spa_vdev_exit() calls vdev_dtl_reassess(). 
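 * DTL_MISSING is seeded over the entire range the new device could be
 * missing; the resilver scheduled further below repairs that range and
 * vdev_dtl_reassess() then shrinks the DTL as the repair completes.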
5588168404Spjd */ 5589219089Spjd dtl_max_txg = txg + TXG_CONCURRENT_STATES; 5590168404Spjd 5591219089Spjd vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 5592219089Spjd dtl_max_txg - TXG_INITIAL); 5593168404Spjd 5594209962Smm if (newvd->vdev_isspare) { 5595168404Spjd spa_spare_activate(newvd); 5596331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 5597209962Smm } 5598209962Smm 5599185029Spjd oldvdpath = spa_strdup(oldvd->vdev_path); 5600185029Spjd newvdpath = spa_strdup(newvd->vdev_path); 5601185029Spjd newvd_isspare = newvd->vdev_isspare; 5602168404Spjd 5603168404Spjd /* 5604168404Spjd * Mark newvd's DTL dirty in this txg. 5605168404Spjd */ 5606168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 5607168404Spjd 5608219089Spjd /* 5609258717Savg * Schedule the resilver to restart in the future. We do this to 5610258717Savg * ensure that dmu_sync-ed blocks have been stitched into the 5611258717Savg * respective datasets. 5612219089Spjd */ 5613219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 5614168404Spjd 5615287745Sdelphij if (spa->spa_bootfs) 5616331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 5617287745Sdelphij 5618331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 5619287745Sdelphij 5620219089Spjd /* 5621219089Spjd * Commit the config 5622219089Spjd */ 5623219089Spjd (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 5624185029Spjd 5625248571Smm spa_history_log_internal(spa, "vdev attach", NULL, 5626219089Spjd "%s vdev=%s %s vdev=%s", 5627219089Spjd replacing && newvd_isspare ? "spare in" : 5628219089Spjd replacing ? "replace" : "attach", newvdpath, 5629219089Spjd replacing ? "for" : "to", oldvdpath); 5630219089Spjd 5631185029Spjd spa_strfree(oldvdpath); 5632185029Spjd spa_strfree(newvdpath); 5633185029Spjd 5634168404Spjd return (0); 5635168404Spjd} 5636168404Spjd 5637168404Spjd/* 5638168404Spjd * Detach a device from a mirror or replacing vdev. 5639251631Sdelphij * 5640168404Spjd * If 'replace_done' is specified, only detach if the parent 5641168404Spjd * is a replacing vdev. 5642168404Spjd */ 5643168404Spjdint 5644209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 5645168404Spjd{ 5646168404Spjd uint64_t txg; 5647209962Smm int error; 5648168404Spjd vdev_t *rvd = spa->spa_root_vdev; 5649168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 5650168404Spjd boolean_t unspare = B_FALSE; 5651247187Smm uint64_t unspare_guid = 0; 5652219089Spjd char *vdpath; 5653168404Spjd 5654219089Spjd ASSERT(spa_writeable(spa)); 5655219089Spjd 5656168404Spjd txg = spa_vdev_enter(spa); 5657168404Spjd 5658185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5659168404Spjd 5660168404Spjd if (vd == NULL) 5661168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 5662168404Spjd 5663168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 5664168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5665168404Spjd 5666168404Spjd pvd = vd->vdev_parent; 5667168404Spjd 5668168404Spjd /* 5669209962Smm * If the parent/child relationship is not as expected, don't do it. 5670209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 5671209962Smm * vdev that's replacing B with C. The user's intent in replacing 5672209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 5673209962Smm * the replace by detaching C, the expected behavior is to end up 5674209962Smm * M(A,B). But suppose that right after deciding to detach C, 5675209962Smm * the replacement of B completes. 
We would have M(A,C), and then 5676209962Smm * ask to detach C, which would leave us with just A -- not what 5677209962Smm * the user wanted. To prevent this, we make sure that the 5678209962Smm * parent/child relationship hasn't changed -- in this example, 5679209962Smm * that C's parent is still the replacing vdev R. 5680209962Smm */ 5681209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 5682209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5683209962Smm 5684209962Smm /* 5685219089Spjd * Only 'replacing' or 'spare' vdevs can be replaced. 5686168404Spjd */ 5687219089Spjd if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 5688219089Spjd pvd->vdev_ops != &vdev_spare_ops) 5689219089Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5690168404Spjd 5691168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 5692185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 5693168404Spjd 5694168404Spjd /* 5695168404Spjd * Only mirror, replacing, and spare vdevs support detach. 5696168404Spjd */ 5697168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 5698168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 5699168404Spjd pvd->vdev_ops != &vdev_spare_ops) 5700168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5701168404Spjd 5702168404Spjd /* 5703209962Smm * If this device has the only valid copy of some data, 5704209962Smm * we cannot safely detach it. 5705168404Spjd */ 5706209962Smm if (vdev_dtl_required(vd)) 5707168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5708168404Spjd 5709209962Smm ASSERT(pvd->vdev_children >= 2); 5710168404Spjd 5711168404Spjd /* 5712185029Spjd * If we are detaching the second disk from a replacing vdev, then 5713185029Spjd * check to see if we changed the original vdev's path to have "/old" 5714185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 5715168404Spjd */ 5716219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 5717219089Spjd vd->vdev_path != NULL) { 5718219089Spjd size_t len = strlen(vd->vdev_path); 5719219089Spjd 5720219089Spjd for (int c = 0; c < pvd->vdev_children; c++) { 5721219089Spjd cvd = pvd->vdev_child[c]; 5722219089Spjd 5723219089Spjd if (cvd == vd || cvd->vdev_path == NULL) 5724219089Spjd continue; 5725219089Spjd 5726219089Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 5727219089Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 5728219089Spjd spa_strfree(cvd->vdev_path); 5729219089Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 5730219089Spjd break; 5731219089Spjd } 5732185029Spjd } 5733185029Spjd } 5734168404Spjd 5735168404Spjd /* 5736168404Spjd * If we are detaching the original disk from a spare, then it implies 5737168404Spjd * that the spare should become a real disk, and be removed from the 5738168404Spjd * active spare list for the pool. 5739168404Spjd */ 5740168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 5741219089Spjd vd->vdev_id == 0 && 5742219089Spjd pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 5743168404Spjd unspare = B_TRUE; 5744168404Spjd 5745168404Spjd /* 5746168404Spjd * Erase the disk labels so the disk can be used for other things. 5747168404Spjd * This must be done after all other error cases are handled, 5748168404Spjd * but before we disembowel vd (so we can still do I/O to it). 5749168404Spjd * But if we can't do it, don't treat the error as fatal -- 5750168404Spjd * it may be that the unwritability of the disk is the reason 5751168404Spjd * it's being detached! 
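 * (Accordingly, the return value of vdev_label_init(..., VDEV_LABEL_REMOVE)
 * below is captured in 'error' but not acted upon.)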
5752168404Spjd */ 5753168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5754168404Spjd 5755168404Spjd /* 5756168404Spjd * Remove vd from its parent and compact the parent's children. 5757168404Spjd */ 5758168404Spjd vdev_remove_child(pvd, vd); 5759168404Spjd vdev_compact_children(pvd); 5760168404Spjd 5761168404Spjd /* 5762168404Spjd * Remember one of the remaining children so we can get tvd below. 5763168404Spjd */ 5764219089Spjd cvd = pvd->vdev_child[pvd->vdev_children - 1]; 5765168404Spjd 5766168404Spjd /* 5767168404Spjd * If we need to remove the remaining child from the list of hot spares, 5768209962Smm * do it now, marking the vdev as no longer a spare in the process. 5769209962Smm * We must do this before vdev_remove_parent(), because that can 5770209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 5771209962Smm * reason, we must remove the spare now, in the same txg as the detach; 5772209962Smm * otherwise someone could attach a new sibling, change the GUID, and 5773209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 5774168404Spjd */ 5775168404Spjd if (unspare) { 5776168404Spjd ASSERT(cvd->vdev_isspare); 5777168404Spjd spa_spare_remove(cvd); 5778168404Spjd unspare_guid = cvd->vdev_guid; 5779209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 5780219089Spjd cvd->vdev_unspare = B_TRUE; 5781168404Spjd } 5782168404Spjd 5783168404Spjd /* 5784168404Spjd * If the parent mirror/replacing vdev only has one child, 5785168404Spjd * the parent is no longer needed. Remove it from the tree. 5786168404Spjd */ 5787219089Spjd if (pvd->vdev_children == 1) { 5788219089Spjd if (pvd->vdev_ops == &vdev_spare_ops) 5789219089Spjd cvd->vdev_unspare = B_FALSE; 5790168404Spjd vdev_remove_parent(cvd); 5791219089Spjd } 5792168404Spjd 5793219089Spjd 5794168404Spjd /* 5795168404Spjd * We don't set tvd until now because the parent we just removed 5796168404Spjd * may have been the previous top-level vdev. 5797168404Spjd */ 5798168404Spjd tvd = cvd->vdev_top; 5799168404Spjd ASSERT(tvd->vdev_parent == rvd); 5800168404Spjd 5801168404Spjd /* 5802168404Spjd * Reevaluate the parent vdev state. 5803168404Spjd */ 5804185029Spjd vdev_propagate_state(cvd); 5805168404Spjd 5806168404Spjd /* 5807219089Spjd * If the 'autoexpand' property is set on the pool then automatically 5808219089Spjd * try to expand the size of the pool. For example if the device we 5809219089Spjd * just detached was smaller than the others, it may be possible to 5810219089Spjd * add metaslabs (i.e. grow the pool). We need to reopen the vdev 5811219089Spjd * first so that we can obtain the updated sizes of the leaf vdevs. 5812168404Spjd */ 5813219089Spjd if (spa->spa_autoexpand) { 5814219089Spjd vdev_reopen(tvd); 5815219089Spjd vdev_expand(tvd, txg); 5816219089Spjd } 5817168404Spjd 5818168404Spjd vdev_config_dirty(tvd); 5819168404Spjd 5820168404Spjd /* 5821168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 5822168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 5823168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 5824168404Spjd * prevent vd from being accessed after it's freed. 
5825168404Spjd */ 5826219089Spjd vdpath = spa_strdup(vd->vdev_path); 5827209962Smm for (int t = 0; t < TXG_SIZE; t++) 5828168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5829168404Spjd vd->vdev_detached = B_TRUE; 5830168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 5831168404Spjd 5832331397Smav spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 5833185029Spjd 5834219089Spjd /* hang on to the spa before we release the lock */ 5835219089Spjd spa_open_ref(spa, FTAG); 5836219089Spjd 5837168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 5838168404Spjd 5839248571Smm spa_history_log_internal(spa, "detach", NULL, 5840219089Spjd "vdev=%s", vdpath); 5841219089Spjd spa_strfree(vdpath); 5842219089Spjd 5843168404Spjd /* 5844168404Spjd * If this was the removal of the original device in a hot spare vdev, 5845168404Spjd * then we want to go through and remove the device from the hot spare 5846168404Spjd * list of every other pool. 5847168404Spjd */ 5848168404Spjd if (unspare) { 5849219089Spjd spa_t *altspa = NULL; 5850219089Spjd 5851168404Spjd mutex_enter(&spa_namespace_lock); 5852219089Spjd while ((altspa = spa_next(altspa)) != NULL) { 5853219089Spjd if (altspa->spa_state != POOL_STATE_ACTIVE || 5854219089Spjd altspa == spa) 5855168404Spjd continue; 5856219089Spjd 5857219089Spjd spa_open_ref(altspa, FTAG); 5858185029Spjd mutex_exit(&spa_namespace_lock); 5859219089Spjd (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5860185029Spjd mutex_enter(&spa_namespace_lock); 5861219089Spjd spa_close(altspa, FTAG); 5862168404Spjd } 5863168404Spjd mutex_exit(&spa_namespace_lock); 5864219089Spjd 5865219089Spjd /* search the rest of the vdevs for spares to remove */ 5866219089Spjd spa_vdev_resilver_done(spa); 5867168404Spjd } 5868168404Spjd 5869219089Spjd /* all done with the spa; OK to release */ 5870219089Spjd mutex_enter(&spa_namespace_lock); 5871219089Spjd spa_close(spa, FTAG); 5872219089Spjd mutex_exit(&spa_namespace_lock); 5873219089Spjd 5874168404Spjd return (error); 5875168404Spjd} 5876168404Spjd 5877219089Spjd/* 5878219089Spjd * Split a set of devices from their mirrors, and create a new pool from them. 
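 * This is the implementation used by 'zpool split': each requested child
 * must be a healthy, writeable leaf of a mirror; the GUIDs of the children
 * being split are recorded under ZPOOL_CONFIG_SPLIT_LIST, the new spa is
 * assembled with SPA_LOAD_IMPORT/SPA_IMPORT_ASSEMBLE, and the split vdevs
 * are then detached from the original pool (or re-onlined on failure).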
5879219089Spjd */ 5880219089Spjdint 5881219089Spjdspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5882219089Spjd nvlist_t *props, boolean_t exp) 5883219089Spjd{ 5884219089Spjd int error = 0; 5885219089Spjd uint64_t txg, *glist; 5886219089Spjd spa_t *newspa; 5887219089Spjd uint_t c, children, lastlog; 5888219089Spjd nvlist_t **child, *nvl, *tmp; 5889219089Spjd dmu_tx_t *tx; 5890219089Spjd char *altroot = NULL; 5891219089Spjd vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5892219089Spjd boolean_t activate_slog; 5893219089Spjd 5894219089Spjd ASSERT(spa_writeable(spa)); 5895219089Spjd 5896219089Spjd txg = spa_vdev_enter(spa); 5897219089Spjd 5898219089Spjd /* clear the log and flush everything up to now */ 5899219089Spjd activate_slog = spa_passivate_log(spa); 5900219089Spjd (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5901332525Smav error = spa_reset_logs(spa); 5902219089Spjd txg = spa_vdev_config_enter(spa); 5903219089Spjd 5904219089Spjd if (activate_slog) 5905219089Spjd spa_activate_log(spa); 5906219089Spjd 5907219089Spjd if (error != 0) 5908219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5909219089Spjd 5910219089Spjd /* check new spa name before going any further */ 5911219089Spjd if (spa_lookup(newname) != NULL) 5912219089Spjd return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5913219089Spjd 5914219089Spjd /* 5915219089Spjd * scan through all the children to ensure they're all mirrors 5916219089Spjd */ 5917219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5918219089Spjd nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5919219089Spjd &children) != 0) 5920219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5921219089Spjd 5922219089Spjd /* first, check to ensure we've got the right child count */ 5923219089Spjd rvd = spa->spa_root_vdev; 5924219089Spjd lastlog = 0; 5925219089Spjd for (c = 0; c < rvd->vdev_children; c++) { 5926219089Spjd vdev_t *vd = rvd->vdev_child[c]; 5927219089Spjd 5928219089Spjd /* don't count the holes & logs as children */ 5929332525Smav if (vd->vdev_islog || !vdev_is_concrete(vd)) { 5930219089Spjd if (lastlog == 0) 5931219089Spjd lastlog = c; 5932219089Spjd continue; 5933219089Spjd } 5934219089Spjd 5935219089Spjd lastlog = 0; 5936219089Spjd } 5937219089Spjd if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 5938219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5939219089Spjd 5940219089Spjd /* next, ensure no spare or cache devices are part of the split */ 5941219089Spjd if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5942219089Spjd nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5943219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5944219089Spjd 5945219089Spjd vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5946219089Spjd glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5947219089Spjd 5948219089Spjd /* then, loop over each vdev and validate it */ 5949219089Spjd for (c = 0; c < children; c++) { 5950219089Spjd uint64_t is_hole = 0; 5951219089Spjd 5952219089Spjd (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5953219089Spjd &is_hole); 5954219089Spjd 5955219089Spjd if (is_hole != 0) { 5956219089Spjd if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5957219089Spjd spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5958219089Spjd continue; 5959219089Spjd } else { 5960249195Smm error = SET_ERROR(EINVAL); 5961219089Spjd break; 5962219089Spjd } 5963219089Spjd } 5964219089Spjd 5965219089Spjd /* which disk is going to be split? */ 5966219089Spjd if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5967219089Spjd &glist[c]) != 0) { 5968249195Smm error = SET_ERROR(EINVAL); 5969219089Spjd break; 5970219089Spjd } 5971219089Spjd 5972219089Spjd /* look it up in the spa */ 5973219089Spjd vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5974219089Spjd if (vml[c] == NULL) { 5975249195Smm error = SET_ERROR(ENODEV); 5976219089Spjd break; 5977219089Spjd } 5978219089Spjd 5979219089Spjd /* make sure there's nothing stopping the split */ 5980219089Spjd if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5981219089Spjd vml[c]->vdev_islog || 5982332525Smav !vdev_is_concrete(vml[c]) || 5983219089Spjd vml[c]->vdev_isspare || 5984219089Spjd vml[c]->vdev_isl2cache || 5985219089Spjd !vdev_writeable(vml[c]) || 5986219089Spjd vml[c]->vdev_children != 0 || 5987219089Spjd vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5988219089Spjd c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5989249195Smm error = SET_ERROR(EINVAL); 5990219089Spjd break; 5991219089Spjd } 5992219089Spjd 5993219089Spjd if (vdev_dtl_required(vml[c])) { 5994249195Smm error = SET_ERROR(EBUSY); 5995219089Spjd break; 5996219089Spjd } 5997219089Spjd 5998219089Spjd /* we need certain info from the top level */ 5999219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 6000219089Spjd vml[c]->vdev_top->vdev_ms_array) == 0); 6001219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 6002219089Spjd vml[c]->vdev_top->vdev_ms_shift) == 0); 6003219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 6004219089Spjd vml[c]->vdev_top->vdev_asize) == 0); 6005219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 6006219089Spjd vml[c]->vdev_top->vdev_ashift) == 0); 6007299441Smav 6008299441Smav /* transfer per-vdev ZAPs */ 6009299441Smav ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 6010299441Smav VERIFY0(nvlist_add_uint64(child[c], 6011299441Smav ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 6012299441Smav 6013299441Smav ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 6014299441Smav VERIFY0(nvlist_add_uint64(child[c], 6015299441Smav ZPOOL_CONFIG_VDEV_TOP_ZAP, 6016299441Smav vml[c]->vdev_parent->vdev_top_zap)); 6017219089Spjd } 6018219089Spjd 6019219089Spjd if (error != 0) { 6020219089Spjd 
kmem_free(vml, children * sizeof (vdev_t *)); 6021219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 6022219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 6023219089Spjd } 6024219089Spjd 6025219089Spjd /* stop writers from using the disks */ 6026219089Spjd for (c = 0; c < children; c++) { 6027219089Spjd if (vml[c] != NULL) 6028219089Spjd vml[c]->vdev_offline = B_TRUE; 6029219089Spjd } 6030219089Spjd vdev_reopen(spa->spa_root_vdev); 6031219089Spjd 6032219089Spjd /* 6033219089Spjd * Temporarily record the splitting vdevs in the spa config. This 6034219089Spjd * will disappear once the config is regenerated. 6035219089Spjd */ 6036219089Spjd VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6037219089Spjd VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 6038219089Spjd glist, children) == 0); 6039219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 6040219089Spjd 6041219089Spjd mutex_enter(&spa->spa_props_lock); 6042219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 6043219089Spjd nvl) == 0); 6044219089Spjd mutex_exit(&spa->spa_props_lock); 6045219089Spjd spa->spa_config_splitting = nvl; 6046219089Spjd vdev_config_dirty(spa->spa_root_vdev); 6047219089Spjd 6048219089Spjd /* configure and create the new pool */ 6049219089Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 6050219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 6051219089Spjd exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 6052219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6053219089Spjd spa_version(spa)) == 0); 6054219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 6055219089Spjd spa->spa_config_txg) == 0); 6056219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 6057219089Spjd spa_generate_guid(NULL)) == 0); 6058299441Smav VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 6059219089Spjd (void) nvlist_lookup_string(props, 6060219089Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6061219089Spjd 6062219089Spjd /* add the new pool to the namespace */ 6063219089Spjd newspa = spa_add(newname, config, altroot); 6064299441Smav newspa->spa_avz_action = AVZ_ACTION_REBUILD; 6065219089Spjd newspa->spa_config_txg = spa->spa_config_txg; 6066219089Spjd spa_set_log_state(newspa, SPA_LOG_CLEAR); 6067219089Spjd 6068219089Spjd /* release the spa config lock, retaining the namespace lock */ 6069219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 6070219089Spjd 6071219089Spjd if (zio_injection_enabled) 6072219089Spjd zio_handle_panic_injection(spa, FTAG, 1); 6073219089Spjd 6074219089Spjd spa_activate(newspa, spa_mode_global); 6075219089Spjd spa_async_suspend(newspa); 6076219089Spjd 6077277300Ssmh#ifndef illumos 6078219089Spjd /* mark that we are creating new spa by splitting */ 6079219089Spjd newspa->spa_splitting_newspa = B_TRUE; 6080219089Spjd#endif 6081219089Spjd /* create the new pool from the disks of the original pool */ 6082219089Spjd error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 6083277300Ssmh#ifndef illumos 6084219089Spjd newspa->spa_splitting_newspa = B_FALSE; 6085219089Spjd#endif 6086219089Spjd if (error) 6087219089Spjd goto out; 6088219089Spjd 6089219089Spjd /* if that worked, generate a real config for the new pool */ 6090219089Spjd if (newspa->spa_root_vdev != NULL) { 6091219089Spjd VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 6092219089Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 6093219089Spjd 
VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 6094219089Spjd ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 6095219089Spjd spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 6096219089Spjd B_TRUE)); 6097219089Spjd } 6098219089Spjd 6099219089Spjd /* set the props */ 6100219089Spjd if (props != NULL) { 6101219089Spjd spa_configfile_set(newspa, props, B_FALSE); 6102219089Spjd error = spa_prop_set(newspa, props); 6103219089Spjd if (error) 6104219089Spjd goto out; 6105219089Spjd } 6106219089Spjd 6107219089Spjd /* flush everything */ 6108219089Spjd txg = spa_vdev_config_enter(newspa); 6109219089Spjd vdev_config_dirty(newspa->spa_root_vdev); 6110219089Spjd (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 6111219089Spjd 6112219089Spjd if (zio_injection_enabled) 6113219089Spjd zio_handle_panic_injection(spa, FTAG, 2); 6114219089Spjd 6115219089Spjd spa_async_resume(newspa); 6116219089Spjd 6117219089Spjd /* finally, update the original pool's config */ 6118219089Spjd txg = spa_vdev_config_enter(spa); 6119219089Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 6120219089Spjd error = dmu_tx_assign(tx, TXG_WAIT); 6121219089Spjd if (error != 0) 6122219089Spjd dmu_tx_abort(tx); 6123219089Spjd for (c = 0; c < children; c++) { 6124219089Spjd if (vml[c] != NULL) { 6125219089Spjd vdev_split(vml[c]); 6126219089Spjd if (error == 0) 6127248571Smm spa_history_log_internal(spa, "detach", tx, 6128248571Smm "vdev=%s", vml[c]->vdev_path); 6129299441Smav 6130219089Spjd vdev_free(vml[c]); 6131219089Spjd } 6132219089Spjd } 6133299441Smav spa->spa_avz_action = AVZ_ACTION_REBUILD; 6134219089Spjd vdev_config_dirty(spa->spa_root_vdev); 6135219089Spjd spa->spa_config_splitting = NULL; 6136219089Spjd nvlist_free(nvl); 6137219089Spjd if (error == 0) 6138219089Spjd dmu_tx_commit(tx); 6139219089Spjd (void) spa_vdev_exit(spa, NULL, txg, 0); 6140219089Spjd 6141219089Spjd if (zio_injection_enabled) 6142219089Spjd zio_handle_panic_injection(spa, FTAG, 3); 6143219089Spjd 6144219089Spjd /* split is complete; log a history record */ 6145248571Smm spa_history_log_internal(newspa, "split", NULL, 6146248571Smm "from pool %s", spa_name(spa)); 6147219089Spjd 6148219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 6149219089Spjd 6150219089Spjd /* if we're not going to mount the filesystems in userland, export */ 6151219089Spjd if (exp) 6152219089Spjd error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 6153219089Spjd B_FALSE, B_FALSE); 6154219089Spjd 6155219089Spjd return (error); 6156219089Spjd 6157219089Spjdout: 6158219089Spjd spa_unload(newspa); 6159219089Spjd spa_deactivate(newspa); 6160219089Spjd spa_remove(newspa); 6161219089Spjd 6162219089Spjd txg = spa_vdev_config_enter(spa); 6163219089Spjd 6164219089Spjd /* re-online all offlined disks */ 6165219089Spjd for (c = 0; c < children; c++) { 6166219089Spjd if (vml[c] != NULL) 6167219089Spjd vml[c]->vdev_offline = B_FALSE; 6168219089Spjd } 6169219089Spjd vdev_reopen(spa->spa_root_vdev); 6170219089Spjd 6171219089Spjd nvlist_free(spa->spa_config_splitting); 6172219089Spjd spa->spa_config_splitting = NULL; 6173219089Spjd (void) spa_vdev_exit(spa, NULL, txg, error); 6174219089Spjd 6175219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 6176219089Spjd return (error); 6177219089Spjd} 6178219089Spjd 6179168404Spjd/* 6180185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 6181251631Sdelphij * currently spared, so we can detach it. 
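 * Called from spa_vdev_resilver_done() with SCL_ALL held.  Returns the
 * vdev that should be detached next, or NULL when there is nothing left
 * to do.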
6182168404Spjd */ 6183168404Spjdstatic vdev_t * 6184185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 6185168404Spjd{ 6186168404Spjd vdev_t *newvd, *oldvd; 6187168404Spjd 6188219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 6189185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 6190168404Spjd if (oldvd != NULL) 6191168404Spjd return (oldvd); 6192168404Spjd } 6193168404Spjd 6194185029Spjd /* 6195219089Spjd * Check for a completed replacement. We always consider the first 6196219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 6197219089Spjd * the newest (see spa_vdev_attach() for how that works). In 6198219089Spjd * the case where the newest vdev is faulted, we will not automatically 6199219089Spjd * remove it after a resilver completes. This is OK as it will require 6200219089Spjd * user intervention to determine which disk the admin wishes to keep. 6201185029Spjd */ 6202219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 6203219089Spjd ASSERT(vd->vdev_children > 1); 6204219089Spjd 6205219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 6206168404Spjd oldvd = vd->vdev_child[0]; 6207168404Spjd 6208209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 6209219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 6210209962Smm !vdev_dtl_required(oldvd)) 6211168404Spjd return (oldvd); 6212168404Spjd } 6213168404Spjd 6214185029Spjd /* 6215185029Spjd * Check for a completed resilver with the 'unspare' flag set. 6216185029Spjd */ 6217219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 6218219089Spjd vdev_t *first = vd->vdev_child[0]; 6219219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 6220185029Spjd 6221219089Spjd if (last->vdev_unspare) { 6222219089Spjd oldvd = first; 6223219089Spjd newvd = last; 6224219089Spjd } else if (first->vdev_unspare) { 6225219089Spjd oldvd = last; 6226219089Spjd newvd = first; 6227219089Spjd } else { 6228219089Spjd oldvd = NULL; 6229219089Spjd } 6230219089Spjd 6231219089Spjd if (oldvd != NULL && 6232209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 6233219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 6234219089Spjd !vdev_dtl_required(oldvd)) 6235185029Spjd return (oldvd); 6236219089Spjd 6237219089Spjd /* 6238219089Spjd * If there are more than two spares attached to a disk, 6239219089Spjd * and those spares are not required, then we want to 6240219089Spjd * attempt to free them up now so that they can be used 6241219089Spjd * by other pools. Once we're back down to a single 6242219089Spjd * disk+spare, we stop removing them. 
6243219089Spjd */ 6244219089Spjd if (vd->vdev_children > 2) { 6245219089Spjd newvd = vd->vdev_child[1]; 6246219089Spjd 6247219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 6248219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 6249219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 6250219089Spjd !vdev_dtl_required(newvd)) 6251219089Spjd return (newvd); 6252185029Spjd } 6253185029Spjd } 6254185029Spjd 6255168404Spjd return (NULL); 6256168404Spjd} 6257168404Spjd 6258168404Spjdstatic void 6259185029Spjdspa_vdev_resilver_done(spa_t *spa) 6260168404Spjd{ 6261209962Smm vdev_t *vd, *pvd, *ppvd; 6262209962Smm uint64_t guid, sguid, pguid, ppguid; 6263168404Spjd 6264209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6265168404Spjd 6266185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 6267209962Smm pvd = vd->vdev_parent; 6268209962Smm ppvd = pvd->vdev_parent; 6269168404Spjd guid = vd->vdev_guid; 6270209962Smm pguid = pvd->vdev_guid; 6271209962Smm ppguid = ppvd->vdev_guid; 6272209962Smm sguid = 0; 6273168404Spjd /* 6274168404Spjd * If we have just finished replacing a hot spared device, then 6275168404Spjd * we need to detach the parent's first child (the original hot 6276168404Spjd * spare) as well. 6277168404Spjd */ 6278219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 6279219089Spjd ppvd->vdev_children == 2) { 6280168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 6281209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 6282168404Spjd } 6283254112Sdelphij ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 6284254112Sdelphij 6285209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 6286209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 6287168404Spjd return; 6288209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 6289168404Spjd return; 6290209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6291168404Spjd } 6292168404Spjd 6293209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 6294168404Spjd} 6295168404Spjd 6296168404Spjd/* 6297219089Spjd * Update the stored path or FRU for this vdev. 6298168404Spjd */ 6299168404Spjdint 6300209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 6301209962Smm boolean_t ispath) 6302168404Spjd{ 6303185029Spjd vdev_t *vd; 6304219089Spjd boolean_t sync = B_FALSE; 6305168404Spjd 6306219089Spjd ASSERT(spa_writeable(spa)); 6307168404Spjd 6308219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 6309219089Spjd 6310209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 6311219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 6312168404Spjd 6313168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 6314219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 6315168404Spjd 6316209962Smm if (ispath) { 6317219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 6318219089Spjd spa_strfree(vd->vdev_path); 6319219089Spjd vd->vdev_path = spa_strdup(value); 6320219089Spjd sync = B_TRUE; 6321219089Spjd } 6322209962Smm } else { 6323219089Spjd if (vd->vdev_fru == NULL) { 6324219089Spjd vd->vdev_fru = spa_strdup(value); 6325219089Spjd sync = B_TRUE; 6326219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 6327209962Smm spa_strfree(vd->vdev_fru); 6328219089Spjd vd->vdev_fru = spa_strdup(value); 6329219089Spjd sync = B_TRUE; 6330219089Spjd } 6331209962Smm } 6332168404Spjd 6333219089Spjd return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 6334168404Spjd} 6335168404Spjd 6336209962Smmint 6337209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 6338209962Smm{ 6339209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 6340209962Smm} 6341209962Smm 6342209962Smmint 6343209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 6344209962Smm{ 6345209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 6346209962Smm} 6347209962Smm 6348168404Spjd/* 6349168404Spjd * ========================================================================== 6350219089Spjd * SPA Scanning 6351168404Spjd * ========================================================================== 6352168404Spjd */ 6353324010Savgint 6354324010Savgspa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 6355324010Savg{ 6356324010Savg ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6357168404Spjd 6358324010Savg if (dsl_scan_resilvering(spa->spa_dsl_pool)) 6359324010Savg return (SET_ERROR(EBUSY)); 6360324010Savg 6361324010Savg return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 6362324010Savg} 6363324010Savg 6364168404Spjdint 6365219089Spjdspa_scan_stop(spa_t *spa) 6366168404Spjd{ 6367185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6368219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 6369249195Smm return (SET_ERROR(EBUSY)); 6370219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 6371219089Spjd} 6372168404Spjd 6373219089Spjdint 6374219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 6375219089Spjd{ 6376219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6377219089Spjd 6378219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 6379249195Smm return (SET_ERROR(ENOTSUP)); 6380168404Spjd 6381168404Spjd /* 6382185029Spjd * If a resilver was requested, but there is no DTL on a 6383185029Spjd * writeable leaf device, we have nothing to do. 6384168404Spjd */ 6385219089Spjd if (func == POOL_SCAN_RESILVER && 6386185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 6387185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 6388168404Spjd return (0); 6389168404Spjd } 6390168404Spjd 6391219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 6392168404Spjd} 6393168404Spjd 6394168404Spjd/* 6395168404Spjd * ========================================================================== 6396168404Spjd * SPA async task processing 6397168404Spjd * ========================================================================== 6398168404Spjd */ 6399168404Spjd 6400168404Spjdstatic void 6401185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 6402168404Spjd{ 6403185029Spjd if (vd->vdev_remove_wanted) { 6404219089Spjd vd->vdev_remove_wanted = B_FALSE; 6405219089Spjd vd->vdev_delayed_close = B_FALSE; 6406185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 6407209962Smm 6408209962Smm /* 6409209962Smm * We want to clear the stats, but we don't want to do a full 6410209962Smm * vdev_clear() as that will cause us to throw away 6411209962Smm * degraded/faulted state as well as attempt to reopen the 6412209962Smm * device, all of which is a waste. 6413209962Smm */ 6414209962Smm vd->vdev_stat.vs_read_errors = 0; 6415209962Smm vd->vdev_stat.vs_write_errors = 0; 6416209962Smm vd->vdev_stat.vs_checksum_errors = 0; 6417209962Smm 6418185029Spjd vdev_state_dirty(vd->vdev_top); 6419294027Sasomers /* Tell userspace that the vdev is gone. 
*/ 6420294027Sasomers zfs_post_remove(spa, vd); 6421185029Spjd } 6422168404Spjd 6423185029Spjd for (int c = 0; c < vd->vdev_children; c++) 6424185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 6425185029Spjd} 6426168404Spjd 6427185029Spjdstatic void 6428185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 6429185029Spjd{ 6430185029Spjd if (vd->vdev_probe_wanted) { 6431219089Spjd vd->vdev_probe_wanted = B_FALSE; 6432185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 6433168404Spjd } 6434168404Spjd 6435185029Spjd for (int c = 0; c < vd->vdev_children; c++) 6436185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 6437168404Spjd} 6438168404Spjd 6439168404Spjdstatic void 6440219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 6441219089Spjd{ 6442219089Spjd sysevent_id_t eid; 6443219089Spjd nvlist_t *attr; 6444219089Spjd char *physpath; 6445219089Spjd 6446219089Spjd if (!spa->spa_autoexpand) 6447219089Spjd return; 6448219089Spjd 6449219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 6450219089Spjd vdev_t *cvd = vd->vdev_child[c]; 6451219089Spjd spa_async_autoexpand(spa, cvd); 6452219089Spjd } 6453219089Spjd 6454219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 6455219089Spjd return; 6456219089Spjd 6457219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 6458219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 6459219089Spjd 6460219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6461219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 6462219089Spjd 6463219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 6464219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 6465219089Spjd 6466219089Spjd nvlist_free(attr); 6467219089Spjd kmem_free(physpath, MAXPATHLEN); 6468219089Spjd} 6469219089Spjd 6470219089Spjdstatic void 6471168404Spjdspa_async_thread(void *arg) 6472168404Spjd{ 6473331399Smav spa_t *spa = (spa_t *)arg; 6474168404Spjd int tasks; 6475168404Spjd 6476168404Spjd ASSERT(spa->spa_sync_on); 6477168404Spjd 6478168404Spjd mutex_enter(&spa->spa_async_lock); 6479168404Spjd tasks = spa->spa_async_tasks; 6480253990Smav spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 6481168404Spjd mutex_exit(&spa->spa_async_lock); 6482168404Spjd 6483168404Spjd /* 6484168404Spjd * See if the config needs to be updated. 6485168404Spjd */ 6486168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 6487219089Spjd uint64_t old_space, new_space; 6488219089Spjd 6489168404Spjd mutex_enter(&spa_namespace_lock); 6490219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 6491168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6492219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 6493168404Spjd mutex_exit(&spa_namespace_lock); 6494219089Spjd 6495219089Spjd /* 6496219089Spjd * If the pool grew as a result of the config update, 6497219089Spjd * then log an internal history event. 
6498219089Spjd */ 6499219089Spjd if (new_space != old_space) { 6500248571Smm spa_history_log_internal(spa, "vdev online", NULL, 6501219089Spjd "pool '%s' size: %llu(+%llu)", 6502219089Spjd spa_name(spa), new_space, new_space - old_space); 6503219089Spjd } 6504168404Spjd } 6505168404Spjd 6506219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 6507219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6508219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 6509219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 6510219089Spjd } 6511219089Spjd 6512168404Spjd /* 6513185029Spjd * See if any devices need to be probed. 6514168404Spjd */ 6515185029Spjd if (tasks & SPA_ASYNC_PROBE) { 6516219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 6517185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 6518185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 6519185029Spjd } 6520168404Spjd 6521168404Spjd /* 6522185029Spjd * If any devices are done replacing, detach them. 6523168404Spjd */ 6524185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 6525185029Spjd spa_vdev_resilver_done(spa); 6526168404Spjd 6527168404Spjd /* 6528168404Spjd * Kick off a resilver. 6529168404Spjd */ 6530168404Spjd if (tasks & SPA_ASYNC_RESILVER) 6531219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 6532168404Spjd 6533168404Spjd /* 6534168404Spjd * Let the world know that we're done. 6535168404Spjd */ 6536168404Spjd mutex_enter(&spa->spa_async_lock); 6537168404Spjd spa->spa_async_thread = NULL; 6538168404Spjd cv_broadcast(&spa->spa_async_cv); 6539168404Spjd mutex_exit(&spa->spa_async_lock); 6540168404Spjd thread_exit(); 6541168404Spjd} 6542168404Spjd 6543253990Smavstatic void 6544253990Smavspa_async_thread_vd(void *arg) 6545253990Smav{ 6546253990Smav spa_t *spa = arg; 6547253990Smav int tasks; 6548253990Smav 6549253990Smav mutex_enter(&spa->spa_async_lock); 6550253990Smav tasks = spa->spa_async_tasks; 6551253990Smavretry: 6552253990Smav spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 6553253990Smav mutex_exit(&spa->spa_async_lock); 6554253990Smav 6555253990Smav /* 6556253990Smav * See if any devices need to be marked REMOVED. 6557253990Smav */ 6558253990Smav if (tasks & SPA_ASYNC_REMOVE) { 6559253990Smav spa_vdev_state_enter(spa, SCL_NONE); 6560253990Smav spa_async_remove(spa, spa->spa_root_vdev); 6561253990Smav for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 6562253990Smav spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 6563253990Smav for (int i = 0; i < spa->spa_spares.sav_count; i++) 6564253990Smav spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 6565253990Smav (void) spa_vdev_state_exit(spa, NULL, 0); 6566253990Smav } 6567253990Smav 6568253990Smav /* 6569253990Smav * Let the world know that we're done. 
6570253990Smav */ 6571253990Smav mutex_enter(&spa->spa_async_lock); 6572253990Smav tasks = spa->spa_async_tasks; 6573253990Smav if ((tasks & SPA_ASYNC_REMOVE) != 0) 6574253990Smav goto retry; 6575253990Smav spa->spa_async_thread_vd = NULL; 6576253990Smav cv_broadcast(&spa->spa_async_cv); 6577253990Smav mutex_exit(&spa->spa_async_lock); 6578253990Smav thread_exit(); 6579253990Smav} 6580253990Smav 6581168404Spjdvoid 6582168404Spjdspa_async_suspend(spa_t *spa) 6583168404Spjd{ 6584168404Spjd mutex_enter(&spa->spa_async_lock); 6585168404Spjd spa->spa_async_suspended++; 6586332525Smav while (spa->spa_async_thread != NULL || 6587332525Smav spa->spa_async_thread_vd != NULL || 6588332525Smav spa->spa_condense_thread != NULL) 6589168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 6590168404Spjd mutex_exit(&spa->spa_async_lock); 6591332525Smav 6592332525Smav spa_vdev_remove_suspend(spa); 6593168404Spjd} 6594168404Spjd 6595168404Spjdvoid 6596168404Spjdspa_async_resume(spa_t *spa) 6597168404Spjd{ 6598168404Spjd mutex_enter(&spa->spa_async_lock); 6599168404Spjd ASSERT(spa->spa_async_suspended != 0); 6600168404Spjd spa->spa_async_suspended--; 6601168404Spjd mutex_exit(&spa->spa_async_lock); 6602332525Smav spa_restart_removal(spa); 6603168404Spjd} 6604168404Spjd 6605251636Sdelphijstatic boolean_t 6606251636Sdelphijspa_async_tasks_pending(spa_t *spa) 6607251636Sdelphij{ 6608251636Sdelphij uint_t non_config_tasks; 6609251636Sdelphij uint_t config_task; 6610251636Sdelphij boolean_t config_task_suspended; 6611251636Sdelphij 6612253990Smav non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 6613253990Smav SPA_ASYNC_REMOVE); 6614251636Sdelphij config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 6615251636Sdelphij if (spa->spa_ccw_fail_time == 0) { 6616251636Sdelphij config_task_suspended = B_FALSE; 6617251636Sdelphij } else { 6618251636Sdelphij config_task_suspended = 6619251636Sdelphij (gethrtime() - spa->spa_ccw_fail_time) < 6620251636Sdelphij (zfs_ccw_retry_interval * NANOSEC); 6621251636Sdelphij } 6622251636Sdelphij 6623251636Sdelphij return (non_config_tasks || (config_task && !config_task_suspended)); 6624251636Sdelphij} 6625251636Sdelphij 6626168404Spjdstatic void 6627168404Spjdspa_async_dispatch(spa_t *spa) 6628168404Spjd{ 6629168404Spjd mutex_enter(&spa->spa_async_lock); 6630251636Sdelphij if (spa_async_tasks_pending(spa) && 6631251636Sdelphij !spa->spa_async_suspended && 6632168404Spjd spa->spa_async_thread == NULL && 6633251636Sdelphij rootdir != NULL) 6634168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 6635168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 6636168404Spjd mutex_exit(&spa->spa_async_lock); 6637168404Spjd} 6638168404Spjd 6639253990Smavstatic void 6640253990Smavspa_async_dispatch_vd(spa_t *spa) 6641253990Smav{ 6642253990Smav mutex_enter(&spa->spa_async_lock); 6643253990Smav if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 6644253990Smav !spa->spa_async_suspended && 6645253990Smav spa->spa_async_thread_vd == NULL && 6646253990Smav rootdir != NULL) 6647253990Smav spa->spa_async_thread_vd = thread_create(NULL, 0, 6648253990Smav spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); 6649253990Smav mutex_exit(&spa->spa_async_lock); 6650253990Smav} 6651253990Smav 6652168404Spjdvoid 6653168404Spjdspa_async_request(spa_t *spa, int task) 6654168404Spjd{ 6655219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 6656168404Spjd mutex_enter(&spa->spa_async_lock); 6657168404Spjd spa->spa_async_tasks |= task; 
6658168404Spjd mutex_exit(&spa->spa_async_lock); 6659253990Smav spa_async_dispatch_vd(spa); 6660168404Spjd} 6661168404Spjd 6662168404Spjd/* 6663168404Spjd * ========================================================================== 6664168404Spjd * SPA syncing routines 6665168404Spjd * ========================================================================== 6666168404Spjd */ 6667168404Spjd 6668219089Spjdstatic int 6669219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6670168404Spjd{ 6671219089Spjd bpobj_t *bpo = arg; 6672219089Spjd bpobj_enqueue(bpo, bp, tx); 6673219089Spjd return (0); 6674219089Spjd} 6675168404Spjd 6676219089Spjdstatic int 6677219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6678219089Spjd{ 6679219089Spjd zio_t *zio = arg; 6680168404Spjd 6681219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 6682240868Spjd BP_GET_PSIZE(bp), zio->io_flags)); 6683219089Spjd return (0); 6684168404Spjd} 6685168404Spjd 6686258632Savg/* 6687258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6688258632Savg * amount of time spent syncing frees. 6689258632Savg */ 6690168404Spjdstatic void 6691258632Savgspa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6692258632Savg{ 6693258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6694258632Savg bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6695258632Savg VERIFY(zio_wait(zio) == 0); 6696258632Savg} 6697258632Savg 6698258632Savg/* 6699258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6700258632Savg * amount of time spent syncing deferred frees. 6701258632Savg */ 6702258632Savgstatic void 6703258632Savgspa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6704258632Savg{ 6705258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6706258632Savg VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6707258632Savg spa_free_sync_cb, zio, tx), ==, 0); 6708258632Savg VERIFY0(zio_wait(zio)); 6709258632Savg} 6710258632Savg 6711258632Savg 6712258632Savgstatic void 6713168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6714168404Spjd{ 6715168404Spjd char *packed = NULL; 6716185029Spjd size_t bufsize; 6717168404Spjd size_t nvsize = 0; 6718168404Spjd dmu_buf_t *db; 6719168404Spjd 6720168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6721168404Spjd 6722185029Spjd /* 6723185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6724260150Sdelphij * information. This avoids the dmu_buf_will_dirty() path and 6725185029Spjd * saves us a pre-read to get data we don't actually care about. 
6726185029Spjd */ 6727236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6728185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 6729168404Spjd 6730168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6731168404Spjd KM_SLEEP) == 0); 6732185029Spjd bzero(packed + nvsize, bufsize - nvsize); 6733168404Spjd 6734185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6735168404Spjd 6736185029Spjd kmem_free(packed, bufsize); 6737168404Spjd 6738168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6739168404Spjd dmu_buf_will_dirty(db, tx); 6740168404Spjd *(uint64_t *)db->db_data = nvsize; 6741168404Spjd dmu_buf_rele(db, FTAG); 6742168404Spjd} 6743168404Spjd 6744168404Spjdstatic void 6745185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6746185029Spjd const char *config, const char *entry) 6747168404Spjd{ 6748168404Spjd nvlist_t *nvroot; 6749185029Spjd nvlist_t **list; 6750168404Spjd int i; 6751168404Spjd 6752185029Spjd if (!sav->sav_sync) 6753168404Spjd return; 6754168404Spjd 6755168404Spjd /* 6756185029Spjd * Update the MOS nvlist describing the list of available devices. 6757185029Spjd * spa_validate_aux() will have already made sure this nvlist is 6758185029Spjd * valid and the vdevs are labeled appropriately. 6759168404Spjd */ 6760185029Spjd if (sav->sav_object == 0) { 6761185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6762185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6763185029Spjd sizeof (uint64_t), tx); 6764168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 6765185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6766185029Spjd &sav->sav_object, tx) == 0); 6767168404Spjd } 6768168404Spjd 6769168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6770185029Spjd if (sav->sav_count == 0) { 6771185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6772168404Spjd } else { 6773185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6774185029Spjd for (i = 0; i < sav->sav_count; i++) 6775185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6776219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 6777185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6778185029Spjd sav->sav_count) == 0); 6779185029Spjd for (i = 0; i < sav->sav_count; i++) 6780185029Spjd nvlist_free(list[i]); 6781185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 6782168404Spjd } 6783168404Spjd 6784185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6785168404Spjd nvlist_free(nvroot); 6786168404Spjd 6787185029Spjd sav->sav_sync = B_FALSE; 6788168404Spjd} 6789168404Spjd 6790299441Smav/* 6791299441Smav * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 6792299441Smav * The all-vdev ZAP must be empty. 
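 * This recursively walks the vdev tree and zap_add_int()s each vdev's
 * top-level and leaf ZAP object numbers into the AVZ object passed in.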
6793299441Smav */ 6794168404Spjdstatic void 6795299441Smavspa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 6796299441Smav{ 6797299441Smav spa_t *spa = vd->vdev_spa; 6798299441Smav if (vd->vdev_top_zap != 0) { 6799299441Smav VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6800299441Smav vd->vdev_top_zap, tx)); 6801299441Smav } 6802299441Smav if (vd->vdev_leaf_zap != 0) { 6803299441Smav VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6804299441Smav vd->vdev_leaf_zap, tx)); 6805299441Smav } 6806299441Smav for (uint64_t i = 0; i < vd->vdev_children; i++) { 6807299441Smav spa_avz_build(vd->vdev_child[i], avz, tx); 6808299441Smav } 6809299441Smav} 6810299441Smav 6811299441Smavstatic void 6812168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6813168404Spjd{ 6814168404Spjd nvlist_t *config; 6815168404Spjd 6816299441Smav /* 6817299441Smav * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 6818299441Smav * its config may not be dirty but we still need to build per-vdev ZAPs. 6819299441Smav * Similarly, if the pool is being assembled (e.g. after a split), we 6820299441Smav * need to rebuild the AVZ although the config may not be dirty. 6821299441Smav */ 6822299441Smav if (list_is_empty(&spa->spa_config_dirty_list) && 6823299441Smav spa->spa_avz_action == AVZ_ACTION_NONE) 6824168404Spjd return; 6825168404Spjd 6826185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6827168404Spjd 6828299441Smav ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 6829321540Smav spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 6830299441Smav spa->spa_all_vdev_zaps != 0); 6831299441Smav 6832299441Smav if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 6833299441Smav /* Make and build the new AVZ */ 6834299441Smav uint64_t new_avz = zap_create(spa->spa_meta_objset, 6835299441Smav DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 6836299441Smav spa_avz_build(spa->spa_root_vdev, new_avz, tx); 6837299441Smav 6838299441Smav /* Diff old AVZ with new one */ 6839299441Smav zap_cursor_t zc; 6840299441Smav zap_attribute_t za; 6841299441Smav 6842299441Smav for (zap_cursor_init(&zc, spa->spa_meta_objset, 6843299441Smav spa->spa_all_vdev_zaps); 6844299441Smav zap_cursor_retrieve(&zc, &za) == 0; 6845299441Smav zap_cursor_advance(&zc)) { 6846299441Smav uint64_t vdzap = za.za_first_integer; 6847299441Smav if (zap_lookup_int(spa->spa_meta_objset, new_avz, 6848299441Smav vdzap) == ENOENT) { 6849299441Smav /* 6850299441Smav * ZAP is listed in old AVZ but not in new one; 6851299441Smav * destroy it 6852299441Smav */ 6853299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 6854299441Smav tx)); 6855299441Smav } 6856299441Smav } 6857299441Smav 6858299441Smav zap_cursor_fini(&zc); 6859299441Smav 6860299441Smav /* Destroy the old AVZ */ 6861299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, 6862299441Smav spa->spa_all_vdev_zaps, tx)); 6863299441Smav 6864299441Smav /* Replace the old AVZ in the dir obj with the new one */ 6865299441Smav VERIFY0(zap_update(spa->spa_meta_objset, 6866299441Smav DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 6867299441Smav sizeof (new_avz), 1, &new_avz, tx)); 6868299441Smav 6869299441Smav spa->spa_all_vdev_zaps = new_avz; 6870299441Smav } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 6871299441Smav zap_cursor_t zc; 6872299441Smav zap_attribute_t za; 6873299441Smav 6874299441Smav /* Walk through the AVZ and destroy all listed ZAPs */ 6875299441Smav for (zap_cursor_init(&zc, spa->spa_meta_objset, 6876299441Smav spa->spa_all_vdev_zaps); 6877299441Smav 
zap_cursor_retrieve(&zc, &za) == 0; 6878299441Smav zap_cursor_advance(&zc)) { 6879299441Smav uint64_t zap = za.za_first_integer; 6880299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 6881299441Smav } 6882299441Smav 6883299441Smav zap_cursor_fini(&zc); 6884299441Smav 6885299441Smav /* Destroy and unlink the AVZ itself */ 6886299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, 6887299441Smav spa->spa_all_vdev_zaps, tx)); 6888299441Smav VERIFY0(zap_remove(spa->spa_meta_objset, 6889299441Smav DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 6890299441Smav spa->spa_all_vdev_zaps = 0; 6891299441Smav } 6892299441Smav 6893299441Smav if (spa->spa_all_vdev_zaps == 0) { 6894299441Smav spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 6895299441Smav DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 6896299441Smav DMU_POOL_VDEV_ZAP_MAP, tx); 6897299441Smav } 6898299441Smav spa->spa_avz_action = AVZ_ACTION_NONE; 6899299441Smav 6900299441Smav /* Create ZAPs for vdevs that don't have them. */ 6901299441Smav vdev_construct_zaps(spa->spa_root_vdev, tx); 6902299441Smav 6903185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 6904185029Spjd dmu_tx_get_txg(tx), B_FALSE); 6905185029Spjd 6906243505Smm /* 6907243505Smm * If we're upgrading the spa version then make sure that 6908243505Smm * the config object gets updated with the correct version. 6909243505Smm */ 6910243505Smm if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6911243505Smm fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6912243505Smm spa->spa_uberblock.ub_version); 6913243505Smm 6914185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6915185029Spjd 6916296528Smav nvlist_free(spa->spa_config_syncing); 6917168404Spjd spa->spa_config_syncing = config; 6918168404Spjd 6919168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6920168404Spjd} 6921168404Spjd 6922236884Smmstatic void 6923248571Smmspa_sync_version(void *arg, dmu_tx_t *tx) 6924236884Smm{ 6925248571Smm uint64_t *versionp = arg; 6926248571Smm uint64_t version = *versionp; 6927248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6928236884Smm 6929236884Smm /* 6930236884Smm * Setting the version is special cased when first creating the pool. 6931236884Smm */ 6932236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 6933236884Smm 6934247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6935236884Smm ASSERT(version >= spa_version(spa)); 6936236884Smm 6937236884Smm spa->spa_uberblock.ub_version = version; 6938236884Smm vdev_config_dirty(spa->spa_root_vdev); 6939248571Smm spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6940236884Smm} 6941236884Smm 6942185029Spjd/* 6943185029Spjd * Set zpool properties. 
6944185029Spjd */ 6945168404Spjdstatic void 6946248571Smmspa_sync_props(void *arg, dmu_tx_t *tx) 6947168404Spjd{ 6948248571Smm nvlist_t *nvp = arg; 6949248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6950185029Spjd objset_t *mos = spa->spa_meta_objset; 6951236884Smm nvpair_t *elem = NULL; 6952168404Spjd 6953168404Spjd mutex_enter(&spa->spa_props_lock); 6954168404Spjd 6955185029Spjd while ((elem = nvlist_next_nvpair(nvp, elem))) { 6956236884Smm uint64_t intval; 6957236884Smm char *strval, *fname; 6958236884Smm zpool_prop_t prop; 6959236884Smm const char *propname; 6960236884Smm zprop_type_t proptype; 6961259813Sdelphij spa_feature_t fid; 6962236884Smm 6963185029Spjd switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6964329493Smav case ZPOOL_PROP_INVAL: 6965236884Smm /* 6966236884Smm * We checked this earlier in spa_prop_validate(). 6967236884Smm */ 6968236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 6969236884Smm 6970236884Smm fname = strchr(nvpair_name(elem), '@') + 1; 6971259813Sdelphij VERIFY0(zfeature_lookup_name(fname, &fid)); 6972236884Smm 6973259813Sdelphij spa_feature_enable(spa, fid, tx); 6974248571Smm spa_history_log_internal(spa, "set", tx, 6975248571Smm "%s=enabled", nvpair_name(elem)); 6976236884Smm break; 6977236884Smm 6978185029Spjd case ZPOOL_PROP_VERSION: 6979258717Savg intval = fnvpair_value_uint64(elem); 6980185029Spjd /* 6981236884Smm * The version is synced separately before other 6982236884Smm * properties and should be correct by now. 6983185029Spjd */ 6984236884Smm ASSERT3U(spa_version(spa), >=, intval); 6985185029Spjd break; 6986168404Spjd 6987185029Spjd case ZPOOL_PROP_ALTROOT: 6988185029Spjd /* 6989185029Spjd * 'altroot' is a non-persistent property. It should 6990185029Spjd * have been set temporarily at creation or import time. 6991185029Spjd */ 6992185029Spjd ASSERT(spa->spa_root != NULL); 6993185029Spjd break; 6994168404Spjd 6995219089Spjd case ZPOOL_PROP_READONLY: 6996185029Spjd case ZPOOL_PROP_CACHEFILE: 6997185029Spjd /* 6998219089Spjd * 'readonly' and 'cachefile' are also non-persistent 6999219089Spjd * properties. 7000185029Spjd */ 7001168404Spjd break; 7002228103Smm case ZPOOL_PROP_COMMENT: 7003258717Savg strval = fnvpair_value_string(elem); 7004228103Smm if (spa->spa_comment != NULL) 7005228103Smm spa_strfree(spa->spa_comment); 7006228103Smm spa->spa_comment = spa_strdup(strval); 7007228103Smm /* 7008228103Smm * We need to dirty the configuration on all the vdevs 7009228103Smm * so that their labels get updated. It's unnecessary 7010228103Smm * to do this for pool creation since the vdev's 7011228103Smm * configuration has already been dirtied. 7012228103Smm */ 7013228103Smm if (tx->tx_txg != TXG_INITIAL) 7014228103Smm vdev_config_dirty(spa->spa_root_vdev); 7015248571Smm spa_history_log_internal(spa, "set", tx, 7016248571Smm "%s=%s", nvpair_name(elem), strval); 7017228103Smm break; 7018185029Spjd default: 7019185029Spjd /* 7020185029Spjd * Set pool property values in the poolprops mos object.
7021185029Spjd */ 7022185029Spjd if (spa->spa_pool_props_object == 0) { 7023236884Smm spa->spa_pool_props_object = 7024236884Smm zap_create_link(mos, DMU_OT_POOL_PROPS, 7025185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 7026236884Smm tx); 7027185029Spjd } 7028185029Spjd 7029185029Spjd /* normalize the property name */ 7030185029Spjd propname = zpool_prop_to_name(prop); 7031185029Spjd proptype = zpool_prop_get_type(prop); 7032185029Spjd 7033185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 7034185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 7035258717Savg strval = fnvpair_value_string(elem); 7036258717Savg VERIFY0(zap_update(mos, 7037185029Spjd spa->spa_pool_props_object, propname, 7038258717Savg 1, strlen(strval) + 1, strval, tx)); 7039248571Smm spa_history_log_internal(spa, "set", tx, 7040248571Smm "%s=%s", nvpair_name(elem), strval); 7041185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 7042258717Savg intval = fnvpair_value_uint64(elem); 7043185029Spjd 7044185029Spjd if (proptype == PROP_TYPE_INDEX) { 7045185029Spjd const char *unused; 7046258717Savg VERIFY0(zpool_prop_index_to_string( 7047258717Savg prop, intval, &unused)); 7048185029Spjd } 7049258717Savg VERIFY0(zap_update(mos, 7050185029Spjd spa->spa_pool_props_object, propname, 7051258717Savg 8, 1, &intval, tx)); 7052248571Smm spa_history_log_internal(spa, "set", tx, 7053248571Smm "%s=%lld", nvpair_name(elem), intval); 7054185029Spjd } else { 7055185029Spjd ASSERT(0); /* not allowed */ 7056185029Spjd } 7057185029Spjd 7058185029Spjd switch (prop) { 7059185029Spjd case ZPOOL_PROP_DELEGATION: 7060185029Spjd spa->spa_delegation = intval; 7061185029Spjd break; 7062185029Spjd case ZPOOL_PROP_BOOTFS: 7063185029Spjd spa->spa_bootfs = intval; 7064185029Spjd break; 7065185029Spjd case ZPOOL_PROP_FAILUREMODE: 7066185029Spjd spa->spa_failmode = intval; 7067185029Spjd break; 7068219089Spjd case ZPOOL_PROP_AUTOEXPAND: 7069219089Spjd spa->spa_autoexpand = intval; 7070219089Spjd if (tx->tx_txg != TXG_INITIAL) 7071219089Spjd spa_async_request(spa, 7072219089Spjd SPA_ASYNC_AUTOEXPAND); 7073219089Spjd break; 7074219089Spjd case ZPOOL_PROP_DEDUPDITTO: 7075219089Spjd spa->spa_dedup_ditto = intval; 7076219089Spjd break; 7077185029Spjd default: 7078185029Spjd break; 7079185029Spjd } 7080168404Spjd } 7081185029Spjd 7082168404Spjd } 7083185029Spjd 7084185029Spjd mutex_exit(&spa->spa_props_lock); 7085168404Spjd} 7086168404Spjd 7087168404Spjd/* 7088219089Spjd * Perform one-time upgrade on-disk changes. spa_version() does not 7089219089Spjd * reflect the new version this txg, so there must be no changes this 7090219089Spjd * txg to anything that the upgrade code depends on after it executes. 7091219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync 7092219089Spjd * tasks. 
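 * The upgrades handled here include creating the origin dataset, upgrading
 * clone and directory-clone bookkeeping, creating the feature ZAP objects,
 * activating LZ4_COMPRESS on pools where it is enabled but not yet active,
 * and writing the pool checksum salt if it is not already present.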
7093219089Spjd */ 7094219089Spjdstatic void 7095219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 7096219089Spjd{ 7097219089Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 7098219089Spjd 7099219089Spjd ASSERT(spa->spa_sync_pass == 1); 7100219089Spjd 7101248571Smm rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 7102248571Smm 7103219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 7104219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 7105219089Spjd dsl_pool_create_origin(dp, tx); 7106219089Spjd 7107219089Spjd /* Keeping the origin open increases spa_minref */ 7108219089Spjd spa->spa_minref += 3; 7109219089Spjd } 7110219089Spjd 7111219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 7112219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 7113219089Spjd dsl_pool_upgrade_clones(dp, tx); 7114219089Spjd } 7115219089Spjd 7116219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 7117219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 7118219089Spjd dsl_pool_upgrade_dir_clones(dp, tx); 7119219089Spjd 7120219089Spjd /* Keeping the freedir open increases spa_minref */ 7121219089Spjd spa->spa_minref += 3; 7122219089Spjd } 7123236884Smm 7124236884Smm if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 7125236884Smm spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 7126236884Smm spa_feature_create_zap_objects(spa, tx); 7127236884Smm } 7128268126Sdelphij 7129268126Sdelphij /* 7130268126Sdelphij * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 7131268126Sdelphij * when the ability to use lz4 compression for metadata was added. 7132268126Sdelphij * Old pools that have this feature enabled must be upgraded to have 7133268126Sdelphij * this feature active. 7134268126Sdelphij */ 7135268126Sdelphij if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 7136268126Sdelphij boolean_t lz4_en = spa_feature_is_enabled(spa, 7137268126Sdelphij SPA_FEATURE_LZ4_COMPRESS); 7138268126Sdelphij boolean_t lz4_ac = spa_feature_is_active(spa, 7139268126Sdelphij SPA_FEATURE_LZ4_COMPRESS); 7140268126Sdelphij 7141268126Sdelphij if (lz4_en && !lz4_ac) 7142268126Sdelphij spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 7143289422Smav 7144289422Smav /* 7145289422Smav * If we haven't written the salt, do so now. Note that the 7146289422Smav * feature may not be activated yet, but that's fine since 7147289422Smav * the presence of this ZAP entry is backwards compatible.
7149289422Smav */ 7150289422Smav if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 7151289422Smav DMU_POOL_CHECKSUM_SALT) == ENOENT) { 7152289422Smav VERIFY0(zap_add(spa->spa_meta_objset, 7153289422Smav DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 7154289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes), 7155289422Smav spa->spa_cksum_salt.zcs_bytes, tx)); 7156289422Smav } 7157289422Smav 7158248571Smm rrw_exit(&dp->dp_config_rwlock, FTAG); 7159219089Spjd} 7160219089Spjd 7161332525Smavstatic void 7162332525Smavvdev_indirect_state_sync_verify(vdev_t *vd) 7163332525Smav{ 7164332525Smav vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 7165332525Smav vdev_indirect_births_t *vib = vd->vdev_indirect_births; 7166332525Smav 7167332525Smav if (vd->vdev_ops == &vdev_indirect_ops) { 7168332525Smav ASSERT(vim != NULL); 7169332525Smav ASSERT(vib != NULL); 7170332525Smav } 7171332525Smav 7172332525Smav if (vdev_obsolete_sm_object(vd) != 0) { 7173332525Smav ASSERT(vd->vdev_obsolete_sm != NULL); 7174332525Smav ASSERT(vd->vdev_removing || 7175332525Smav vd->vdev_ops == &vdev_indirect_ops); 7176332525Smav ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 7177332525Smav ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 7178332525Smav 7179332525Smav ASSERT3U(vdev_obsolete_sm_object(vd), ==, 7180332525Smav space_map_object(vd->vdev_obsolete_sm)); 7181332525Smav ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 7182332525Smav space_map_allocated(vd->vdev_obsolete_sm)); 7183332525Smav } 7184332525Smav ASSERT(vd->vdev_obsolete_segments != NULL); 7185332525Smav 7186332525Smav /* 7187332525Smav * Since frees / remaps to an indirect vdev can only 7188332525Smav * happen in syncing context, the obsolete segments 7189332525Smav * tree must be empty when we start syncing. 7190332525Smav */ 7191332525Smav ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 7192332525Smav} 7193332525Smav 7194219089Spjd/* 7195168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 7196168404Spjd * part of the process, so we iterate until it converges. 7197168404Spjd */ 7198168404Spjdvoid 7199168404Spjdspa_sync(spa_t *spa, uint64_t txg) 7200168404Spjd{ 7201168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 7202168404Spjd objset_t *mos = spa->spa_meta_objset; 7203219089Spjd bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 7204168404Spjd vdev_t *rvd = spa->spa_root_vdev; 7205168404Spjd vdev_t *vd; 7206168404Spjd dmu_tx_t *tx; 7207185029Spjd int error; 7208307277Smav uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 7209307277Smav zfs_vdev_queue_depth_pct / 100; 7210168404Spjd 7211219089Spjd VERIFY(spa_writeable(spa)); 7212219089Spjd 7213168404Spjd /* 7214332525Smav * Wait for i/os issued in open context that need to complete 7215332525Smav * before this txg syncs. 7216332525Smav */ 7217332525Smav VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK])); 7218332525Smav spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0); 7219332525Smav 7220332525Smav /* 7221168404Spjd * Lock out configuration changes. 

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	mutex_enter(&spa->spa_alloc_lock);
	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
	mutex_exit(&spa->spa_alloc_lock);

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	tx = dmu_tx_create_assigned(dp, txg);

	spa->spa_sync_starttime = gethrtime();
#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
#else	/* !illumos */
#ifdef _KERNEL
	callout_schedule(&spa->spa_deadman_cycid,
	    hz * spa->spa_deadman_synctime / NANOSEC);
#endif
#endif	/* illumos */
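
	/*
	 * The deadman timer armed above fires if this sync has not completed
	 * within spa_deadman_synctime nanoseconds (converted to callout
	 * ticks on FreeBSD via hz * ns / NANOSEC); it is disarmed again
	 * right after the vdev configuration has been committed below.
	 */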

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * Set the top-level vdev's max queue depth. Evaluate each
	 * top-level's async write queue depth in case it changed.
	 * The max queue depth will not change in the middle of syncing
	 * out this txg.
	 */
	uint64_t queue_depth_total = 0;
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
		    !metaslab_group_initialized(mg))
			continue;

		/*
		 * It is safe to do a lock-free check here because only async
		 * allocations look at mg_max_alloc_queue_depth, and async
		 * allocations all happen from spa_sync().
		 */
		ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
		mg->mg_max_alloc_queue_depth = max_queue_depth;
		queue_depth_total += mg->mg_max_alloc_queue_depth;
	}
	metaslab_class_t *mc = spa_normal_class(spa);
	ASSERT0(refcount_count(&mc->mc_alloc_slots));
	mc->mc_alloc_max_slots = queue_depth_total;
	mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;

	ASSERT3U(mc->mc_alloc_max_slots, <=,
	    max_queue_depth * rvd->vdev_children);
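
	/*
	 * The totals computed above drive the allocation throttle:
	 * mc_alloc_max_slots caps how many async allocations may be in
	 * flight across the normal class at once, sized as the sum of the
	 * per-top-level-vdev queue depths, and it is only honored when
	 * zio_dva_throttle_enabled is set.
	 */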

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];
		vdev_indirect_state_sync_verify(vd);

		if (vdev_indirect_should_condense(vd)) {
			spa_condense_indirect_start_sync(vd, tx);
			break;
		}
	}

	/*
	 * Iterate to convergence.
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		if (pass < zfs_sync_pass_deferred_free) {
			spa_sync_frees(spa, free_bpl, tx);
		} else {
			/*
			 * We can not defer frees in pass 1, because
			 * we sync the deferred frees later in pass 1.
			 */
			ASSERT3U(pass, >, 1);
			bplist_iterate(free_bpl, bpobj_enqueue_cb,
			    &spa->spa_deferred_bpobj, tx);
		}

		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);

		if (spa->spa_vdev_removal != NULL)
			svr_sync(spa, tx);

		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
		    != NULL)
			vdev_sync(vd, txg);

		if (pass == 1) {
			spa_sync_upgrades(spa, tx);
			ASSERT3U(txg, >=,
			    spa->spa_uberblock.ub_rootbp.blk_birth);
			/*
			 * Note: We need to check if the MOS is dirty
			 * because we could have marked the MOS dirty
			 * without updating the uberblock (e.g. if we
			 * have sync tasks but no dirty user data).  We
			 * need to check the uberblock's rootbp because
			 * it is updated if we have synced out dirty
			 * data (though in this case the MOS will most
			 * likely also be dirty due to second order
			 * effects, we don't want to rely on that here).
			 */
			if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
			    !dmu_objset_is_dirty(mos, txg)) {
				/*
				 * Nothing changed on the first pass,
				 * therefore this TXG is a no-op.  Avoid
				 * syncing deferred frees, so that we
				 * can keep this TXG as a no-op.
				 */
				ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
				    txg));
				ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
				ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
				break;
			}
			spa_sync_deferred_frees(spa, tx);
		}

	} while (dmu_objset_is_dirty(mos, txg));
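
	/*
	 * The loop above converges because each pass only has to write the
	 * blocks dirtied by the previous pass (for instance, the MOS blocks
	 * that dsl_pool_sync() itself dirtied), and once we reach pass
	 * zfs_sync_pass_deferred_free new frees are queued to the deferred
	 * bpobj for a later txg instead of being processed immediately.
	 */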

	if (!list_is_empty(&spa->spa_config_dirty_list)) {
		/*
		 * Make sure that the number of ZAPs for all the vdevs matches
		 * the number of ZAPs in the per-vdev ZAP list. This only gets
		 * called if the config is dirty; otherwise there may be
		 * outstanding AVZ operations that weren't completed in
		 * spa_sync_config_object.
		 */
		uint64_t all_vdev_zap_entry_count;
		ASSERT0(zap_count(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
		    all_vdev_zap_entry_count);
	}

	if (spa->spa_vdev_removal != NULL) {
		ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
	}

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
				    !vdev_is_concrete(vd))
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);
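
	/*
	 * If vdev_config_sync() failed above, the pool was suspended and we
	 * blocked in zio_resume_wait() until an administrator cleared the
	 * condition (e.g. with "zpool clear") before retrying; the txg is
	 * not considered committed until the uberblock is safely on disk.
	 */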

#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else	/* !illumos */
#ifdef _KERNEL
	callout_drain(&spa->spa_deadman_cycid);
#endif
#endif	/* illumos */

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	dsl_pool_sync_done(dp, txg);

	mutex_enter(&spa->spa_alloc_lock);
	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
	mutex_exit(&spa->spa_alloc_lock);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	spa->spa_sync_pass = 0;

	/*
	 * Update the last synced uberblock here. We want to do this at
	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
	 * will be guaranteed that all the processing associated with
	 * that txg has been completed.
	 */
	spa->spa_ubsync = spa->spa_uberblock;
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
	spa_async_dispatch_vd(spa);
}
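
/*
 * For illustration: a caller that has dirtied data in open context and needs
 * it on stable storage typically does something like
 *
 *	txg_wait_synced(spa_get_dsl(spa), txg);
 *
 * which returns only once spa_sync() has completed for that txg, i.e. once
 * spa_last_synced_txg(spa) >= txg.
 */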

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE ||
		    !spa_writeable(spa) || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	ASSERT(spa_writeable(spa));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}
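
/*
 * In spa_has_spare() above, sav_pending holds the nvlists of spares that are
 * still in the process of being added and are not yet in sav_vdevs, so a
 * spare is found whether or not the add has completed.
 */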

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2, as a spare and as a
 * replacement.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

sysevent_t *
spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	sysevent_t *ev = NULL;
#ifdef _KERNEL
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);
	ASSERT(ev != NULL);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (hist_nvl != NULL) {
		fnvlist_merge((nvlist_t *)attr, hist_nvl);
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

done:
	if (attr)
		sysevent_free_attr(attr);

#endif
	return (ev);
}

void
spa_event_post(sysevent_t *ev)
{
#ifdef _KERNEL
	sysevent_id_t eid;

	(void) log_sysevent(ev, SE_SLEEP, &eid);
	sysevent_free(ev);
#endif
}

void
spa_event_discard(sysevent_t *ev)
{
#ifdef _KERNEL
	sysevent_free(ev);
#endif
}
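
/*
 * spa_event_create(), spa_event_post() and spa_event_discard() are split so
 * that a caller can assemble an event while it still holds locks or is in
 * syncing context, and then either post it or throw it away once the
 * operation has succeeded or failed.
 */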

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev and history nvl.  This
 * doesn't do anything in the userland libzpool, as we don't want consumers to
 * misinterpret ztest or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
}
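
/*
 * For example, pool creation posts its event with
 *
 *	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
 *
 * passing a vdev and/or a history nvlist only when the event concerns a
 * specific device or should carry extra payload.
 */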