spa.c revision 331399
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24321567Smav * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 25287745Sdelphij * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. 26247265Smm * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27286575Smav * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 28289422Smav * Copyright 2013 Saso Kiselkov. All rights reserved. 29296519Smav * Copyright (c) 2014 Integros [integros.com] 30331397Smav * Copyright 2017 Joyent, Inc. 31324010Savg * Copyright (c) 2017 Datto Inc. 32168404Spjd */ 33168404Spjd 34168404Spjd/* 35251629Sdelphij * SPA: Storage Pool Allocator 36251629Sdelphij * 37168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 38168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 39168404Spjd * pool. 
40168404Spjd */ 41168404Spjd 42168404Spjd#include <sys/zfs_context.h> 43168404Spjd#include <sys/fm/fs/zfs.h> 44168404Spjd#include <sys/spa_impl.h> 45168404Spjd#include <sys/zio.h> 46168404Spjd#include <sys/zio_checksum.h> 47168404Spjd#include <sys/dmu.h> 48168404Spjd#include <sys/dmu_tx.h> 49168404Spjd#include <sys/zap.h> 50168404Spjd#include <sys/zil.h> 51219089Spjd#include <sys/ddt.h> 52168404Spjd#include <sys/vdev_impl.h> 53168404Spjd#include <sys/metaslab.h> 54219089Spjd#include <sys/metaslab_impl.h> 55168404Spjd#include <sys/uberblock_impl.h> 56168404Spjd#include <sys/txg.h> 57168404Spjd#include <sys/avl.h> 58168404Spjd#include <sys/dmu_traverse.h> 59168404Spjd#include <sys/dmu_objset.h> 60168404Spjd#include <sys/unique.h> 61168404Spjd#include <sys/dsl_pool.h> 62168404Spjd#include <sys/dsl_dataset.h> 63168404Spjd#include <sys/dsl_dir.h> 64168404Spjd#include <sys/dsl_prop.h> 65168404Spjd#include <sys/dsl_synctask.h> 66168404Spjd#include <sys/fs/zfs.h> 67185029Spjd#include <sys/arc.h> 68168404Spjd#include <sys/callb.h> 69185029Spjd#include <sys/spa_boot.h> 70219089Spjd#include <sys/zfs_ioctl.h> 71219089Spjd#include <sys/dsl_scan.h> 72248571Smm#include <sys/dmu_send.h> 73248571Smm#include <sys/dsl_destroy.h> 74248571Smm#include <sys/dsl_userhold.h> 75236884Smm#include <sys/zfeature.h> 76219089Spjd#include <sys/zvol.h> 77240868Spjd#include <sys/trim_map.h> 78321610Smav#include <sys/abd.h> 79168404Spjd 80219089Spjd#ifdef _KERNEL 81219089Spjd#include <sys/callb.h> 82219089Spjd#include <sys/cpupart.h> 83219089Spjd#include <sys/zone.h> 84219089Spjd#endif /* _KERNEL */ 85219089Spjd 86185029Spjd#include "zfs_prop.h" 87185029Spjd#include "zfs_comutil.h" 88168404Spjd 89204073Spjd/* Check hostid on import? */ 90204073Spjdstatic int check_hostid = 1; 91204073Spjd 92251636Sdelphij/* 93251636Sdelphij * The interval, in seconds, at which failed configuration cache file writes 94251636Sdelphij * should be retried. 95251636Sdelphij */ 96251636Sdelphijstatic int zfs_ccw_retry_interval = 300; 97251636Sdelphij 98271785SwillSYSCTL_DECL(_vfs_zfs); 99271785SwillSYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0, 100271785Swill "Check hostid on import?"); 101271785SwillTUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval); 102271785SwillSYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW, 103271785Swill &zfs_ccw_retry_interval, 0, 104271785Swill "Configuration cache file write, retry after failure, interval (seconds)"); 105271785Swill 106219089Spjdtypedef enum zti_modes { 107258631Savg ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 108258631Savg ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ 109258631Savg ZTI_MODE_NULL, /* don't create a taskq */ 110258631Savg ZTI_NMODES 111219089Spjd} zti_modes_t; 112168712Spjd 113258631Savg#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 114258631Savg#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } 115258631Savg#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 116209962Smm 117258631Savg#define ZTI_N(n) ZTI_P(n, 1) 118258631Savg#define ZTI_ONE ZTI_N(1) 119209962Smm 120209962Smmtypedef struct zio_taskq_info { 121258631Savg zti_modes_t zti_mode; 122211931Smm uint_t zti_value; 123258631Savg uint_t zti_count; 124209962Smm} zio_taskq_info_t; 125209962Smm 126209962Smmstatic const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 127219089Spjd "issue", "issue_high", "intr", "intr_high" 128209962Smm}; 129209962Smm 130211931Smm/* 131258631Savg * This table defines the taskq settings for each ZFS I/O type. 
When 132258631Savg * initializing a pool, we use this table to create an appropriately sized 133258631Savg * taskq. Some operations are low volume and therefore have a small, static 134258631Savg * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 135258631Savg * macros. Other operations process a large amount of data; the ZTI_BATCH 136258631Savg * macro causes us to create a taskq oriented for throughput. Some operations 137258631Savg * are so high frequency and short-lived that the taskq itself can become a a 138258631Savg * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 139258631Savg * additional degree of parallelism specified by the number of threads per- 140258631Savg * taskq and the number of taskqs; when dispatching an event in this case, the 141258631Savg * particular taskq is chosen at random. 142258631Savg * 143258631Savg * The different taskq priorities are to handle the different contexts (issue 144258631Savg * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 145258631Savg * need to be handled with minimum delay. 146211931Smm */ 147211931Smmconst zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 148211931Smm /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 149258631Savg { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 150264670Sdelphij { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ 151258631Savg { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */ 152258631Savg { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 153258631Savg { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 154258631Savg { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 155209962Smm}; 156209962Smm 157331397Smavstatic sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, 158331397Smav const char *name); 159307113Smavstatic void spa_event_post(sysevent_t *ev); 160248571Smmstatic void spa_sync_version(void *arg, dmu_tx_t *tx); 161248571Smmstatic void spa_sync_props(void *arg, dmu_tx_t *tx); 162185029Spjdstatic boolean_t spa_has_active_shared_spare(spa_t *spa); 163219089Spjdstatic int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 164219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 165219089Spjd char **ereport); 166219089Spjdstatic void spa_vdev_resilver_done(spa_t *spa); 167185029Spjd 168258632Savguint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ 169219089Spjd#ifdef PSRSET_BIND 170219089Spjdid_t zio_taskq_psrset_bind = PS_NONE; 171219089Spjd#endif 172219089Spjd#ifdef SYSDC 173219089Spjdboolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 174314355Savguint_t zio_taskq_basedc = 80; /* base duty cycle */ 175219089Spjd#endif 176219089Spjd 177219089Spjdboolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 178243503Smmextern int zfs_sync_pass_deferred_free; 179219089Spjd 180168404Spjd/* 181219089Spjd * This (illegal) pool name is used when temporarily importing a spa_t in order 182219089Spjd * to get the vdev stats associated with the imported devices. 183219089Spjd */ 184219089Spjd#define TRYIMPORT_NAME "$import" 185219089Spjd 186219089Spjd/* 187168404Spjd * ========================================================================== 188185029Spjd * SPA properties routines 189185029Spjd * ========================================================================== 190185029Spjd */ 191185029Spjd 192185029Spjd/* 193185029Spjd * Add a (source=src, propname=propval) list to an nvlist. 
194185029Spjd */ 195185029Spjdstatic void 196185029Spjdspa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 197185029Spjd uint64_t intval, zprop_source_t src) 198185029Spjd{ 199185029Spjd const char *propname = zpool_prop_to_name(prop); 200185029Spjd nvlist_t *propval; 201185029Spjd 202185029Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 203185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 204185029Spjd 205185029Spjd if (strval != NULL) 206185029Spjd VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 207185029Spjd else 208185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 209185029Spjd 210185029Spjd VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 211185029Spjd nvlist_free(propval); 212185029Spjd} 213185029Spjd 214185029Spjd/* 215185029Spjd * Get property values from the spa configuration. 216185029Spjd */ 217185029Spjdstatic void 218185029Spjdspa_prop_get_config(spa_t *spa, nvlist_t **nvp) 219185029Spjd{ 220236155Smm vdev_t *rvd = spa->spa_root_vdev; 221236884Smm dsl_pool_t *pool = spa->spa_dsl_pool; 222269118Sdelphij uint64_t size, alloc, cap, version; 223185029Spjd zprop_source_t src = ZPROP_SRC_NONE; 224185029Spjd spa_config_dirent_t *dp; 225269118Sdelphij metaslab_class_t *mc = spa_normal_class(spa); 226185029Spjd 227185029Spjd ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 228185029Spjd 229236155Smm if (rvd != NULL) { 230219089Spjd alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 231219089Spjd size = metaslab_class_get_space(spa_normal_class(spa)); 232209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 233209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 234219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 235219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 236219089Spjd size - alloc, src); 237236155Smm 238269118Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 239269118Sdelphij metaslab_class_fragmentation(mc), src); 240269118Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 241269118Sdelphij metaslab_class_expandable_space(mc), src); 242219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 243219089Spjd (spa_mode(spa) == FREAD), src); 244185029Spjd 245219089Spjd cap = (size == 0) ? 0 : (alloc * 100 / size); 246209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 247185029Spjd 248219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 249219089Spjd ddt_get_pool_dedup_ratio(spa), src); 250219089Spjd 251209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 252236155Smm rvd->vdev_state, src); 253209962Smm 254209962Smm version = spa_version(spa); 255209962Smm if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 256209962Smm src = ZPROP_SRC_DEFAULT; 257209962Smm else 258209962Smm src = ZPROP_SRC_LOCAL; 259209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 260209962Smm } 261209962Smm 262236884Smm if (pool != NULL) { 263236884Smm /* 264236884Smm * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 265236884Smm * when opening pools before this version freedir will be NULL. 
266236884Smm */ 267268079Sdelphij if (pool->dp_free_dir != NULL) { 268236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 269275782Sdelphij dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 270275782Sdelphij src); 271236884Smm } else { 272236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 273236884Smm NULL, 0, src); 274236884Smm } 275268079Sdelphij 276268079Sdelphij if (pool->dp_leak_dir != NULL) { 277268079Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 278275782Sdelphij dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 279275782Sdelphij src); 280268079Sdelphij } else { 281268079Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 282268079Sdelphij NULL, 0, src); 283268079Sdelphij } 284236884Smm } 285236884Smm 286185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 287185029Spjd 288228103Smm if (spa->spa_comment != NULL) { 289228103Smm spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 290228103Smm 0, ZPROP_SRC_LOCAL); 291228103Smm } 292228103Smm 293185029Spjd if (spa->spa_root != NULL) 294185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 295185029Spjd 0, ZPROP_SRC_LOCAL); 296185029Spjd 297274337Sdelphij if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 298274337Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 299274337Sdelphij MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 300274337Sdelphij } else { 301274337Sdelphij spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 302274337Sdelphij SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 303274337Sdelphij } 304274337Sdelphij 305185029Spjd if ((dp = list_head(&spa->spa_config_list)) != NULL) { 306185029Spjd if (dp->scd_path == NULL) { 307185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 308185029Spjd "none", 0, ZPROP_SRC_LOCAL); 309185029Spjd } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 310185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 311185029Spjd dp->scd_path, 0, ZPROP_SRC_LOCAL); 312185029Spjd } 313185029Spjd } 314185029Spjd} 315185029Spjd 316185029Spjd/* 317185029Spjd * Get zpool property values. 318185029Spjd */ 319185029Spjdint 320185029Spjdspa_prop_get(spa_t *spa, nvlist_t **nvp) 321185029Spjd{ 322219089Spjd objset_t *mos = spa->spa_meta_objset; 323185029Spjd zap_cursor_t zc; 324185029Spjd zap_attribute_t za; 325185029Spjd int err; 326185029Spjd 327185029Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 328185029Spjd 329185029Spjd mutex_enter(&spa->spa_props_lock); 330185029Spjd 331185029Spjd /* 332185029Spjd * Get properties from the spa config. 333185029Spjd */ 334185029Spjd spa_prop_get_config(spa, nvp); 335185029Spjd 336185029Spjd /* If no pool property object, no more prop to get. */ 337219089Spjd if (mos == NULL || spa->spa_pool_props_object == 0) { 338185029Spjd mutex_exit(&spa->spa_props_lock); 339185029Spjd return (0); 340185029Spjd } 341185029Spjd 342185029Spjd /* 343185029Spjd * Get properties from the MOS pool property object. 
344185029Spjd */ 345185029Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 346185029Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 347185029Spjd zap_cursor_advance(&zc)) { 348185029Spjd uint64_t intval = 0; 349185029Spjd char *strval = NULL; 350185029Spjd zprop_source_t src = ZPROP_SRC_DEFAULT; 351185029Spjd zpool_prop_t prop; 352185029Spjd 353329493Smav if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) 354185029Spjd continue; 355185029Spjd 356185029Spjd switch (za.za_integer_length) { 357185029Spjd case 8: 358185029Spjd /* integer property */ 359185029Spjd if (za.za_first_integer != 360185029Spjd zpool_prop_default_numeric(prop)) 361185029Spjd src = ZPROP_SRC_LOCAL; 362185029Spjd 363185029Spjd if (prop == ZPOOL_PROP_BOOTFS) { 364185029Spjd dsl_pool_t *dp; 365185029Spjd dsl_dataset_t *ds = NULL; 366185029Spjd 367185029Spjd dp = spa_get_dsl(spa); 368248571Smm dsl_pool_config_enter(dp, FTAG); 369185029Spjd if (err = dsl_dataset_hold_obj(dp, 370185029Spjd za.za_first_integer, FTAG, &ds)) { 371248571Smm dsl_pool_config_exit(dp, FTAG); 372185029Spjd break; 373185029Spjd } 374185029Spjd 375307108Smav strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 376185029Spjd KM_SLEEP); 377185029Spjd dsl_dataset_name(ds, strval); 378185029Spjd dsl_dataset_rele(ds, FTAG); 379248571Smm dsl_pool_config_exit(dp, FTAG); 380185029Spjd } else { 381185029Spjd strval = NULL; 382185029Spjd intval = za.za_first_integer; 383185029Spjd } 384185029Spjd 385185029Spjd spa_prop_add_list(*nvp, prop, strval, intval, src); 386185029Spjd 387185029Spjd if (strval != NULL) 388307108Smav kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 389185029Spjd 390185029Spjd break; 391185029Spjd 392185029Spjd case 1: 393185029Spjd /* string property */ 394185029Spjd strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 395185029Spjd err = zap_lookup(mos, spa->spa_pool_props_object, 396185029Spjd za.za_name, 1, za.za_num_integers, strval); 397185029Spjd if (err) { 398185029Spjd kmem_free(strval, za.za_num_integers); 399185029Spjd break; 400185029Spjd } 401185029Spjd spa_prop_add_list(*nvp, prop, strval, 0, src); 402185029Spjd kmem_free(strval, za.za_num_integers); 403185029Spjd break; 404185029Spjd 405185029Spjd default: 406185029Spjd break; 407185029Spjd } 408185029Spjd } 409185029Spjd zap_cursor_fini(&zc); 410185029Spjd mutex_exit(&spa->spa_props_lock); 411185029Spjdout: 412185029Spjd if (err && err != ENOENT) { 413185029Spjd nvlist_free(*nvp); 414185029Spjd *nvp = NULL; 415185029Spjd return (err); 416185029Spjd } 417185029Spjd 418185029Spjd return (0); 419185029Spjd} 420185029Spjd 421185029Spjd/* 422185029Spjd * Validate the given pool properties nvlist and modify the list 423185029Spjd * for the property values to be set. 
424185029Spjd */ 425185029Spjdstatic int 426185029Spjdspa_prop_validate(spa_t *spa, nvlist_t *props) 427185029Spjd{ 428185029Spjd nvpair_t *elem; 429185029Spjd int error = 0, reset_bootfs = 0; 430247187Smm uint64_t objnum = 0; 431236884Smm boolean_t has_feature = B_FALSE; 432185029Spjd 433185029Spjd elem = NULL; 434185029Spjd while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 435185029Spjd uint64_t intval; 436236884Smm char *strval, *slash, *check, *fname; 437236884Smm const char *propname = nvpair_name(elem); 438236884Smm zpool_prop_t prop = zpool_name_to_prop(propname); 439185029Spjd 440236884Smm switch (prop) { 441329493Smav case ZPOOL_PROP_INVAL: 442236884Smm if (!zpool_prop_feature(propname)) { 443249195Smm error = SET_ERROR(EINVAL); 444236884Smm break; 445236884Smm } 446185029Spjd 447236884Smm /* 448236884Smm * Sanitize the input. 449236884Smm */ 450236884Smm if (nvpair_type(elem) != DATA_TYPE_UINT64) { 451249195Smm error = SET_ERROR(EINVAL); 452236884Smm break; 453236884Smm } 454185029Spjd 455236884Smm if (nvpair_value_uint64(elem, &intval) != 0) { 456249195Smm error = SET_ERROR(EINVAL); 457236884Smm break; 458236884Smm } 459236884Smm 460236884Smm if (intval != 0) { 461249195Smm error = SET_ERROR(EINVAL); 462236884Smm break; 463236884Smm } 464236884Smm 465236884Smm fname = strchr(propname, '@') + 1; 466236884Smm if (zfeature_lookup_name(fname, NULL) != 0) { 467249195Smm error = SET_ERROR(EINVAL); 468236884Smm break; 469236884Smm } 470236884Smm 471236884Smm has_feature = B_TRUE; 472236884Smm break; 473236884Smm 474185029Spjd case ZPOOL_PROP_VERSION: 475185029Spjd error = nvpair_value_uint64(elem, &intval); 476185029Spjd if (!error && 477236884Smm (intval < spa_version(spa) || 478236884Smm intval > SPA_VERSION_BEFORE_FEATURES || 479236884Smm has_feature)) 480249195Smm error = SET_ERROR(EINVAL); 481185029Spjd break; 482185029Spjd 483185029Spjd case ZPOOL_PROP_DELEGATION: 484185029Spjd case ZPOOL_PROP_AUTOREPLACE: 485185029Spjd case ZPOOL_PROP_LISTSNAPS: 486219089Spjd case ZPOOL_PROP_AUTOEXPAND: 487185029Spjd error = nvpair_value_uint64(elem, &intval); 488185029Spjd if (!error && intval > 1) 489249195Smm error = SET_ERROR(EINVAL); 490185029Spjd break; 491185029Spjd 492185029Spjd case ZPOOL_PROP_BOOTFS: 493209962Smm /* 494209962Smm * If the pool version is less than SPA_VERSION_BOOTFS, 495209962Smm * or the pool is still being created (version == 0), 496209962Smm * the bootfs property cannot be set. 
497209962Smm */ 498185029Spjd if (spa_version(spa) < SPA_VERSION_BOOTFS) { 499249195Smm error = SET_ERROR(ENOTSUP); 500185029Spjd break; 501185029Spjd } 502185029Spjd 503185029Spjd /* 504185029Spjd * Make sure the vdev config is bootable 505185029Spjd */ 506185029Spjd if (!vdev_is_bootable(spa->spa_root_vdev)) { 507249195Smm error = SET_ERROR(ENOTSUP); 508185029Spjd break; 509185029Spjd } 510185029Spjd 511185029Spjd reset_bootfs = 1; 512185029Spjd 513185029Spjd error = nvpair_value_string(elem, &strval); 514185029Spjd 515185029Spjd if (!error) { 516236884Smm objset_t *os; 517274337Sdelphij uint64_t propval; 518185029Spjd 519185029Spjd if (strval == NULL || strval[0] == '\0') { 520185029Spjd objnum = zpool_prop_default_numeric( 521185029Spjd ZPOOL_PROP_BOOTFS); 522185029Spjd break; 523185029Spjd } 524185029Spjd 525219089Spjd if (error = dmu_objset_hold(strval, FTAG, &os)) 526185029Spjd break; 527185029Spjd 528274337Sdelphij /* 529274337Sdelphij * Must be ZPL, and its property settings 530274337Sdelphij * must be supported by GRUB (compression 531274337Sdelphij * is not gzip, and large blocks are not used). 532274337Sdelphij */ 533219089Spjd 534219089Spjd if (dmu_objset_type(os) != DMU_OST_ZFS) { 535249195Smm error = SET_ERROR(ENOTSUP); 536248571Smm } else if ((error = 537248571Smm dsl_prop_get_int_ds(dmu_objset_ds(os), 538185029Spjd zfs_prop_to_name(ZFS_PROP_COMPRESSION), 539274337Sdelphij &propval)) == 0 && 540274337Sdelphij !BOOTFS_COMPRESS_VALID(propval)) { 541249195Smm error = SET_ERROR(ENOTSUP); 542274337Sdelphij } else if ((error = 543274337Sdelphij dsl_prop_get_int_ds(dmu_objset_ds(os), 544274337Sdelphij zfs_prop_to_name(ZFS_PROP_RECORDSIZE), 545274337Sdelphij &propval)) == 0 && 546274337Sdelphij propval > SPA_OLD_MAXBLOCKSIZE) { 547274337Sdelphij error = SET_ERROR(ENOTSUP); 548185029Spjd } else { 549185029Spjd objnum = dmu_objset_id(os); 550185029Spjd } 551219089Spjd dmu_objset_rele(os, FTAG); 552185029Spjd } 553185029Spjd break; 554185029Spjd 555185029Spjd case ZPOOL_PROP_FAILUREMODE: 556185029Spjd error = nvpair_value_uint64(elem, &intval); 557185029Spjd if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 558185029Spjd intval > ZIO_FAILURE_MODE_PANIC)) 559249195Smm error = SET_ERROR(EINVAL); 560185029Spjd 561185029Spjd /* 562185029Spjd * This is a special case which only occurs when 563185029Spjd * the pool has completely failed. This allows 564185029Spjd * the user to change the in-core failmode property 565185029Spjd * without syncing it out to disk (I/Os might 566185029Spjd * currently be blocked). We do this by returning 567185029Spjd * EIO to the caller (spa_prop_set) to trick it 568185029Spjd * into thinking we encountered a property validation 569185029Spjd * error. 
570185029Spjd */ 571185029Spjd if (!error && spa_suspended(spa)) { 572185029Spjd spa->spa_failmode = intval; 573249195Smm error = SET_ERROR(EIO); 574185029Spjd } 575185029Spjd break; 576185029Spjd 577185029Spjd case ZPOOL_PROP_CACHEFILE: 578185029Spjd if ((error = nvpair_value_string(elem, &strval)) != 0) 579185029Spjd break; 580185029Spjd 581185029Spjd if (strval[0] == '\0') 582185029Spjd break; 583185029Spjd 584185029Spjd if (strcmp(strval, "none") == 0) 585185029Spjd break; 586185029Spjd 587185029Spjd if (strval[0] != '/') { 588249195Smm error = SET_ERROR(EINVAL); 589185029Spjd break; 590185029Spjd } 591185029Spjd 592185029Spjd slash = strrchr(strval, '/'); 593185029Spjd ASSERT(slash != NULL); 594185029Spjd 595185029Spjd if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 596185029Spjd strcmp(slash, "/..") == 0) 597249195Smm error = SET_ERROR(EINVAL); 598185029Spjd break; 599219089Spjd 600228103Smm case ZPOOL_PROP_COMMENT: 601228103Smm if ((error = nvpair_value_string(elem, &strval)) != 0) 602228103Smm break; 603228103Smm for (check = strval; *check != '\0'; check++) { 604228103Smm /* 605228103Smm * The kernel doesn't have an easy isprint() 606228103Smm * check. For this kernel check, we merely 607228103Smm * check ASCII apart from DEL. Fix this if 608228103Smm * there is an easy-to-use kernel isprint(). 609228103Smm */ 610228103Smm if (*check >= 0x7f) { 611249195Smm error = SET_ERROR(EINVAL); 612228103Smm break; 613228103Smm } 614228103Smm } 615228103Smm if (strlen(strval) > ZPROP_MAX_COMMENT) 616228103Smm error = E2BIG; 617228103Smm break; 618228103Smm 619219089Spjd case ZPOOL_PROP_DEDUPDITTO: 620219089Spjd if (spa_version(spa) < SPA_VERSION_DEDUP) 621249195Smm error = SET_ERROR(ENOTSUP); 622219089Spjd else 623219089Spjd error = nvpair_value_uint64(elem, &intval); 624219089Spjd if (error == 0 && 625219089Spjd intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 626249195Smm error = SET_ERROR(EINVAL); 627219089Spjd break; 628185029Spjd } 629185029Spjd 630185029Spjd if (error) 631185029Spjd break; 632185029Spjd } 633185029Spjd 634185029Spjd if (!error && reset_bootfs) { 635185029Spjd error = nvlist_remove(props, 636185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 637185029Spjd 638185029Spjd if (!error) { 639185029Spjd error = nvlist_add_uint64(props, 640185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 641185029Spjd } 642185029Spjd } 643185029Spjd 644185029Spjd return (error); 645185029Spjd} 646185029Spjd 647209962Smmvoid 648209962Smmspa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 649209962Smm{ 650209962Smm char *cachefile; 651209962Smm spa_config_dirent_t *dp; 652209962Smm 653209962Smm if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 654209962Smm &cachefile) != 0) 655209962Smm return; 656209962Smm 657209962Smm dp = kmem_alloc(sizeof (spa_config_dirent_t), 658209962Smm KM_SLEEP); 659209962Smm 660209962Smm if (cachefile[0] == '\0') 661209962Smm dp->scd_path = spa_strdup(spa_config_path); 662209962Smm else if (strcmp(cachefile, "none") == 0) 663209962Smm dp->scd_path = NULL; 664209962Smm else 665209962Smm dp->scd_path = spa_strdup(cachefile); 666209962Smm 667209962Smm list_insert_head(&spa->spa_config_list, dp); 668209962Smm if (need_sync) 669209962Smm spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 670209962Smm} 671209962Smm 672185029Spjdint 673185029Spjdspa_prop_set(spa_t *spa, nvlist_t *nvp) 674185029Spjd{ 675185029Spjd int error; 676236884Smm nvpair_t *elem = NULL; 677209962Smm boolean_t need_sync = B_FALSE; 678185029Spjd 
679185029Spjd if ((error = spa_prop_validate(spa, nvp)) != 0) 680185029Spjd return (error); 681185029Spjd 682209962Smm while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 683236884Smm zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 684209962Smm 685219089Spjd if (prop == ZPOOL_PROP_CACHEFILE || 686219089Spjd prop == ZPOOL_PROP_ALTROOT || 687219089Spjd prop == ZPOOL_PROP_READONLY) 688209962Smm continue; 689209962Smm 690329493Smav if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 691236884Smm uint64_t ver; 692236884Smm 693236884Smm if (prop == ZPOOL_PROP_VERSION) { 694236884Smm VERIFY(nvpair_value_uint64(elem, &ver) == 0); 695236884Smm } else { 696236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 697236884Smm ver = SPA_VERSION_FEATURES; 698236884Smm need_sync = B_TRUE; 699236884Smm } 700236884Smm 701236884Smm /* Save time if the version is already set. */ 702236884Smm if (ver == spa_version(spa)) 703236884Smm continue; 704236884Smm 705236884Smm /* 706236884Smm * In addition to the pool directory object, we might 707236884Smm * create the pool properties object, the features for 708236884Smm * read object, the features for write object, or the 709236884Smm * feature descriptions object. 710236884Smm */ 711248571Smm error = dsl_sync_task(spa->spa_name, NULL, 712268473Sdelphij spa_sync_version, &ver, 713268473Sdelphij 6, ZFS_SPACE_CHECK_RESERVED); 714236884Smm if (error) 715236884Smm return (error); 716236884Smm continue; 717236884Smm } 718236884Smm 719209962Smm need_sync = B_TRUE; 720209962Smm break; 721209962Smm } 722209962Smm 723236884Smm if (need_sync) { 724248571Smm return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 725268473Sdelphij nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 726236884Smm } 727236884Smm 728236884Smm return (0); 729185029Spjd} 730185029Spjd 731185029Spjd/* 732185029Spjd * If the bootfs property value is dsobj, clear it. 
733185029Spjd */ 734185029Spjdvoid 735185029Spjdspa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 736185029Spjd{ 737185029Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 738185029Spjd VERIFY(zap_remove(spa->spa_meta_objset, 739185029Spjd spa->spa_pool_props_object, 740185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 741185029Spjd spa->spa_bootfs = 0; 742185029Spjd } 743185029Spjd} 744185029Spjd 745239620Smm/*ARGSUSED*/ 746239620Smmstatic int 747248571Smmspa_change_guid_check(void *arg, dmu_tx_t *tx) 748239620Smm{ 749248571Smm uint64_t *newguid = arg; 750248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 751239620Smm vdev_t *rvd = spa->spa_root_vdev; 752239620Smm uint64_t vdev_state; 753239620Smm 754239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 755239620Smm vdev_state = rvd->vdev_state; 756239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 757239620Smm 758239620Smm if (vdev_state != VDEV_STATE_HEALTHY) 759249195Smm return (SET_ERROR(ENXIO)); 760239620Smm 761239620Smm ASSERT3U(spa_guid(spa), !=, *newguid); 762239620Smm 763239620Smm return (0); 764239620Smm} 765239620Smm 766239620Smmstatic void 767248571Smmspa_change_guid_sync(void *arg, dmu_tx_t *tx) 768239620Smm{ 769248571Smm uint64_t *newguid = arg; 770248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 771239620Smm uint64_t oldguid; 772239620Smm vdev_t *rvd = spa->spa_root_vdev; 773239620Smm 774239620Smm oldguid = spa_guid(spa); 775239620Smm 776239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 777239620Smm rvd->vdev_guid = *newguid; 778239620Smm rvd->vdev_guid_sum += (*newguid - oldguid); 779239620Smm vdev_config_dirty(rvd); 780239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 781239620Smm 782248571Smm spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 783239620Smm oldguid, *newguid); 784239620Smm} 785239620Smm 786185029Spjd/* 787228103Smm * Change the GUID for the pool. This is done so that we can later 788228103Smm * re-import a pool built from a clone of our own vdevs. We will modify 789228103Smm * the root vdev's guid, our own pool guid, and then mark all of our 790228103Smm * vdevs dirty. Note that we must make sure that all our vdevs are 791228103Smm * online when we do this, or else any vdevs that weren't present 792228103Smm * would be orphaned from our pool. We are also going to issue a 793228103Smm * sysevent to update any watchers. 
794228103Smm */ 795228103Smmint 796228103Smmspa_change_guid(spa_t *spa) 797228103Smm{ 798239620Smm int error; 799239620Smm uint64_t guid; 800228103Smm 801254074Sdelphij mutex_enter(&spa->spa_vdev_top_lock); 802239620Smm mutex_enter(&spa_namespace_lock); 803239620Smm guid = spa_generate_guid(NULL); 804228103Smm 805248571Smm error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 806268473Sdelphij spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 807228103Smm 808239620Smm if (error == 0) { 809239620Smm spa_config_sync(spa, B_FALSE, B_TRUE); 810331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 811239620Smm } 812228103Smm 813239620Smm mutex_exit(&spa_namespace_lock); 814254074Sdelphij mutex_exit(&spa->spa_vdev_top_lock); 815228103Smm 816239620Smm return (error); 817228103Smm} 818228103Smm 819228103Smm/* 820185029Spjd * ========================================================================== 821168404Spjd * SPA state manipulation (open/create/destroy/import/export) 822168404Spjd * ========================================================================== 823168404Spjd */ 824168404Spjd 825168404Spjdstatic int 826168404Spjdspa_error_entry_compare(const void *a, const void *b) 827168404Spjd{ 828168404Spjd spa_error_entry_t *sa = (spa_error_entry_t *)a; 829168404Spjd spa_error_entry_t *sb = (spa_error_entry_t *)b; 830168404Spjd int ret; 831168404Spjd 832168404Spjd ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 833268123Sdelphij sizeof (zbookmark_phys_t)); 834168404Spjd 835168404Spjd if (ret < 0) 836168404Spjd return (-1); 837168404Spjd else if (ret > 0) 838168404Spjd return (1); 839168404Spjd else 840168404Spjd return (0); 841168404Spjd} 842168404Spjd 843168404Spjd/* 844168404Spjd * Utility function which retrieves copies of the current logs and 845168404Spjd * re-initializes them in the process. 
846168404Spjd */ 847168404Spjdvoid 848168404Spjdspa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 849168404Spjd{ 850168404Spjd ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 851168404Spjd 852168404Spjd bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 853168404Spjd bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 854168404Spjd 855168404Spjd avl_create(&spa->spa_errlist_scrub, 856168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 857168404Spjd offsetof(spa_error_entry_t, se_avl)); 858168404Spjd avl_create(&spa->spa_errlist_last, 859168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 860168404Spjd offsetof(spa_error_entry_t, se_avl)); 861168404Spjd} 862168404Spjd 863258631Savgstatic void 864258631Savgspa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 865168404Spjd{ 866258631Savg const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 867258631Savg enum zti_modes mode = ztip->zti_mode; 868258631Savg uint_t value = ztip->zti_value; 869258631Savg uint_t count = ztip->zti_count; 870258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 871258631Savg char name[32]; 872258630Savg uint_t flags = 0; 873219089Spjd boolean_t batch = B_FALSE; 874168404Spjd 875258631Savg if (mode == ZTI_MODE_NULL) { 876258631Savg tqs->stqs_count = 0; 877258631Savg tqs->stqs_taskq = NULL; 878258631Savg return; 879258631Savg } 880168404Spjd 881258631Savg ASSERT3U(count, >, 0); 882168404Spjd 883258631Savg tqs->stqs_count = count; 884258631Savg tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 885219089Spjd 886258632Savg switch (mode) { 887258632Savg case ZTI_MODE_FIXED: 888258632Savg ASSERT3U(value, >=, 1); 889258632Savg value = MAX(value, 1); 890258632Savg break; 891219089Spjd 892258632Savg case ZTI_MODE_BATCH: 893258632Savg batch = B_TRUE; 894258632Savg flags |= TASKQ_THREADS_CPU_PCT; 895258632Savg value = zio_taskq_batch_pct; 896258632Savg break; 897219089Spjd 898258632Savg default: 899258632Savg panic("unrecognized mode for %s_%s taskq (%u:%u) in " 900258632Savg "spa_activate()", 901258632Savg zio_type_name[t], zio_taskq_types[q], mode, value); 902258632Savg break; 903258632Savg } 904258631Savg 905258632Savg for (uint_t i = 0; i < count; i++) { 906258632Savg taskq_t *tq; 907258631Savg 908258631Savg if (count > 1) { 909258631Savg (void) snprintf(name, sizeof (name), "%s_%s_%u", 910258631Savg zio_type_name[t], zio_taskq_types[q], i); 911258631Savg } else { 912258631Savg (void) snprintf(name, sizeof (name), "%s_%s", 913258631Savg zio_type_name[t], zio_taskq_types[q]); 914258631Savg } 915258631Savg 916219089Spjd#ifdef SYSDC 917258631Savg if (zio_taskq_sysdc && spa->spa_proc != &p0) { 918258631Savg if (batch) 919258631Savg flags |= TASKQ_DC_BATCH; 920219089Spjd 921258631Savg tq = taskq_create_sysdc(name, value, 50, INT_MAX, 922258631Savg spa->spa_proc, zio_taskq_basedc, flags); 923258631Savg } else { 924258631Savg#endif 925258632Savg pri_t pri = maxclsyspri; 926258632Savg /* 927258632Savg * The write issue taskq can be extremely CPU 928258632Savg * intensive. Run it at slightly lower priority 929258632Savg * than the other taskqs. 930314858Savg * FreeBSD notes: 931314858Savg * - numerically higher priorities are lower priorities; 932314858Savg * - if priorities divided by four (RQ_PPQ) are equal 933314858Savg * then a difference between them is insignificant. 
934258632Savg */ 935258632Savg if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) 936314858Savg#ifdef illumos 937314858Savg pri--; 938314858Savg#else 939314858Savg pri += 4; 940314858Savg#endif 941258632Savg 942258632Savg tq = taskq_create_proc(name, value, pri, 50, 943258631Savg INT_MAX, spa->spa_proc, flags); 944258631Savg#ifdef SYSDC 945258631Savg } 946258631Savg#endif 947258631Savg 948258631Savg tqs->stqs_taskq[i] = tq; 949219089Spjd } 950219089Spjd} 951219089Spjd 952219089Spjdstatic void 953258631Savgspa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 954258631Savg{ 955258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 956258631Savg 957258631Savg if (tqs->stqs_taskq == NULL) { 958258631Savg ASSERT0(tqs->stqs_count); 959258631Savg return; 960258631Savg } 961258631Savg 962258631Savg for (uint_t i = 0; i < tqs->stqs_count; i++) { 963258631Savg ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 964258631Savg taskq_destroy(tqs->stqs_taskq[i]); 965258631Savg } 966258631Savg 967258631Savg kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 968258631Savg tqs->stqs_taskq = NULL; 969258631Savg} 970258631Savg 971258631Savg/* 972258631Savg * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 973258631Savg * Note that a type may have multiple discrete taskqs to avoid lock contention 974258631Savg * on the taskq itself. In that case we choose which taskq at random by using 975258631Savg * the low bits of gethrtime(). 976258631Savg */ 977258631Savgvoid 978258631Savgspa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 979258631Savg task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) 980258631Savg{ 981258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 982258631Savg taskq_t *tq; 983258631Savg 984258631Savg ASSERT3P(tqs->stqs_taskq, !=, NULL); 985258631Savg ASSERT3U(tqs->stqs_count, !=, 0); 986258631Savg 987258631Savg if (tqs->stqs_count == 1) { 988258631Savg tq = tqs->stqs_taskq[0]; 989258631Savg } else { 990267038Sbdrewery#ifdef _KERNEL 991267029Smav tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count]; 992267038Sbdrewery#else 993267038Sbdrewery tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; 994267038Sbdrewery#endif 995258631Savg } 996258631Savg 997258631Savg taskq_dispatch_ent(tq, func, arg, flags, ent); 998258631Savg} 999258631Savg 1000258631Savgstatic void 1001219089Spjdspa_create_zio_taskqs(spa_t *spa) 1002219089Spjd{ 1003185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 1004185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1005258631Savg spa_taskqs_init(spa, t, q); 1006219089Spjd } 1007219089Spjd } 1008219089Spjd} 1009209962Smm 1010219089Spjd#ifdef _KERNEL 1011219089Spjd#ifdef SPA_PROCESS 1012219089Spjdstatic void 1013219089Spjdspa_thread(void *arg) 1014219089Spjd{ 1015219089Spjd callb_cpr_t cprinfo; 1016209962Smm 1017219089Spjd spa_t *spa = arg; 1018219089Spjd user_t *pu = PTOU(curproc); 1019209962Smm 1020219089Spjd CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1021219089Spjd spa->spa_name); 1022209962Smm 1023219089Spjd ASSERT(curproc != &p0); 1024219089Spjd (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1025219089Spjd "zpool-%s", spa->spa_name); 1026219089Spjd (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1027211931Smm 1028219089Spjd#ifdef PSRSET_BIND 1029219089Spjd /* bind this thread to the requested psrset */ 1030219089Spjd if (zio_taskq_psrset_bind != PS_NONE) { 1031219089Spjd pool_lock(); 1032219089Spjd mutex_enter(&cpu_lock); 1033219089Spjd mutex_enter(&pidlock); 
1034219089Spjd mutex_enter(&curproc->p_lock); 1035219089Spjd 1036219089Spjd if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1037219089Spjd 0, NULL, NULL) == 0) { 1038219089Spjd curthread->t_bind_pset = zio_taskq_psrset_bind; 1039219089Spjd } else { 1040219089Spjd cmn_err(CE_WARN, 1041219089Spjd "Couldn't bind process for zfs pool \"%s\" to " 1042219089Spjd "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1043219089Spjd } 1044219089Spjd 1045219089Spjd mutex_exit(&curproc->p_lock); 1046219089Spjd mutex_exit(&pidlock); 1047219089Spjd mutex_exit(&cpu_lock); 1048219089Spjd pool_unlock(); 1049219089Spjd } 1050219089Spjd#endif 1051219089Spjd 1052219089Spjd#ifdef SYSDC 1053219089Spjd if (zio_taskq_sysdc) { 1054219089Spjd sysdc_thread_enter(curthread, 100, 0); 1055219089Spjd } 1056219089Spjd#endif 1057219089Spjd 1058219089Spjd spa->spa_proc = curproc; 1059219089Spjd spa->spa_did = curthread->t_did; 1060219089Spjd 1061219089Spjd spa_create_zio_taskqs(spa); 1062219089Spjd 1063219089Spjd mutex_enter(&spa->spa_proc_lock); 1064219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1065219089Spjd 1066219089Spjd spa->spa_proc_state = SPA_PROC_ACTIVE; 1067219089Spjd cv_broadcast(&spa->spa_proc_cv); 1068219089Spjd 1069219089Spjd CALLB_CPR_SAFE_BEGIN(&cprinfo); 1070219089Spjd while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1071219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1072219089Spjd CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1073219089Spjd 1074219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1075219089Spjd spa->spa_proc_state = SPA_PROC_GONE; 1076219089Spjd spa->spa_proc = &p0; 1077219089Spjd cv_broadcast(&spa->spa_proc_cv); 1078219089Spjd CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1079219089Spjd 1080219089Spjd mutex_enter(&curproc->p_lock); 1081219089Spjd lwp_exit(); 1082219089Spjd} 1083219089Spjd#endif /* SPA_PROCESS */ 1084219089Spjd#endif 1085219089Spjd 1086219089Spjd/* 1087219089Spjd * Activate an uninitialized pool. 1088219089Spjd */ 1089219089Spjdstatic void 1090219089Spjdspa_activate(spa_t *spa, int mode) 1091219089Spjd{ 1092219089Spjd ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1093219089Spjd 1094219089Spjd spa->spa_state = POOL_STATE_ACTIVE; 1095219089Spjd spa->spa_mode = mode; 1096219089Spjd 1097219089Spjd spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 1098219089Spjd spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 1099219089Spjd 1100219089Spjd /* Try to create a covering process */ 1101219089Spjd mutex_enter(&spa->spa_proc_lock); 1102219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1103219089Spjd ASSERT(spa->spa_proc == &p0); 1104219089Spjd spa->spa_did = 0; 1105219089Spjd 1106219089Spjd#ifdef SPA_PROCESS 1107219089Spjd /* Only create a process if we're going to be around a while. 
*/ 1108219089Spjd if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1109219089Spjd if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1110219089Spjd NULL, 0) == 0) { 1111219089Spjd spa->spa_proc_state = SPA_PROC_CREATED; 1112219089Spjd while (spa->spa_proc_state == SPA_PROC_CREATED) { 1113219089Spjd cv_wait(&spa->spa_proc_cv, 1114219089Spjd &spa->spa_proc_lock); 1115209962Smm } 1116219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1117219089Spjd ASSERT(spa->spa_proc != &p0); 1118219089Spjd ASSERT(spa->spa_did != 0); 1119219089Spjd } else { 1120219089Spjd#ifdef _KERNEL 1121219089Spjd cmn_err(CE_WARN, 1122219089Spjd "Couldn't create process for zfs pool \"%s\"\n", 1123219089Spjd spa->spa_name); 1124219089Spjd#endif 1125185029Spjd } 1126168404Spjd } 1127219089Spjd#endif /* SPA_PROCESS */ 1128219089Spjd mutex_exit(&spa->spa_proc_lock); 1129168404Spjd 1130219089Spjd /* If we didn't create a process, we need to create our taskqs. */ 1131219089Spjd ASSERT(spa->spa_proc == &p0); 1132219089Spjd if (spa->spa_proc == &p0) { 1133219089Spjd spa_create_zio_taskqs(spa); 1134219089Spjd } 1135219089Spjd 1136240868Spjd /* 1137240868Spjd * Start TRIM thread. 1138240868Spjd */ 1139240868Spjd trim_thread_create(spa); 1140240868Spjd 1141185029Spjd list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1142185029Spjd offsetof(vdev_t, vdev_config_dirty_node)); 1143286575Smav list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1144286575Smav offsetof(objset_t, os_evicting_node)); 1145185029Spjd list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1146185029Spjd offsetof(vdev_t, vdev_state_dirty_node)); 1147168404Spjd 1148321567Smav txg_list_create(&spa->spa_vdev_txg_list, spa, 1149168404Spjd offsetof(struct vdev, vdev_txg_node)); 1150168404Spjd 1151168404Spjd avl_create(&spa->spa_errlist_scrub, 1152168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 1153168404Spjd offsetof(spa_error_entry_t, se_avl)); 1154168404Spjd avl_create(&spa->spa_errlist_last, 1155168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 1156168404Spjd offsetof(spa_error_entry_t, se_avl)); 1157168404Spjd} 1158168404Spjd 1159168404Spjd/* 1160168404Spjd * Opposite of spa_activate(). 1161168404Spjd */ 1162168404Spjdstatic void 1163168404Spjdspa_deactivate(spa_t *spa) 1164168404Spjd{ 1165168404Spjd ASSERT(spa->spa_sync_on == B_FALSE); 1166168404Spjd ASSERT(spa->spa_dsl_pool == NULL); 1167168404Spjd ASSERT(spa->spa_root_vdev == NULL); 1168209962Smm ASSERT(spa->spa_async_zio_root == NULL); 1169168404Spjd ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1170168404Spjd 1171240868Spjd /* 1172240868Spjd * Stop TRIM thread in case spa_unload() wasn't called directly 1173240868Spjd * before spa_deactivate(). 
1174240868Spjd */ 1175240868Spjd trim_thread_destroy(spa); 1176240868Spjd 1177286575Smav spa_evicting_os_wait(spa); 1178286575Smav 1179168404Spjd txg_list_destroy(&spa->spa_vdev_txg_list); 1180168404Spjd 1181185029Spjd list_destroy(&spa->spa_config_dirty_list); 1182286575Smav list_destroy(&spa->spa_evicting_os_list); 1183185029Spjd list_destroy(&spa->spa_state_dirty_list); 1184168404Spjd 1185185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 1186185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1187258631Savg spa_taskqs_fini(spa, t, q); 1188185029Spjd } 1189168404Spjd } 1190168404Spjd 1191168404Spjd metaslab_class_destroy(spa->spa_normal_class); 1192168404Spjd spa->spa_normal_class = NULL; 1193168404Spjd 1194185029Spjd metaslab_class_destroy(spa->spa_log_class); 1195185029Spjd spa->spa_log_class = NULL; 1196185029Spjd 1197168404Spjd /* 1198168404Spjd * If this was part of an import or the open otherwise failed, we may 1199168404Spjd * still have errors left in the queues. Empty them just in case. 1200168404Spjd */ 1201168404Spjd spa_errlog_drain(spa); 1202168404Spjd 1203168404Spjd avl_destroy(&spa->spa_errlist_scrub); 1204168404Spjd avl_destroy(&spa->spa_errlist_last); 1205168404Spjd 1206168404Spjd spa->spa_state = POOL_STATE_UNINITIALIZED; 1207219089Spjd 1208219089Spjd mutex_enter(&spa->spa_proc_lock); 1209219089Spjd if (spa->spa_proc_state != SPA_PROC_NONE) { 1210219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1211219089Spjd spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1212219089Spjd cv_broadcast(&spa->spa_proc_cv); 1213219089Spjd while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1214219089Spjd ASSERT(spa->spa_proc != &p0); 1215219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1216219089Spjd } 1217219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1218219089Spjd spa->spa_proc_state = SPA_PROC_NONE; 1219219089Spjd } 1220219089Spjd ASSERT(spa->spa_proc == &p0); 1221219089Spjd mutex_exit(&spa->spa_proc_lock); 1222219089Spjd 1223219089Spjd#ifdef SPA_PROCESS 1224219089Spjd /* 1225219089Spjd * We want to make sure spa_thread() has actually exited the ZFS 1226219089Spjd * module, so that the module can't be unloaded out from underneath 1227219089Spjd * it. 1228219089Spjd */ 1229219089Spjd if (spa->spa_did != 0) { 1230219089Spjd thread_join(spa->spa_did); 1231219089Spjd spa->spa_did = 0; 1232219089Spjd } 1233219089Spjd#endif /* SPA_PROCESS */ 1234168404Spjd} 1235168404Spjd 1236168404Spjd/* 1237168404Spjd * Verify a pool configuration, and construct the vdev tree appropriately. This 1238168404Spjd * will create all the necessary vdevs in the appropriate layout, with each vdev 1239168404Spjd * in the CLOSED state. This will prep the pool before open/creation/import. 1240168404Spjd * All vdev validation is done by the vdev_alloc() routine. 
1241168404Spjd */ 1242168404Spjdstatic int 1243168404Spjdspa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1244168404Spjd uint_t id, int atype) 1245168404Spjd{ 1246168404Spjd nvlist_t **child; 1247219089Spjd uint_t children; 1248168404Spjd int error; 1249168404Spjd 1250168404Spjd if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1251168404Spjd return (error); 1252168404Spjd 1253168404Spjd if ((*vdp)->vdev_ops->vdev_op_leaf) 1254168404Spjd return (0); 1255168404Spjd 1256185029Spjd error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1257185029Spjd &child, &children); 1258185029Spjd 1259185029Spjd if (error == ENOENT) 1260185029Spjd return (0); 1261185029Spjd 1262185029Spjd if (error) { 1263168404Spjd vdev_free(*vdp); 1264168404Spjd *vdp = NULL; 1265249195Smm return (SET_ERROR(EINVAL)); 1266168404Spjd } 1267168404Spjd 1268219089Spjd for (int c = 0; c < children; c++) { 1269168404Spjd vdev_t *vd; 1270168404Spjd if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1271168404Spjd atype)) != 0) { 1272168404Spjd vdev_free(*vdp); 1273168404Spjd *vdp = NULL; 1274168404Spjd return (error); 1275168404Spjd } 1276168404Spjd } 1277168404Spjd 1278168404Spjd ASSERT(*vdp != NULL); 1279168404Spjd 1280168404Spjd return (0); 1281168404Spjd} 1282168404Spjd 1283168404Spjd/* 1284168404Spjd * Opposite of spa_load(). 1285168404Spjd */ 1286168404Spjdstatic void 1287168404Spjdspa_unload(spa_t *spa) 1288168404Spjd{ 1289168404Spjd int i; 1290168404Spjd 1291185029Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1292185029Spjd 1293168404Spjd /* 1294240868Spjd * Stop TRIM thread. 1295240868Spjd */ 1296240868Spjd trim_thread_destroy(spa); 1297240868Spjd 1298240868Spjd /* 1299168404Spjd * Stop async tasks. 1300168404Spjd */ 1301168404Spjd spa_async_suspend(spa); 1302168404Spjd 1303168404Spjd /* 1304168404Spjd * Stop syncing. 1305168404Spjd */ 1306168404Spjd if (spa->spa_sync_on) { 1307168404Spjd txg_sync_stop(spa->spa_dsl_pool); 1308168404Spjd spa->spa_sync_on = B_FALSE; 1309168404Spjd } 1310168404Spjd 1311168404Spjd /* 1312321529Smav * Even though vdev_free() also calls vdev_metaslab_fini, we need 1313321529Smav * to call it earlier, before we wait for async i/o to complete. 1314321529Smav * This ensures that there is no async metaslab prefetching, by 1315321529Smav * calling taskq_wait(mg_taskq). 1316321529Smav */ 1317321529Smav if (spa->spa_root_vdev != NULL) { 1318321529Smav spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1319321529Smav for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) 1320321529Smav vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); 1321321529Smav spa_config_exit(spa, SCL_ALL, FTAG); 1322321529Smav } 1323321529Smav 1324321529Smav /* 1325185029Spjd * Wait for any outstanding async I/O to complete. 1326168404Spjd */ 1327209962Smm if (spa->spa_async_zio_root != NULL) { 1328272598Sdelphij for (int i = 0; i < max_ncpus; i++) 1329272598Sdelphij (void) zio_wait(spa->spa_async_zio_root[i]); 1330272598Sdelphij kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 1331209962Smm spa->spa_async_zio_root = NULL; 1332209962Smm } 1333168404Spjd 1334219089Spjd bpobj_close(&spa->spa_deferred_bpobj); 1335219089Spjd 1336258717Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1337258717Savg 1338168404Spjd /* 1339258717Savg * Close all vdevs. 
1340258717Savg */ 1341258717Savg if (spa->spa_root_vdev) 1342258717Savg vdev_free(spa->spa_root_vdev); 1343258717Savg ASSERT(spa->spa_root_vdev == NULL); 1344258717Savg 1345258717Savg /* 1346168404Spjd * Close the dsl pool. 1347168404Spjd */ 1348168404Spjd if (spa->spa_dsl_pool) { 1349168404Spjd dsl_pool_close(spa->spa_dsl_pool); 1350168404Spjd spa->spa_dsl_pool = NULL; 1351219089Spjd spa->spa_meta_objset = NULL; 1352168404Spjd } 1353168404Spjd 1354219089Spjd ddt_unload(spa); 1355219089Spjd 1356168404Spjd /* 1357209962Smm * Drop and purge level 2 cache 1358209962Smm */ 1359209962Smm spa_l2cache_drop(spa); 1360209962Smm 1361185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1362185029Spjd vdev_free(spa->spa_spares.sav_vdevs[i]); 1363185029Spjd if (spa->spa_spares.sav_vdevs) { 1364185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1365185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1366185029Spjd spa->spa_spares.sav_vdevs = NULL; 1367168404Spjd } 1368185029Spjd if (spa->spa_spares.sav_config) { 1369185029Spjd nvlist_free(spa->spa_spares.sav_config); 1370185029Spjd spa->spa_spares.sav_config = NULL; 1371168404Spjd } 1372185029Spjd spa->spa_spares.sav_count = 0; 1373168404Spjd 1374230514Smm for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 1375230514Smm vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1376185029Spjd vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1377230514Smm } 1378185029Spjd if (spa->spa_l2cache.sav_vdevs) { 1379185029Spjd kmem_free(spa->spa_l2cache.sav_vdevs, 1380185029Spjd spa->spa_l2cache.sav_count * sizeof (void *)); 1381185029Spjd spa->spa_l2cache.sav_vdevs = NULL; 1382185029Spjd } 1383185029Spjd if (spa->spa_l2cache.sav_config) { 1384185029Spjd nvlist_free(spa->spa_l2cache.sav_config); 1385185029Spjd spa->spa_l2cache.sav_config = NULL; 1386185029Spjd } 1387185029Spjd spa->spa_l2cache.sav_count = 0; 1388185029Spjd 1389168404Spjd spa->spa_async_suspended = 0; 1390209962Smm 1391228103Smm if (spa->spa_comment != NULL) { 1392228103Smm spa_strfree(spa->spa_comment); 1393228103Smm spa->spa_comment = NULL; 1394228103Smm } 1395228103Smm 1396209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 1397168404Spjd} 1398168404Spjd 1399168404Spjd/* 1400168404Spjd * Load (or re-load) the current list of vdevs describing the active spares for 1401168404Spjd * this pool. When this is called, we have some form of basic information in 1402185029Spjd * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1403185029Spjd * then re-generate a more complete list including status information. 1404168404Spjd */ 1405168404Spjdstatic void 1406168404Spjdspa_load_spares(spa_t *spa) 1407168404Spjd{ 1408168404Spjd nvlist_t **spares; 1409168404Spjd uint_t nspares; 1410168404Spjd int i; 1411168404Spjd vdev_t *vd, *tvd; 1412168404Spjd 1413185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1414185029Spjd 1415168404Spjd /* 1416168404Spjd * First, close and free any existing spare vdevs. 
1417168404Spjd */ 1418185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1419185029Spjd vd = spa->spa_spares.sav_vdevs[i]; 1420168404Spjd 1421168404Spjd /* Undo the call to spa_activate() below */ 1422185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1423185029Spjd B_FALSE)) != NULL && tvd->vdev_isspare) 1424168404Spjd spa_spare_remove(tvd); 1425168404Spjd vdev_close(vd); 1426168404Spjd vdev_free(vd); 1427168404Spjd } 1428168404Spjd 1429185029Spjd if (spa->spa_spares.sav_vdevs) 1430185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1431185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1432168404Spjd 1433185029Spjd if (spa->spa_spares.sav_config == NULL) 1434168404Spjd nspares = 0; 1435168404Spjd else 1436185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1437168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1438168404Spjd 1439185029Spjd spa->spa_spares.sav_count = (int)nspares; 1440185029Spjd spa->spa_spares.sav_vdevs = NULL; 1441168404Spjd 1442168404Spjd if (nspares == 0) 1443168404Spjd return; 1444168404Spjd 1445168404Spjd /* 1446168404Spjd * Construct the array of vdevs, opening them to get status in the 1447168404Spjd * process. For each spare, there is potentially two different vdev_t 1448168404Spjd * structures associated with it: one in the list of spares (used only 1449168404Spjd * for basic validation purposes) and one in the active vdev 1450168404Spjd * configuration (if it's spared in). During this phase we open and 1451168404Spjd * validate each vdev on the spare list. If the vdev also exists in the 1452168404Spjd * active configuration, then we also mark this vdev as an active spare. 1453168404Spjd */ 1454185029Spjd spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1455185029Spjd KM_SLEEP); 1456185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1457168404Spjd VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1458168404Spjd VDEV_ALLOC_SPARE) == 0); 1459168404Spjd ASSERT(vd != NULL); 1460168404Spjd 1461185029Spjd spa->spa_spares.sav_vdevs[i] = vd; 1462168404Spjd 1463185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1464185029Spjd B_FALSE)) != NULL) { 1465168404Spjd if (!tvd->vdev_isspare) 1466168404Spjd spa_spare_add(tvd); 1467168404Spjd 1468168404Spjd /* 1469168404Spjd * We only mark the spare active if we were successfully 1470168404Spjd * able to load the vdev. Otherwise, importing a pool 1471168404Spjd * with a bad active spare would result in strange 1472168404Spjd * behavior, because multiple pool would think the spare 1473168404Spjd * is actively in use. 1474168404Spjd * 1475168404Spjd * There is a vulnerability here to an equally bizarre 1476168404Spjd * circumstance, where a dead active spare is later 1477168404Spjd * brought back to life (onlined or otherwise). Given 1478168404Spjd * the rarity of this scenario, and the extra complexity 1479168404Spjd * it adds, we ignore the possibility. 1480168404Spjd */ 1481168404Spjd if (!vdev_is_dead(tvd)) 1482168404Spjd spa_spare_activate(tvd); 1483168404Spjd } 1484168404Spjd 1485185029Spjd vd->vdev_top = vd; 1486209962Smm vd->vdev_aux = &spa->spa_spares; 1487185029Spjd 1488168404Spjd if (vdev_open(vd) != 0) 1489168404Spjd continue; 1490168404Spjd 1491185029Spjd if (vdev_validate_aux(vd) == 0) 1492185029Spjd spa_spare_add(vd); 1493168404Spjd } 1494168404Spjd 1495168404Spjd /* 1496168404Spjd * Recompute the stashed list of spares, with status information 1497168404Spjd * this time. 
1498168404Spjd */ 1499185029Spjd VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1500168404Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1501168404Spjd 1502185029Spjd spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1503185029Spjd KM_SLEEP); 1504185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1505185029Spjd spares[i] = vdev_config_generate(spa, 1506219089Spjd spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1507185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1508185029Spjd ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1509185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1510168404Spjd nvlist_free(spares[i]); 1511185029Spjd kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1512168404Spjd} 1513168404Spjd 1514185029Spjd/* 1515185029Spjd * Load (or re-load) the current list of vdevs describing the active l2cache for 1516185029Spjd * this pool. When this is called, we have some form of basic information in 1517185029Spjd * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1518185029Spjd * then re-generate a more complete list including status information. 1519185029Spjd * Devices which are already active have their details maintained, and are 1520185029Spjd * not re-opened. 1521185029Spjd */ 1522185029Spjdstatic void 1523185029Spjdspa_load_l2cache(spa_t *spa) 1524185029Spjd{ 1525185029Spjd nvlist_t **l2cache; 1526185029Spjd uint_t nl2cache; 1527185029Spjd int i, j, oldnvdevs; 1528219089Spjd uint64_t guid; 1529185029Spjd vdev_t *vd, **oldvdevs, **newvdevs; 1530185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 1531185029Spjd 1532185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1533185029Spjd 1534185029Spjd if (sav->sav_config != NULL) { 1535185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1536185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1537185029Spjd newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1538185029Spjd } else { 1539185029Spjd nl2cache = 0; 1540247187Smm newvdevs = NULL; 1541185029Spjd } 1542185029Spjd 1543185029Spjd oldvdevs = sav->sav_vdevs; 1544185029Spjd oldnvdevs = sav->sav_count; 1545185029Spjd sav->sav_vdevs = NULL; 1546185029Spjd sav->sav_count = 0; 1547185029Spjd 1548185029Spjd /* 1549185029Spjd * Process new nvlist of vdevs. 1550185029Spjd */ 1551185029Spjd for (i = 0; i < nl2cache; i++) { 1552185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1553185029Spjd &guid) == 0); 1554185029Spjd 1555185029Spjd newvdevs[i] = NULL; 1556185029Spjd for (j = 0; j < oldnvdevs; j++) { 1557185029Spjd vd = oldvdevs[j]; 1558185029Spjd if (vd != NULL && guid == vd->vdev_guid) { 1559185029Spjd /* 1560185029Spjd * Retain previous vdev for add/remove ops. 1561185029Spjd */ 1562185029Spjd newvdevs[i] = vd; 1563185029Spjd oldvdevs[j] = NULL; 1564185029Spjd break; 1565185029Spjd } 1566185029Spjd } 1567185029Spjd 1568185029Spjd if (newvdevs[i] == NULL) { 1569185029Spjd /* 1570185029Spjd * Create new vdev 1571185029Spjd */ 1572185029Spjd VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1573185029Spjd VDEV_ALLOC_L2CACHE) == 0); 1574185029Spjd ASSERT(vd != NULL); 1575185029Spjd newvdevs[i] = vd; 1576185029Spjd 1577185029Spjd /* 1578185029Spjd * Commit this vdev as an l2cache device, 1579185029Spjd * even if it fails to open. 
1580185029Spjd */ 1581185029Spjd spa_l2cache_add(vd); 1582185029Spjd 1583185029Spjd vd->vdev_top = vd; 1584185029Spjd vd->vdev_aux = sav; 1585185029Spjd 1586185029Spjd spa_l2cache_activate(vd); 1587185029Spjd 1588185029Spjd if (vdev_open(vd) != 0) 1589185029Spjd continue; 1590185029Spjd 1591185029Spjd (void) vdev_validate_aux(vd); 1592185029Spjd 1593219089Spjd if (!vdev_is_dead(vd)) 1594219089Spjd l2arc_add_vdev(spa, vd); 1595185029Spjd } 1596185029Spjd } 1597185029Spjd 1598185029Spjd /* 1599185029Spjd * Purge vdevs that were dropped 1600185029Spjd */ 1601185029Spjd for (i = 0; i < oldnvdevs; i++) { 1602185029Spjd uint64_t pool; 1603185029Spjd 1604185029Spjd vd = oldvdevs[i]; 1605185029Spjd if (vd != NULL) { 1606230514Smm ASSERT(vd->vdev_isl2cache); 1607230514Smm 1608209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1609209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1610185029Spjd l2arc_remove_vdev(vd); 1611230514Smm vdev_clear_stats(vd); 1612230514Smm vdev_free(vd); 1613185029Spjd } 1614185029Spjd } 1615185029Spjd 1616185029Spjd if (oldvdevs) 1617185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1618185029Spjd 1619185029Spjd if (sav->sav_config == NULL) 1620185029Spjd goto out; 1621185029Spjd 1622185029Spjd sav->sav_vdevs = newvdevs; 1623185029Spjd sav->sav_count = (int)nl2cache; 1624185029Spjd 1625185029Spjd /* 1626185029Spjd * Recompute the stashed list of l2cache devices, with status 1627185029Spjd * information this time. 1628185029Spjd */ 1629185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1630185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1631185029Spjd 1632185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1633185029Spjd for (i = 0; i < sav->sav_count; i++) 1634185029Spjd l2cache[i] = vdev_config_generate(spa, 1635219089Spjd sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1636185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1637185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1638185029Spjdout: 1639185029Spjd for (i = 0; i < sav->sav_count; i++) 1640185029Spjd nvlist_free(l2cache[i]); 1641185029Spjd if (sav->sav_count) 1642185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1643185029Spjd} 1644185029Spjd 1645168404Spjdstatic int 1646168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1647168404Spjd{ 1648168404Spjd dmu_buf_t *db; 1649168404Spjd char *packed = NULL; 1650168404Spjd size_t nvsize = 0; 1651168404Spjd int error; 1652168404Spjd *value = NULL; 1653168404Spjd 1654262676Sdelphij error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 1655262676Sdelphij if (error != 0) 1656262676Sdelphij return (error); 1657287744Sdelphij 1658168404Spjd nvsize = *(uint64_t *)db->db_data; 1659168404Spjd dmu_buf_rele(db, FTAG); 1660168404Spjd 1661168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1662209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1663209962Smm DMU_READ_PREFETCH); 1664168404Spjd if (error == 0) 1665168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1666168404Spjd kmem_free(packed, nvsize); 1667168404Spjd 1668168404Spjd return (error); 1669168404Spjd} 1670168404Spjd 1671168404Spjd/* 1672185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1673185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 
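 * The check recurses over all children; only unopenable leaf vdevs
 * (holes excluded) trigger the autoreplace posting and the
 * ESC_ZFS_VDEV_CHECK event.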
1674185029Spjd */ 1675185029Spjdstatic void 1676185029Spjdspa_check_removed(vdev_t *vd) 1677185029Spjd{ 1678219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1679185029Spjd spa_check_removed(vd->vdev_child[c]); 1680185029Spjd 1681249188Smm if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 1682249188Smm !vd->vdev_ishole) { 1683185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1684331397Smav spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 1685185029Spjd } 1686185029Spjd} 1687185029Spjd 1688299441Smavstatic void 1689299441Smavspa_config_valid_zaps(vdev_t *vd, vdev_t *mvd) 1690299441Smav{ 1691299441Smav ASSERT3U(vd->vdev_children, ==, mvd->vdev_children); 1692299441Smav 1693299441Smav vd->vdev_top_zap = mvd->vdev_top_zap; 1694299441Smav vd->vdev_leaf_zap = mvd->vdev_leaf_zap; 1695299441Smav 1696299441Smav for (uint64_t i = 0; i < vd->vdev_children; i++) { 1697299441Smav spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]); 1698299441Smav } 1699299441Smav} 1700299441Smav 1701185029Spjd/* 1702219089Spjd * Validate the current config against the MOS config 1703213197Smm */ 1704219089Spjdstatic boolean_t 1705219089Spjdspa_config_valid(spa_t *spa, nvlist_t *config) 1706213197Smm{ 1707219089Spjd vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1708219089Spjd nvlist_t *nv; 1709213197Smm 1710219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1711213197Smm 1712219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1713219089Spjd VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1714219089Spjd 1715219089Spjd ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1716219089Spjd 1717219089Spjd /* 1718219089Spjd * If we're doing a normal import, then build up any additional 1719219089Spjd * diagnostic information about missing devices in this config. 1720219089Spjd * We'll pass this up to the user for further processing. 1721219089Spjd */ 1722219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1723219089Spjd nvlist_t **child, *nv; 1724219089Spjd uint64_t idx = 0; 1725219089Spjd 1726219089Spjd child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1727219089Spjd KM_SLEEP); 1728219089Spjd VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1729219089Spjd 1730219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1731219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1732219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1733219089Spjd 1734219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1735219089Spjd mtvd->vdev_ops != &vdev_missing_ops && 1736219089Spjd mtvd->vdev_islog) 1737219089Spjd child[idx++] = vdev_config_generate(spa, mtvd, 1738219089Spjd B_FALSE, 0); 1739219089Spjd } 1740219089Spjd 1741219089Spjd if (idx) { 1742219089Spjd VERIFY(nvlist_add_nvlist_array(nv, 1743219089Spjd ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1744219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1745219089Spjd ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1746219089Spjd 1747219089Spjd for (int i = 0; i < idx; i++) 1748219089Spjd nvlist_free(child[i]); 1749219089Spjd } 1750219089Spjd nvlist_free(nv); 1751219089Spjd kmem_free(child, rvd->vdev_children * sizeof (char **)); 1752219089Spjd } 1753219089Spjd 1754219089Spjd /* 1755219089Spjd * Compare the root vdev tree with the information we have 1756219089Spjd * from the MOS config (mrvd). Check each top-level vdev 1757219089Spjd * with the corresponding MOS config top-level (mtvd). 
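	 * A top-level vdev that is missing from the cached config but is
	 * a log device according to the MOS is swapped for its MOS copy;
	 * missing data vdevs are left alone, since their contents cannot
	 * be reconstructed from the MOS config.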
1758219089Spjd */ 1759219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1760213197Smm vdev_t *tvd = rvd->vdev_child[c]; 1761219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1762213197Smm 1763219089Spjd /* 1764219089Spjd * Resolve any "missing" vdevs in the current configuration. 1765219089Spjd * If we find that the MOS config has more accurate information 1766219089Spjd * about the top-level vdev then use that vdev instead. 1767219089Spjd */ 1768219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1769219089Spjd mtvd->vdev_ops != &vdev_missing_ops) { 1770219089Spjd 1771219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1772219089Spjd continue; 1773219089Spjd 1774219089Spjd /* 1775219089Spjd * Device specific actions. 1776219089Spjd */ 1777219089Spjd if (mtvd->vdev_islog) { 1778219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 1779219089Spjd } else { 1780219089Spjd /* 1781219089Spjd * XXX - once we have 'readonly' pool 1782219089Spjd * support we should be able to handle 1783219089Spjd * missing data devices by transitioning 1784219089Spjd * the pool to readonly. 1785219089Spjd */ 1786219089Spjd continue; 1787219089Spjd } 1788219089Spjd 1789219089Spjd /* 1790219089Spjd * Swap the missing vdev with the data we were 1791219089Spjd * able to obtain from the MOS config. 1792219089Spjd */ 1793219089Spjd vdev_remove_child(rvd, tvd); 1794219089Spjd vdev_remove_child(mrvd, mtvd); 1795219089Spjd 1796219089Spjd vdev_add_child(rvd, mtvd); 1797219089Spjd vdev_add_child(mrvd, tvd); 1798219089Spjd 1799219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1800219089Spjd vdev_load(mtvd); 1801219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1802219089Spjd 1803219089Spjd vdev_reopen(rvd); 1804299441Smav } else { 1805299441Smav if (mtvd->vdev_islog) { 1806299441Smav /* 1807299441Smav * Load the slog device's state from the MOS 1808299441Smav * config since it's possible that the label 1809299441Smav * does not contain the most up-to-date 1810299441Smav * information. 1811299441Smav */ 1812299441Smav vdev_load_log_state(tvd, mtvd); 1813299441Smav vdev_reopen(tvd); 1814299441Smav } 1815299441Smav 1816219089Spjd /* 1817299441Smav * Per-vdev ZAP info is stored exclusively in the MOS. 1818219089Spjd */ 1819299441Smav spa_config_valid_zaps(tvd, mtvd); 1820219089Spjd } 1821213197Smm } 1822299441Smav 1823219089Spjd vdev_free(mrvd); 1824219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1825219089Spjd 1826219089Spjd /* 1827219089Spjd * Ensure we were able to validate the config. 
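	 * The config is considered valid only if the sum of all vdev guids
	 * in the tree matches the guid sum recorded in the uberblock,
	 * i.e. rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum.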
1828219089Spjd */ 1829219089Spjd return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1830213197Smm} 1831213197Smm 1832213197Smm/* 1833185029Spjd * Check for missing log devices 1834185029Spjd */ 1835248571Smmstatic boolean_t 1836185029Spjdspa_check_logs(spa_t *spa) 1837185029Spjd{ 1838248571Smm boolean_t rv = B_FALSE; 1839286686Smav dsl_pool_t *dp = spa_get_dsl(spa); 1840248571Smm 1841185029Spjd switch (spa->spa_log_state) { 1842185029Spjd case SPA_LOG_MISSING: 1843185029Spjd /* need to recheck in case slog has been restored */ 1844185029Spjd case SPA_LOG_UNKNOWN: 1845286686Smav rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 1846286686Smav zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 1847248571Smm if (rv) 1848219089Spjd spa_set_log_state(spa, SPA_LOG_MISSING); 1849185029Spjd break; 1850185029Spjd } 1851248571Smm return (rv); 1852185029Spjd} 1853185029Spjd 1854219089Spjdstatic boolean_t 1855219089Spjdspa_passivate_log(spa_t *spa) 1856219089Spjd{ 1857219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1858219089Spjd boolean_t slog_found = B_FALSE; 1859219089Spjd 1860219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1861219089Spjd 1862219089Spjd if (!spa_has_slogs(spa)) 1863219089Spjd return (B_FALSE); 1864219089Spjd 1865219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1866219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1867219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1868219089Spjd 1869219089Spjd if (tvd->vdev_islog) { 1870219089Spjd metaslab_group_passivate(mg); 1871219089Spjd slog_found = B_TRUE; 1872219089Spjd } 1873219089Spjd } 1874219089Spjd 1875219089Spjd return (slog_found); 1876219089Spjd} 1877219089Spjd 1878219089Spjdstatic void 1879219089Spjdspa_activate_log(spa_t *spa) 1880219089Spjd{ 1881219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1882219089Spjd 1883219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1884219089Spjd 1885219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1886219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1887219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1888219089Spjd 1889219089Spjd if (tvd->vdev_islog) 1890219089Spjd metaslab_group_activate(mg); 1891219089Spjd } 1892219089Spjd} 1893219089Spjd 1894219089Spjdint 1895219089Spjdspa_offline_log(spa_t *spa) 1896219089Spjd{ 1897248571Smm int error; 1898219089Spjd 1899248571Smm error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1900248571Smm NULL, DS_FIND_CHILDREN); 1901248571Smm if (error == 0) { 1902219089Spjd /* 1903219089Spjd * We successfully offlined the log device, sync out the 1904219089Spjd * current txg so that the "stubby" block can be removed 1905219089Spjd * by zil_sync(). 
1906219089Spjd */ 1907219089Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1908219089Spjd } 1909219089Spjd return (error); 1910219089Spjd} 1911219089Spjd 1912219089Spjdstatic void 1913219089Spjdspa_aux_check_removed(spa_aux_vdev_t *sav) 1914219089Spjd{ 1915219089Spjd int i; 1916219089Spjd 1917219089Spjd for (i = 0; i < sav->sav_count; i++) 1918219089Spjd spa_check_removed(sav->sav_vdevs[i]); 1919219089Spjd} 1920219089Spjd 1921219089Spjdvoid 1922219089Spjdspa_claim_notify(zio_t *zio) 1923219089Spjd{ 1924219089Spjd spa_t *spa = zio->io_spa; 1925219089Spjd 1926219089Spjd if (zio->io_error) 1927219089Spjd return; 1928219089Spjd 1929219089Spjd mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1930219089Spjd if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1931219089Spjd spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1932219089Spjd mutex_exit(&spa->spa_props_lock); 1933219089Spjd} 1934219089Spjd 1935219089Spjdtypedef struct spa_load_error { 1936219089Spjd uint64_t sle_meta_count; 1937219089Spjd uint64_t sle_data_count; 1938219089Spjd} spa_load_error_t; 1939219089Spjd 1940219089Spjdstatic void 1941219089Spjdspa_load_verify_done(zio_t *zio) 1942219089Spjd{ 1943219089Spjd blkptr_t *bp = zio->io_bp; 1944219089Spjd spa_load_error_t *sle = zio->io_private; 1945219089Spjd dmu_object_type_t type = BP_GET_TYPE(bp); 1946219089Spjd int error = zio->io_error; 1947268720Sdelphij spa_t *spa = zio->io_spa; 1948219089Spjd 1949321610Smav abd_free(zio->io_abd); 1950219089Spjd if (error) { 1951236884Smm if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 1952219089Spjd type != DMU_OT_INTENT_LOG) 1953270247Sdelphij atomic_inc_64(&sle->sle_meta_count); 1954219089Spjd else 1955270247Sdelphij atomic_inc_64(&sle->sle_data_count); 1956219089Spjd } 1957268720Sdelphij 1958268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 1959268720Sdelphij spa->spa_scrub_inflight--; 1960268720Sdelphij cv_broadcast(&spa->spa_scrub_io_cv); 1961268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 1962219089Spjd} 1963219089Spjd 1964268720Sdelphij/* 1965268720Sdelphij * Maximum number of concurrent scrub i/os to create while verifying 1966268720Sdelphij * a pool while importing it. 
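 * The limit throttles the speculative reads issued from
 * spa_load_verify_cb() and can be tuned through the sysctl defined
 * below, e.g.:
 *	sysctl vfs.zfs.spa_load_verify_maxinflight=20000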
1967268720Sdelphij */ 1968268720Sdelphijint spa_load_verify_maxinflight = 10000; 1969268720Sdelphijboolean_t spa_load_verify_metadata = B_TRUE; 1970268720Sdelphijboolean_t spa_load_verify_data = B_TRUE; 1971268720Sdelphij 1972268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, 1973268720Sdelphij &spa_load_verify_maxinflight, 0, 1974268720Sdelphij "Maximum number of concurrent scrub I/Os to create while verifying a " 1975268720Sdelphij "pool while importing it"); 1976268720Sdelphij 1977268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, 1978268720Sdelphij &spa_load_verify_metadata, 0, 1979268720Sdelphij "Check metadata on import?"); 1980268720Sdelphij 1981268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, 1982268720Sdelphij &spa_load_verify_data, 0, 1983268720Sdelphij "Check user data on import?"); 1984268720Sdelphij 1985219089Spjd/*ARGSUSED*/ 1986219089Spjdstatic int 1987219089Spjdspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1988268123Sdelphij const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 1989219089Spjd{ 1990286705Smav if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 1991268720Sdelphij return (0); 1992268720Sdelphij /* 1993268720Sdelphij * Note: normally this routine will not be called if 1994268720Sdelphij * spa_load_verify_metadata is not set. However, it may be useful 1995268720Sdelphij * to manually set the flag after the traversal has begun. 1996268720Sdelphij */ 1997268720Sdelphij if (!spa_load_verify_metadata) 1998268720Sdelphij return (0); 1999321610Smav if (!BP_IS_METADATA(bp) && !spa_load_verify_data) 2000268720Sdelphij return (0); 2001219089Spjd 2002268720Sdelphij zio_t *rio = arg; 2003268720Sdelphij size_t size = BP_GET_PSIZE(bp); 2004268720Sdelphij 2005268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 2006268720Sdelphij while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) 2007268720Sdelphij cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2008268720Sdelphij spa->spa_scrub_inflight++; 2009268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 2010268720Sdelphij 2011321610Smav zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2012268720Sdelphij spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2013268720Sdelphij ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2014268720Sdelphij ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2015219089Spjd return (0); 2016219089Spjd} 2017219089Spjd 2018307045Smav/* ARGSUSED */ 2019307045Smavint 2020307045Smavverify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2021307045Smav{ 2022307108Smav if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2023307045Smav return (SET_ERROR(ENAMETOOLONG)); 2024307045Smav 2025307045Smav return (0); 2026307045Smav} 2027307045Smav 2028219089Spjdstatic int 2029219089Spjdspa_load_verify(spa_t *spa) 2030219089Spjd{ 2031219089Spjd zio_t *rio; 2032219089Spjd spa_load_error_t sle = { 0 }; 2033219089Spjd zpool_rewind_policy_t policy; 2034219089Spjd boolean_t verify_ok = B_FALSE; 2035268720Sdelphij int error = 0; 2036219089Spjd 2037219089Spjd zpool_get_rewind_policy(spa->spa_config, &policy); 2038219089Spjd 2039219089Spjd if (policy.zrp_request & ZPOOL_NEVER_REWIND) 2040219089Spjd return (0); 2041219089Spjd 2042307045Smav dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2043307045Smav error = dmu_objset_find_dp(spa->spa_dsl_pool, 2044307045Smav spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2045307045Smav 
DS_FIND_CHILDREN); 2046307045Smav dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2047307045Smav if (error != 0) 2048307045Smav return (error); 2049307045Smav 2050219089Spjd rio = zio_root(spa, NULL, &sle, 2051219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2052219089Spjd 2053268720Sdelphij if (spa_load_verify_metadata) { 2054268720Sdelphij error = traverse_pool(spa, spa->spa_verify_min_txg, 2055268720Sdelphij TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, 2056268720Sdelphij spa_load_verify_cb, rio); 2057268720Sdelphij } 2058219089Spjd 2059219089Spjd (void) zio_wait(rio); 2060219089Spjd 2061219089Spjd spa->spa_load_meta_errors = sle.sle_meta_count; 2062219089Spjd spa->spa_load_data_errors = sle.sle_data_count; 2063219089Spjd 2064219089Spjd if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 2065219089Spjd sle.sle_data_count <= policy.zrp_maxdata) { 2066219089Spjd int64_t loss = 0; 2067219089Spjd 2068219089Spjd verify_ok = B_TRUE; 2069219089Spjd spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2070219089Spjd spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2071219089Spjd 2072219089Spjd loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2073219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 2074219089Spjd ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 2075219089Spjd VERIFY(nvlist_add_int64(spa->spa_load_info, 2076219089Spjd ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 2077219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 2078219089Spjd ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 2079219089Spjd } else { 2080219089Spjd spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2081219089Spjd } 2082219089Spjd 2083219089Spjd if (error) { 2084219089Spjd if (error != ENXIO && error != EIO) 2085249195Smm error = SET_ERROR(EIO); 2086219089Spjd return (error); 2087219089Spjd } 2088219089Spjd 2089219089Spjd return (verify_ok ? 0 : EIO); 2090219089Spjd} 2091219089Spjd 2092185029Spjd/* 2093219089Spjd * Find a value in the pool props object. 2094168404Spjd */ 2095219089Spjdstatic void 2096219089Spjdspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2097219089Spjd{ 2098219089Spjd (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2099219089Spjd zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2100219089Spjd} 2101219089Spjd 2102219089Spjd/* 2103219089Spjd * Find a value in the pool directory object. 2104219089Spjd */ 2105168404Spjdstatic int 2106219089Spjdspa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 2107168404Spjd{ 2108219089Spjd return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2109219089Spjd name, sizeof (uint64_t), 1, val)); 2110219089Spjd} 2111168404Spjd 2112219089Spjdstatic int 2113219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2114219089Spjd{ 2115219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2116219089Spjd return (err); 2117219089Spjd} 2118219089Spjd 2119219089Spjd/* 2120219089Spjd * Fix up config after a partly-completed split. This is done with the 2121219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2122219089Spjd * pool have that entry in their config, but only the splitting one contains 2123219089Spjd * a list of all the guids of the vdevs that are being split off. 2124219089Spjd * 2125219089Spjd * This function determines what to do with that list: either rejoin 2126219089Spjd * all the disks to the pool, or complete the splitting process. 
To attempt 2127219089Spjd * the rejoin, each disk that is offlined is marked online again, and 2128219089Spjd * we do a reopen() call. If the vdev label for every disk that was 2129219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2130219089Spjd * then we call vdev_split() on each disk, and complete the split. 2131219089Spjd * 2132219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 2133219089Spjd * the original pool. 2134219089Spjd */ 2135219089Spjdstatic void 2136219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 2137219089Spjd{ 2138219089Spjd uint_t extracted; 2139219089Spjd uint64_t *glist; 2140219089Spjd uint_t i, gcount; 2141219089Spjd nvlist_t *nvl; 2142219089Spjd vdev_t **vd; 2143219089Spjd boolean_t attempt_reopen; 2144219089Spjd 2145219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2146219089Spjd return; 2147219089Spjd 2148219089Spjd /* check that the config is complete */ 2149219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2150219089Spjd &glist, &gcount) != 0) 2151219089Spjd return; 2152219089Spjd 2153219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2154219089Spjd 2155219089Spjd /* attempt to online all the vdevs & validate */ 2156219089Spjd attempt_reopen = B_TRUE; 2157219089Spjd for (i = 0; i < gcount; i++) { 2158219089Spjd if (glist[i] == 0) /* vdev is hole */ 2159219089Spjd continue; 2160219089Spjd 2161219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2162219089Spjd if (vd[i] == NULL) { 2163219089Spjd /* 2164219089Spjd * Don't bother attempting to reopen the disks; 2165219089Spjd * just do the split. 2166219089Spjd */ 2167219089Spjd attempt_reopen = B_FALSE; 2168219089Spjd } else { 2169219089Spjd /* attempt to re-online it */ 2170219089Spjd vd[i]->vdev_offline = B_FALSE; 2171219089Spjd } 2172219089Spjd } 2173219089Spjd 2174219089Spjd if (attempt_reopen) { 2175219089Spjd vdev_reopen(spa->spa_root_vdev); 2176219089Spjd 2177219089Spjd /* check each device to see what state it's in */ 2178219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 2179219089Spjd if (vd[i] != NULL && 2180219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2181219089Spjd break; 2182219089Spjd ++extracted; 2183219089Spjd } 2184219089Spjd } 2185219089Spjd 2186209962Smm /* 2187219089Spjd * If every disk has been moved to the new pool, or if we never 2188219089Spjd * even attempted to look at them, then we split them off for 2189219089Spjd * good. 
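	 * If so, vdev_split() is called on each device and the root vdev
	 * is reopened so the remaining configuration reflects the removal.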
2190209962Smm */ 2191219089Spjd if (!attempt_reopen || gcount == extracted) { 2192219089Spjd for (i = 0; i < gcount; i++) 2193219089Spjd if (vd[i] != NULL) 2194219089Spjd vdev_split(vd[i]); 2195219089Spjd vdev_reopen(spa->spa_root_vdev); 2196219089Spjd } 2197209962Smm 2198219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 2199219089Spjd} 2200185029Spjd 2201219089Spjdstatic int 2202219089Spjdspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2203219089Spjd boolean_t mosconfig) 2204219089Spjd{ 2205219089Spjd nvlist_t *config = spa->spa_config; 2206219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 2207228103Smm char *comment; 2208219089Spjd int error; 2209219089Spjd uint64_t pool_guid; 2210219089Spjd nvlist_t *nvl; 2211168404Spjd 2212219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2213249195Smm return (SET_ERROR(EINVAL)); 2214168404Spjd 2215228103Smm ASSERT(spa->spa_comment == NULL); 2216228103Smm if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2217228103Smm spa->spa_comment = spa_strdup(comment); 2218228103Smm 2219168404Spjd /* 2220168404Spjd * Versioning wasn't explicitly added to the label until later, so if 2221168404Spjd * it's not present treat it as the initial version. 2222168404Spjd */ 2223219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2224219089Spjd &spa->spa_ubsync.ub_version) != 0) 2225219089Spjd spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2226168404Spjd 2227168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2228168404Spjd &spa->spa_config_txg); 2229168404Spjd 2230168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2231168404Spjd spa_guid_exists(pool_guid, 0)) { 2232249195Smm error = SET_ERROR(EEXIST); 2233219089Spjd } else { 2234228103Smm spa->spa_config_guid = pool_guid; 2235219089Spjd 2236219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2237219089Spjd &nvl) == 0) { 2238219089Spjd VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2239219089Spjd KM_SLEEP) == 0); 2240219089Spjd } 2241219089Spjd 2242236884Smm nvlist_free(spa->spa_load_info); 2243236884Smm spa->spa_load_info = fnvlist_alloc(); 2244236884Smm 2245219089Spjd gethrestime(&spa->spa_loaded_ts); 2246219089Spjd error = spa_load_impl(spa, pool_guid, config, state, type, 2247219089Spjd mosconfig, &ereport); 2248168404Spjd } 2249168404Spjd 2250286575Smav /* 2251286575Smav * Don't count references from objsets that are already closed 2252286575Smav * and are making their way through the eviction process. 2253286575Smav */ 2254286575Smav spa_evicting_os_wait(spa); 2255219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 2256219089Spjd if (error) { 2257219089Spjd if (error != EEXIST) { 2258219089Spjd spa->spa_loaded_ts.tv_sec = 0; 2259219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 2260219089Spjd } 2261219089Spjd if (error != EBADF) { 2262219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2263219089Spjd } 2264219089Spjd } 2265219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2266219089Spjd spa->spa_ena = 0; 2267168404Spjd 2268219089Spjd return (error); 2269219089Spjd} 2270219089Spjd 2271219089Spjd/* 2272299441Smav * Count the number of per-vdev ZAPs associated with all of the vdevs in the 2273299441Smav * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 2274299441Smav * spa's per-vdev ZAP list. 
2275299441Smav */ 2276299441Smavstatic uint64_t 2277299441Smavvdev_count_verify_zaps(vdev_t *vd) 2278299441Smav{ 2279299441Smav spa_t *spa = vd->vdev_spa; 2280299441Smav uint64_t total = 0; 2281299441Smav if (vd->vdev_top_zap != 0) { 2282299441Smav total++; 2283299441Smav ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2284299441Smav spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 2285299441Smav } 2286299441Smav if (vd->vdev_leaf_zap != 0) { 2287299441Smav total++; 2288299441Smav ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2289299441Smav spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 2290299441Smav } 2291299441Smav 2292299441Smav for (uint64_t i = 0; i < vd->vdev_children; i++) { 2293299441Smav total += vdev_count_verify_zaps(vd->vdev_child[i]); 2294299441Smav } 2295299441Smav 2296299441Smav return (total); 2297299441Smav} 2298299441Smav 2299299441Smav/* 2300219089Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 2301219089Spjd * source of configuration information. 2302219089Spjd */ 2303219089Spjdstatic int 2304219089Spjdspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2305219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2306219089Spjd char **ereport) 2307219089Spjd{ 2308219089Spjd int error = 0; 2309219089Spjd nvlist_t *nvroot = NULL; 2310236884Smm nvlist_t *label; 2311219089Spjd vdev_t *rvd; 2312219089Spjd uberblock_t *ub = &spa->spa_uberblock; 2313219089Spjd uint64_t children, config_cache_txg = spa->spa_config_txg; 2314219089Spjd int orig_mode = spa->spa_mode; 2315219089Spjd int parse; 2316219089Spjd uint64_t obj; 2317236884Smm boolean_t missing_feat_write = B_FALSE; 2318219089Spjd 2319168404Spjd /* 2320219089Spjd * If this is an untrusted config, access the pool in read-only mode. 2321219089Spjd * This prevents things like resilvering recently removed devices. 2322219089Spjd */ 2323219089Spjd if (!mosconfig) 2324219089Spjd spa->spa_mode = FREAD; 2325219089Spjd 2326219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2327219089Spjd 2328219089Spjd spa->spa_load_state = state; 2329219089Spjd 2330219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2331249195Smm return (SET_ERROR(EINVAL)); 2332219089Spjd 2333219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 2334219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2335219089Spjd 2336219089Spjd /* 2337209962Smm * Create "The Godfather" zio to hold all async IOs 2338209962Smm */ 2339272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2340272598Sdelphij KM_SLEEP); 2341272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 2342272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2343272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2344272598Sdelphij ZIO_FLAG_GODFATHER); 2345272598Sdelphij } 2346209962Smm 2347209962Smm /* 2348168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2349168404Spjd * value that will be returned by spa_version() since parsing the 2350168404Spjd * configuration requires knowing the version number. 
2351168404Spjd */ 2352185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2353219089Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2354185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2355168404Spjd 2356168404Spjd if (error != 0) 2357219089Spjd return (error); 2358168404Spjd 2359168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2360284304Savg ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2361284304Savg ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2362168404Spjd 2363219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2364219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2365219089Spjd } 2366219089Spjd 2367168404Spjd /* 2368168404Spjd * Try to open all vdevs, loading each label in the process. 2369168404Spjd */ 2370185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2371168926Spjd error = vdev_open(rvd); 2372185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2373168926Spjd if (error != 0) 2374219089Spjd return (error); 2375168404Spjd 2376168404Spjd /* 2377209962Smm * We need to validate the vdev labels against the configuration that 2378209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 2379209962Smm * mosconfig is true then we're validating the vdev labels based on 2380219089Spjd * that config. Otherwise, we're validating against the cached config 2381209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 2382209962Smm * later we will recursively call spa_load() and validate against 2383209962Smm * the vdev config. 2384219089Spjd * 2385219089Spjd * If we're assembling a new pool that's been split off from an 2386219089Spjd * existing pool, the labels haven't yet been updated so we skip 2387219089Spjd * validation for now. 2388168404Spjd */ 2389219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2390219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2391230514Smm error = vdev_validate(rvd, mosconfig); 2392219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2393168404Spjd 2394219089Spjd if (error != 0) 2395219089Spjd return (error); 2396219089Spjd 2397219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2398249195Smm return (SET_ERROR(ENXIO)); 2399168404Spjd } 2400168404Spjd 2401168404Spjd /* 2402168404Spjd * Find the best uberblock. 2403168404Spjd */ 2404236884Smm vdev_uberblock_load(rvd, ub, &label); 2405168404Spjd 2406168404Spjd /* 2407168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2408168404Spjd */ 2409236884Smm if (ub->ub_txg == 0) { 2410236884Smm nvlist_free(label); 2411219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2412236884Smm } 2413168404Spjd 2414168404Spjd /* 2415236884Smm * If the pool has an unsupported version we can't open it. 2416168404Spjd */ 2417236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2418236884Smm nvlist_free(label); 2419219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2420236884Smm } 2421168404Spjd 2422236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2423236884Smm nvlist_t *features; 2424236884Smm 2425236884Smm /* 2426236884Smm * If we weren't able to find what's necessary for reading the 2427236884Smm * MOS in the label, return failure. 
2428236884Smm */ 2429236884Smm if (label == NULL || nvlist_lookup_nvlist(label, 2430236884Smm ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2431236884Smm nvlist_free(label); 2432236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2433236884Smm ENXIO)); 2434236884Smm } 2435236884Smm 2436236884Smm /* 2437236884Smm * Update our in-core representation with the definitive values 2438236884Smm * from the label. 2439236884Smm */ 2440236884Smm nvlist_free(spa->spa_label_features); 2441236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2442236884Smm } 2443236884Smm 2444236884Smm nvlist_free(label); 2445236884Smm 2446168404Spjd /* 2447236884Smm * Look through entries in the label nvlist's features_for_read. If 2448236884Smm * there is a feature listed there which we don't understand then we 2449236884Smm * cannot open a pool. 2450236884Smm */ 2451236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2452236884Smm nvlist_t *unsup_feat; 2453236884Smm 2454236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2455236884Smm 0); 2456236884Smm 2457236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2458236884Smm NULL); nvp != NULL; 2459236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2460236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2461236884Smm VERIFY(nvlist_add_string(unsup_feat, 2462236884Smm nvpair_name(nvp), "") == 0); 2463236884Smm } 2464236884Smm } 2465236884Smm 2466236884Smm if (!nvlist_empty(unsup_feat)) { 2467236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2468236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2469236884Smm nvlist_free(unsup_feat); 2470236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2471236884Smm ENOTSUP)); 2472236884Smm } 2473236884Smm 2474236884Smm nvlist_free(unsup_feat); 2475236884Smm } 2476236884Smm 2477236884Smm /* 2478168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 2479219089Spjd * incomplete configuration. We first check to see if the pool 2480219089Spjd * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2481219089Spjd * If it is, defer the vdev_guid_sum check till later so we 2482219089Spjd * can handle missing vdevs. 2483168404Spjd */ 2484219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2485219089Spjd &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2486219089Spjd rvd->vdev_guid_sum != ub->ub_guid_sum) 2487219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2488219089Spjd 2489219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2490219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2491219089Spjd spa_try_repair(spa, config); 2492219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2493219089Spjd nvlist_free(spa->spa_config_splitting); 2494219089Spjd spa->spa_config_splitting = NULL; 2495168404Spjd } 2496168404Spjd 2497168404Spjd /* 2498168404Spjd * Initialize internal SPA structures. 2499168404Spjd */ 2500168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 2501168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2502219089Spjd spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2503219089Spjd TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2504219089Spjd spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2505219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2506219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2507219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2508219089Spjd 2509236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2510219089Spjd if (error) 2511219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2512168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2513168404Spjd 2514219089Spjd if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2515219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2516168404Spjd 2517236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2518236884Smm boolean_t missing_feat_read = B_FALSE; 2519238926Smm nvlist_t *unsup_feat, *enabled_feat; 2520236884Smm 2521236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2522236884Smm &spa->spa_feat_for_read_obj) != 0) { 2523236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2524236884Smm } 2525236884Smm 2526236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2527236884Smm &spa->spa_feat_for_write_obj) != 0) { 2528236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2529236884Smm } 2530236884Smm 2531236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2532236884Smm &spa->spa_feat_desc_obj) != 0) { 2533236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2534236884Smm } 2535236884Smm 2536238926Smm enabled_feat = fnvlist_alloc(); 2537238926Smm unsup_feat = fnvlist_alloc(); 2538236884Smm 2539259813Sdelphij if (!spa_features_check(spa, B_FALSE, 2540238926Smm unsup_feat, enabled_feat)) 2541236884Smm missing_feat_read = B_TRUE; 2542236884Smm 2543236884Smm if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2544259813Sdelphij if (!spa_features_check(spa, B_TRUE, 2545238926Smm unsup_feat, enabled_feat)) { 2546236884Smm missing_feat_write = B_TRUE; 2547238926Smm } 2548236884Smm } 2549236884Smm 2550238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2551238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2552238926Smm 2553236884Smm if (!nvlist_empty(unsup_feat)) { 2554238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2555238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2556236884Smm } 2557236884Smm 2558238926Smm fnvlist_free(enabled_feat); 2559238926Smm fnvlist_free(unsup_feat); 2560236884Smm 2561236884Smm if (!missing_feat_read) { 2562236884Smm fnvlist_add_boolean(spa->spa_load_info, 2563236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2564236884Smm } 2565236884Smm 2566236884Smm /* 2567236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2568236884Smm * twofold: to determine whether the pool is available for 2569236884Smm * import in read-write mode and (if it is not) whether the 2570236884Smm * pool is available for import in read-only mode. If the pool 2571236884Smm * is available for import in read-write mode, it is displayed 2572236884Smm * as available in userland; if it is not available for import 2573236884Smm * in read-only mode, it is displayed as unavailable in 2574236884Smm * userland. If the pool is available for import in read-only 2575236884Smm * mode but not read-write mode, it is displayed as unavailable 2576236884Smm * in userland with a special note that the pool is actually 2577236884Smm * available for open in read-only mode. 
2578236884Smm * 2579236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2580236884Smm * missing a feature for write, we must first determine whether 2581236884Smm * the pool can be opened read-only before returning to 2582236884Smm * userland in order to know whether to display the 2583236884Smm * abovementioned note. 2584236884Smm */ 2585236884Smm if (missing_feat_read || (missing_feat_write && 2586236884Smm spa_writeable(spa))) { 2587236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2588236884Smm ENOTSUP)); 2589236884Smm } 2590260150Sdelphij 2591260150Sdelphij /* 2592260150Sdelphij * Load refcounts for ZFS features from disk into an in-memory 2593260150Sdelphij * cache during SPA initialization. 2594260150Sdelphij */ 2595260150Sdelphij for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2596260150Sdelphij uint64_t refcount; 2597260150Sdelphij 2598260150Sdelphij error = feature_get_refcount_from_disk(spa, 2599260150Sdelphij &spa_feature_table[i], &refcount); 2600260150Sdelphij if (error == 0) { 2601260150Sdelphij spa->spa_feat_refcount_cache[i] = refcount; 2602260150Sdelphij } else if (error == ENOTSUP) { 2603260150Sdelphij spa->spa_feat_refcount_cache[i] = 2604260150Sdelphij SPA_FEATURE_DISABLED; 2605260150Sdelphij } else { 2606260150Sdelphij return (spa_vdev_err(rvd, 2607260150Sdelphij VDEV_AUX_CORRUPT_DATA, EIO)); 2608260150Sdelphij } 2609260150Sdelphij } 2610236884Smm } 2611236884Smm 2612260150Sdelphij if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2613260150Sdelphij if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2614268075Sdelphij &spa->spa_feat_enabled_txg_obj) != 0) 2615260150Sdelphij return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2616260150Sdelphij } 2617260150Sdelphij 2618236884Smm spa->spa_is_initializing = B_TRUE; 2619236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2620236884Smm spa->spa_is_initializing = B_FALSE; 2621236884Smm if (error != 0) 2622236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2623236884Smm 2624168404Spjd if (!mosconfig) { 2625168498Spjd uint64_t hostid; 2626219089Spjd nvlist_t *policy = NULL, *nvconfig; 2627168404Spjd 2628219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2629219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2630168404Spjd 2631219089Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2632185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2633168498Spjd char *hostname; 2634168498Spjd unsigned long myhostid = 0; 2635168498Spjd 2636219089Spjd VERIFY(nvlist_lookup_string(nvconfig, 2637168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2638168498Spjd 2639219089Spjd#ifdef _KERNEL 2640219089Spjd myhostid = zone_get_hostid(NULL); 2641219089Spjd#else /* _KERNEL */ 2642219089Spjd /* 2643219089Spjd * We're emulating the system's hostid in userland, so 2644219089Spjd * we can't use zone_get_hostid(). 2645219089Spjd */ 2646168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2647219089Spjd#endif /* _KERNEL */ 2648204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 2649219089Spjd hostid != myhostid) { 2650219089Spjd nvlist_free(nvconfig); 2651168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 2652168498Spjd "loaded as it was last accessed by " 2653185029Spjd "another system (host: %s hostid: 0x%lx). 
" 2654236146Smm "See: http://illumos.org/msg/ZFS-8000-EY", 2655185029Spjd spa_name(spa), hostname, 2656168498Spjd (unsigned long)hostid); 2657249195Smm return (SET_ERROR(EBADF)); 2658168498Spjd } 2659168498Spjd } 2660219089Spjd if (nvlist_lookup_nvlist(spa->spa_config, 2661219089Spjd ZPOOL_REWIND_POLICY, &policy) == 0) 2662219089Spjd VERIFY(nvlist_add_nvlist(nvconfig, 2663219089Spjd ZPOOL_REWIND_POLICY, policy) == 0); 2664168498Spjd 2665219089Spjd spa_config_set(spa, nvconfig); 2666168404Spjd spa_unload(spa); 2667168404Spjd spa_deactivate(spa); 2668209962Smm spa_activate(spa, orig_mode); 2669168404Spjd 2670219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2671168404Spjd } 2672168404Spjd 2673289422Smav /* Grab the secret checksum salt from the MOS. */ 2674289422Smav error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2675289422Smav DMU_POOL_CHECKSUM_SALT, 1, 2676289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes), 2677289422Smav spa->spa_cksum_salt.zcs_bytes); 2678289422Smav if (error == ENOENT) { 2679289422Smav /* Generate a new salt for subsequent use */ 2680289422Smav (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 2681289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes)); 2682289422Smav } else if (error != 0) { 2683289422Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2684289422Smav } 2685289422Smav 2686219089Spjd if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2687219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2688219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2689219089Spjd if (error != 0) 2690219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2691168404Spjd 2692168404Spjd /* 2693168404Spjd * Load the bit that tells us to use the new accounting function 2694168404Spjd * (raid-z deflation). If we have an older pool, this will not 2695168404Spjd * be present. 2696168404Spjd */ 2697219089Spjd error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2698219089Spjd if (error != 0 && error != ENOENT) 2699219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2700168404Spjd 2701219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2702219089Spjd &spa->spa_creation_version); 2703219089Spjd if (error != 0 && error != ENOENT) 2704219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2705219089Spjd 2706168404Spjd /* 2707168404Spjd * Load the persistent error log. If we have an older pool, this will 2708168404Spjd * not be present. 2709168404Spjd */ 2710219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2711219089Spjd if (error != 0 && error != ENOENT) 2712219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2713168404Spjd 2714219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2715219089Spjd &spa->spa_errlog_scrub); 2716219089Spjd if (error != 0 && error != ENOENT) 2717219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2718168404Spjd 2719168404Spjd /* 2720168404Spjd * Load the history object. If we have an older pool, this 2721168404Spjd * will not be present. 2722168404Spjd */ 2723219089Spjd error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2724219089Spjd if (error != 0 && error != ENOENT) 2725219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2726168404Spjd 2727168404Spjd /* 2728299441Smav * Load the per-vdev ZAP map. 
If we have an older pool, this will not 2729299441Smav * be present; in this case, defer its creation to a later time to 2730299441Smav * avoid dirtying the MOS this early / out of sync context. See 2731299441Smav * spa_sync_config_object. 2732299441Smav */ 2733299441Smav 2734299441Smav /* The sentinel is only available in the MOS config. */ 2735299441Smav nvlist_t *mos_config; 2736299441Smav if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) 2737299441Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2738299441Smav 2739299441Smav error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 2740299441Smav &spa->spa_all_vdev_zaps); 2741299441Smav 2742321540Smav if (error == ENOENT) { 2743321540Smav VERIFY(!nvlist_exists(mos_config, 2744321540Smav ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 2745321540Smav spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 2746321540Smav ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2747321540Smav } else if (error != 0) { 2748299441Smav return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2749321540Smav } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 2750299441Smav /* 2751299441Smav * An older version of ZFS overwrote the sentinel value, so 2752299441Smav * we have orphaned per-vdev ZAPs in the MOS. Defer their 2753299441Smav * destruction to later; see spa_sync_config_object. 2754299441Smav */ 2755299441Smav spa->spa_avz_action = AVZ_ACTION_DESTROY; 2756299441Smav /* 2757299441Smav * We're assuming that no vdevs have had their ZAPs created 2758299441Smav * before this. Better be sure of it. 2759299441Smav */ 2760299441Smav ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2761299441Smav } 2762299441Smav nvlist_free(mos_config); 2763299441Smav 2764299441Smav /* 2765219089Spjd * If we're assembling the pool from the split-off vdevs of 2766219089Spjd * an existing pool, we don't want to attach the spares & cache 2767219089Spjd * devices. 2768219089Spjd */ 2769219089Spjd 2770219089Spjd /* 2771168404Spjd * Load any hot spares for this pool. 2772168404Spjd */ 2773219089Spjd error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2774219089Spjd if (error != 0 && error != ENOENT) 2775219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2776219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2777185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2778185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 2779219089Spjd &spa->spa_spares.sav_config) != 0) 2780219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2781168404Spjd 2782185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2783168404Spjd spa_load_spares(spa); 2784185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2785219089Spjd } else if (error == 0) { 2786219089Spjd spa->spa_spares.sav_sync = B_TRUE; 2787168404Spjd } 2788168404Spjd 2789185029Spjd /* 2790185029Spjd * Load any level 2 ARC devices for this pool. 
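	 * As with the spares above: the cache device list comes from the
	 * DMU_POOL_L2CACHE object and is parsed by spa_load_l2cache().
	 * When assembling a pool from a split the devices are not
	 * attached; sav_sync is set instead so the config is rewritten
	 * on the next sync.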
2791185029Spjd */ 2792219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2793185029Spjd &spa->spa_l2cache.sav_object); 2794219089Spjd if (error != 0 && error != ENOENT) 2795219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2796219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2797185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2798185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2799219089Spjd &spa->spa_l2cache.sav_config) != 0) 2800219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2801185029Spjd 2802185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2803185029Spjd spa_load_l2cache(spa); 2804185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2805219089Spjd } else if (error == 0) { 2806219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2807185029Spjd } 2808185029Spjd 2809219089Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2810213197Smm 2811219089Spjd error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2812219089Spjd if (error && error != ENOENT) 2813219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2814185029Spjd 2815219089Spjd if (error == 0) { 2816219089Spjd uint64_t autoreplace; 2817185029Spjd 2818219089Spjd spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2819219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2820219089Spjd spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2821219089Spjd spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2822219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2823331395Smav spa_prop_find(spa, ZPOOL_PROP_BOOTSIZE, &spa->spa_bootsize); 2824219089Spjd spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2825219089Spjd &spa->spa_dedup_ditto); 2826185029Spjd 2827219089Spjd spa->spa_autoreplace = (autoreplace != 0); 2828168404Spjd } 2829168404Spjd 2830168404Spjd /* 2831185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 2832185029Spjd * the ZFS DE that it should not issue any faults for unopenable 2833185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 2834185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 2835185029Spjd * over. 2836185029Spjd */ 2837219089Spjd if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2838185029Spjd spa_check_removed(spa->spa_root_vdev); 2839219089Spjd /* 2840219089Spjd * For the import case, this is done in spa_import(), because 2841219089Spjd * at this point we're using the spare definitions from 2842219089Spjd * the MOS config, not necessarily from the userland config. 2843219089Spjd */ 2844219089Spjd if (state != SPA_LOAD_IMPORT) { 2845219089Spjd spa_aux_check_removed(&spa->spa_spares); 2846219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 2847219089Spjd } 2848219089Spjd } 2849185029Spjd 2850185029Spjd /* 2851168404Spjd * Load the vdev state for all toplevel vdevs. 2852168404Spjd */ 2853168404Spjd vdev_load(rvd); 2854168404Spjd 2855168404Spjd /* 2856168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 2857168404Spjd */ 2858185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2859168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2860185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2861168404Spjd 2862168404Spjd /* 2863219089Spjd * Load the DDTs (dedup tables). 
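	 * A failure here is treated like any other corrupt MOS object:
	 * the root vdev is marked VDEV_AUX_CORRUPT_DATA and the load
	 * fails with EIO.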
2864168404Spjd */ 2865219089Spjd error = ddt_load(spa); 2866219089Spjd if (error != 0) 2867219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2868219089Spjd 2869219089Spjd spa_update_dspace(spa); 2870219089Spjd 2871219089Spjd /* 2872219089Spjd * Validate the config, using the MOS config to fill in any 2873219089Spjd * information which might be missing. If we fail to validate 2874219089Spjd * the config then declare the pool unfit for use. If we're 2875219089Spjd * assembling a pool from a split, the log is not transferred 2876219089Spjd * over. 2877219089Spjd */ 2878219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2879219089Spjd nvlist_t *nvconfig; 2880219089Spjd 2881219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2882219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2883219089Spjd 2884219089Spjd if (!spa_config_valid(spa, nvconfig)) { 2885219089Spjd nvlist_free(nvconfig); 2886219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2887219089Spjd ENXIO)); 2888219089Spjd } 2889219089Spjd nvlist_free(nvconfig); 2890219089Spjd 2891219089Spjd /* 2892236884Smm * Now that we've validated the config, check the state of the 2893219089Spjd * root vdev. If it can't be opened, it indicates one or 2894219089Spjd * more toplevel vdevs are faulted. 2895219089Spjd */ 2896219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2897249195Smm return (SET_ERROR(ENXIO)); 2898219089Spjd 2899286600Smav if (spa_writeable(spa) && spa_check_logs(spa)) { 2900219089Spjd *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2901219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2902219089Spjd } 2903168404Spjd } 2904168404Spjd 2905236884Smm if (missing_feat_write) { 2906236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 2907236884Smm 2908236884Smm /* 2909236884Smm * At this point, we know that we can open the pool in 2910236884Smm * read-only mode but not read-write mode. We now have enough 2911236884Smm * information and can return to userland. 2912236884Smm */ 2913236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2914236884Smm } 2915236884Smm 2916219089Spjd /* 2917219089Spjd * We've successfully opened the pool, verify that we're ready 2918219089Spjd * to start pushing transactions. 2919219089Spjd */ 2920219089Spjd if (state != SPA_LOAD_TRYIMPORT) { 2921219089Spjd if (error = spa_load_verify(spa)) 2922219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2923219089Spjd error)); 2924219089Spjd } 2925219089Spjd 2926219089Spjd if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2927219089Spjd spa->spa_load_max_txg == UINT64_MAX)) { 2928168404Spjd dmu_tx_t *tx; 2929168404Spjd int need_update = B_FALSE; 2930286686Smav dsl_pool_t *dp = spa_get_dsl(spa); 2931168404Spjd 2932209962Smm ASSERT(state != SPA_LOAD_TRYIMPORT); 2933209962Smm 2934168404Spjd /* 2935168404Spjd * Claim log blocks that haven't been committed yet. 2936168404Spjd * This must all happen in a single txg. 2937219089Spjd * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2938219089Spjd * invoked from zil_claim_log_block()'s i/o done callback. 2939219089Spjd * Price of rollback is that we abandon the log. 
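		 * zil_claim() is invoked for every dataset in the pool
		 * (dmu_objset_find_dp() below) using one assigned tx, so
		 * all claims are written in the same txg.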
2940168404Spjd */ 2941219089Spjd spa->spa_claiming = B_TRUE; 2942219089Spjd 2943286686Smav tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 2944286686Smav (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2945168404Spjd zil_claim, tx, DS_FIND_CHILDREN); 2946168404Spjd dmu_tx_commit(tx); 2947168404Spjd 2948219089Spjd spa->spa_claiming = B_FALSE; 2949219089Spjd 2950219089Spjd spa_set_log_state(spa, SPA_LOG_GOOD); 2951168404Spjd spa->spa_sync_on = B_TRUE; 2952168404Spjd txg_sync_start(spa->spa_dsl_pool); 2953168404Spjd 2954168404Spjd /* 2955219089Spjd * Wait for all claims to sync. We sync up to the highest 2956219089Spjd * claimed log block birth time so that claimed log blocks 2957219089Spjd * don't appear to be from the future. spa_claim_max_txg 2958219089Spjd * will have been set for us by either zil_check_log_chain() 2959219089Spjd * (invoked from spa_check_logs()) or zil_claim() above. 2960168404Spjd */ 2961219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2962168404Spjd 2963168404Spjd /* 2964168404Spjd * If the config cache is stale, or we have uninitialized 2965168404Spjd * metaslabs (see spa_vdev_add()), then update the config. 2966209962Smm * 2967219089Spjd * If this is a verbatim import, trust the current 2968209962Smm * in-core spa_config and update the disk labels. 2969168404Spjd */ 2970168404Spjd if (config_cache_txg != spa->spa_config_txg || 2971219089Spjd state == SPA_LOAD_IMPORT || 2972219089Spjd state == SPA_LOAD_RECOVER || 2973219089Spjd (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2974168404Spjd need_update = B_TRUE; 2975168404Spjd 2976209962Smm for (int c = 0; c < rvd->vdev_children; c++) 2977168404Spjd if (rvd->vdev_child[c]->vdev_ms_array == 0) 2978168404Spjd need_update = B_TRUE; 2979168404Spjd 2980168404Spjd /* 2981168404Spjd * Update the config cache asychronously in case we're the 2982168404Spjd * root pool, in which case the config cache isn't writable yet. 2983168404Spjd */ 2984168404Spjd if (need_update) 2985168404Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2986208683Spjd 2987208683Spjd /* 2988208683Spjd * Check all DTLs to see if anything needs resilvering. 2989208683Spjd */ 2990219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2991219089Spjd vdev_resilver_needed(rvd, NULL, NULL)) 2992208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 2993219089Spjd 2994219089Spjd /* 2995248571Smm * Log the fact that we booted up (so that we can detect if 2996248571Smm * we rebooted in the middle of an operation). 2997248571Smm */ 2998248571Smm spa_history_log_version(spa, "open"); 2999248571Smm 3000248571Smm /* 3001219089Spjd * Delete any inconsistent datasets. 3002219089Spjd */ 3003219089Spjd (void) dmu_objset_find(spa_name(spa), 3004219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 3005219089Spjd 3006219089Spjd /* 3007219089Spjd * Clean up any stale temporary dataset userrefs. 
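 * dsl_pool_clean_tmp_userrefs() below releases temporary holds whose
 * creators never got the chance to drop them (for example, because
 * the system went down mid-operation).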
3008219089Spjd */ 3009219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 3010168404Spjd } 3011168404Spjd 3012219089Spjd return (0); 3013219089Spjd} 3014168404Spjd 3015219089Spjdstatic int 3016219089Spjdspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 3017219089Spjd{ 3018219089Spjd int mode = spa->spa_mode; 3019219089Spjd 3020219089Spjd spa_unload(spa); 3021219089Spjd spa_deactivate(spa); 3022219089Spjd 3023268720Sdelphij spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 3024219089Spjd 3025219089Spjd spa_activate(spa, mode); 3026219089Spjd spa_async_suspend(spa); 3027219089Spjd 3028219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 3029168404Spjd} 3030168404Spjd 3031236884Smm/* 3032236884Smm * If spa_load() fails this function will try loading prior txg's. If 3033236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 3034236884Smm * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 3035236884Smm * function will not rewind the pool and will return the same error as 3036236884Smm * spa_load(). 3037236884Smm */ 3038219089Spjdstatic int 3039219089Spjdspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 3040219089Spjd uint64_t max_request, int rewind_flags) 3041219089Spjd{ 3042236884Smm nvlist_t *loadinfo = NULL; 3043219089Spjd nvlist_t *config = NULL; 3044219089Spjd int load_error, rewind_error; 3045219089Spjd uint64_t safe_rewind_txg; 3046219089Spjd uint64_t min_txg; 3047219089Spjd 3048219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 3049219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 3050219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 3051219089Spjd } else { 3052219089Spjd spa->spa_load_max_txg = max_request; 3053268720Sdelphij if (max_request != UINT64_MAX) 3054268720Sdelphij spa->spa_extreme_rewind = B_TRUE; 3055219089Spjd } 3056219089Spjd 3057219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 3058219089Spjd mosconfig); 3059219089Spjd if (load_error == 0) 3060219089Spjd return (0); 3061219089Spjd 3062219089Spjd if (spa->spa_root_vdev != NULL) 3063219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3064219089Spjd 3065219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 3066219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 3067219089Spjd 3068219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 3069219089Spjd nvlist_free(config); 3070219089Spjd return (load_error); 3071219089Spjd } 3072219089Spjd 3073236884Smm if (state == SPA_LOAD_RECOVER) { 3074236884Smm /* Price of rolling back is discarding txgs, including log */ 3075219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 3076236884Smm } else { 3077236884Smm /* 3078236884Smm * If we aren't rolling back save the load info from our first 3079236884Smm * import attempt so that we can restore it after attempting 3080236884Smm * to rewind. 3081236884Smm */ 3082236884Smm loadinfo = spa->spa_load_info; 3083236884Smm spa->spa_load_info = fnvlist_alloc(); 3084236884Smm } 3085219089Spjd 3086219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 3087219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 3088219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
3089219089Spjd TXG_INITIAL : safe_rewind_txg; 3090219089Spjd 3091219089Spjd /* 3092219089Spjd * Continue as long as we're finding errors, we're still within 3093219089Spjd * the acceptable rewind range, and we're still finding uberblocks 3094219089Spjd */ 3095219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 3096219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 3097219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 3098219089Spjd spa->spa_extreme_rewind = B_TRUE; 3099219089Spjd rewind_error = spa_load_retry(spa, state, mosconfig); 3100219089Spjd } 3101219089Spjd 3102219089Spjd spa->spa_extreme_rewind = B_FALSE; 3103219089Spjd spa->spa_load_max_txg = UINT64_MAX; 3104219089Spjd 3105219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 3106219089Spjd spa_config_set(spa, config); 3107325535Savg else 3108325535Savg nvlist_free(config); 3109219089Spjd 3110236884Smm if (state == SPA_LOAD_RECOVER) { 3111236884Smm ASSERT3P(loadinfo, ==, NULL); 3112236884Smm return (rewind_error); 3113236884Smm } else { 3114236884Smm /* Store the rewind info as part of the initial load info */ 3115236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 3116236884Smm spa->spa_load_info); 3117236884Smm 3118236884Smm /* Restore the initial load info */ 3119236884Smm fnvlist_free(spa->spa_load_info); 3120236884Smm spa->spa_load_info = loadinfo; 3121236884Smm 3122236884Smm return (load_error); 3123236884Smm } 3124219089Spjd} 3125219089Spjd 3126168404Spjd/* 3127168404Spjd * Pool Open/Import 3128168404Spjd * 3129168404Spjd * The import case is identical to an open except that the configuration is sent 3130168404Spjd * down from userland, instead of grabbed from the configuration cache. For the 3131168404Spjd * case of an open, the pool configuration will exist in the 3132185029Spjd * POOL_STATE_UNINITIALIZED state. 3133168404Spjd * 3134168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 3135168404Spjd * the same time open the pool, without having to keep around the spa_t in some 3136168404Spjd * ambiguous state. 3137168404Spjd */ 3138168404Spjdstatic int 3139219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 3140219089Spjd nvlist_t **config) 3141168404Spjd{ 3142168404Spjd spa_t *spa; 3143219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 3144168404Spjd int error; 3145168404Spjd int locked = B_FALSE; 3146219089Spjd int firstopen = B_FALSE; 3147168404Spjd 3148168404Spjd *spapp = NULL; 3149168404Spjd 3150168404Spjd /* 3151168404Spjd * As disgusting as this is, we need to support recursive calls to this 3152168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 3153168404Spjd * up calling spa_open() again. The real fix is to figure out how to 3154168404Spjd * avoid dsl_dir_open() calling this in the first place. 3155168404Spjd */ 3156168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 3157168404Spjd mutex_enter(&spa_namespace_lock); 3158168404Spjd locked = B_TRUE; 3159168404Spjd } 3160168404Spjd 3161168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 3162168404Spjd if (locked) 3163168404Spjd mutex_exit(&spa_namespace_lock); 3164249195Smm return (SET_ERROR(ENOENT)); 3165168404Spjd } 3166219089Spjd 3167168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 3168219089Spjd zpool_rewind_policy_t policy; 3169168404Spjd 3170219089Spjd firstopen = B_TRUE; 3171219089Spjd 3172219089Spjd zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 3173219089Spjd &policy); 3174219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 3175219089Spjd state = SPA_LOAD_RECOVER; 3176219089Spjd 3177209962Smm spa_activate(spa, spa_mode_global); 3178168404Spjd 3179219089Spjd if (state != SPA_LOAD_RECOVER) 3180219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3181168404Spjd 3182219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 3183219089Spjd policy.zrp_request); 3184219089Spjd 3185168404Spjd if (error == EBADF) { 3186168404Spjd /* 3187168404Spjd * If vdev_validate() returns failure (indicated by 3188168404Spjd * EBADF), it indicates that one of the vdevs indicates 3189168404Spjd * that the pool has been exported or destroyed. If 3190168404Spjd * this is the case, the config cache is out of sync and 3191168404Spjd * we should remove the pool from the namespace. 3192168404Spjd */ 3193168404Spjd spa_unload(spa); 3194168404Spjd spa_deactivate(spa); 3195185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE); 3196168404Spjd spa_remove(spa); 3197168404Spjd if (locked) 3198168404Spjd mutex_exit(&spa_namespace_lock); 3199249195Smm return (SET_ERROR(ENOENT)); 3200168404Spjd } 3201168404Spjd 3202168404Spjd if (error) { 3203168404Spjd /* 3204168404Spjd * We can't open the pool, but we still have useful 3205168404Spjd * information: the state of each vdev after the 3206168404Spjd * attempted vdev_open(). Return this to the user. 3207168404Spjd */ 3208219089Spjd if (config != NULL && spa->spa_config) { 3209219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 3210219089Spjd KM_SLEEP) == 0); 3211219089Spjd VERIFY(nvlist_add_nvlist(*config, 3212219089Spjd ZPOOL_CONFIG_LOAD_INFO, 3213219089Spjd spa->spa_load_info) == 0); 3214219089Spjd } 3215168404Spjd spa_unload(spa); 3216168404Spjd spa_deactivate(spa); 3217219089Spjd spa->spa_last_open_failed = error; 3218168404Spjd if (locked) 3219168404Spjd mutex_exit(&spa_namespace_lock); 3220168404Spjd *spapp = NULL; 3221168404Spjd return (error); 3222168404Spjd } 3223168404Spjd } 3224168404Spjd 3225168404Spjd spa_open_ref(spa, tag); 3226185029Spjd 3227219089Spjd if (config != NULL) 3228219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3229219089Spjd 3230219089Spjd /* 3231219089Spjd * If we've recovered the pool, pass back any information we 3232219089Spjd * gathered while doing the load. 
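 * The extra data travels in the returned config under the
 * ZPOOL_CONFIG_LOAD_INFO key, so a caller that wants it can fetch it
 * with something like:
 *
 *	nvlist_t *info;
 *	(void) nvlist_lookup_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 *	    &info);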
3233219089Spjd */ 3234219089Spjd if (state == SPA_LOAD_RECOVER) { 3235219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3236219089Spjd spa->spa_load_info) == 0); 3237219089Spjd } 3238219089Spjd 3239219089Spjd if (locked) { 3240219089Spjd spa->spa_last_open_failed = 0; 3241219089Spjd spa->spa_last_ubsync_txg = 0; 3242219089Spjd spa->spa_load_txg = 0; 3243168404Spjd mutex_exit(&spa_namespace_lock); 3244219089Spjd#ifdef __FreeBSD__ 3245219089Spjd#ifdef _KERNEL 3246219089Spjd if (firstopen) 3247249047Savg zvol_create_minors(spa->spa_name); 3248219089Spjd#endif 3249219089Spjd#endif 3250219089Spjd } 3251168404Spjd 3252168404Spjd *spapp = spa; 3253168404Spjd 3254168404Spjd return (0); 3255168404Spjd} 3256168404Spjd 3257168404Spjdint 3258219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3259219089Spjd nvlist_t **config) 3260219089Spjd{ 3261219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 3262219089Spjd} 3263219089Spjd 3264219089Spjdint 3265168404Spjdspa_open(const char *name, spa_t **spapp, void *tag) 3266168404Spjd{ 3267219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 3268168404Spjd} 3269168404Spjd 3270168404Spjd/* 3271168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 3272168404Spjd * preventing it from being exported or destroyed. 3273168404Spjd */ 3274168404Spjdspa_t * 3275168404Spjdspa_inject_addref(char *name) 3276168404Spjd{ 3277168404Spjd spa_t *spa; 3278168404Spjd 3279168404Spjd mutex_enter(&spa_namespace_lock); 3280168404Spjd if ((spa = spa_lookup(name)) == NULL) { 3281168404Spjd mutex_exit(&spa_namespace_lock); 3282168404Spjd return (NULL); 3283168404Spjd } 3284168404Spjd spa->spa_inject_ref++; 3285168404Spjd mutex_exit(&spa_namespace_lock); 3286168404Spjd 3287168404Spjd return (spa); 3288168404Spjd} 3289168404Spjd 3290168404Spjdvoid 3291168404Spjdspa_inject_delref(spa_t *spa) 3292168404Spjd{ 3293168404Spjd mutex_enter(&spa_namespace_lock); 3294168404Spjd spa->spa_inject_ref--; 3295168404Spjd mutex_exit(&spa_namespace_lock); 3296168404Spjd} 3297168404Spjd 3298185029Spjd/* 3299185029Spjd * Add spares device information to the nvlist. 3300185029Spjd */ 3301168404Spjdstatic void 3302168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 3303168404Spjd{ 3304168404Spjd nvlist_t **spares; 3305168404Spjd uint_t i, nspares; 3306168404Spjd nvlist_t *nvroot; 3307168404Spjd uint64_t guid; 3308168404Spjd vdev_stat_t *vs; 3309168404Spjd uint_t vsc; 3310168404Spjd uint64_t pool; 3311168404Spjd 3312209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3313209962Smm 3314185029Spjd if (spa->spa_spares.sav_count == 0) 3315168404Spjd return; 3316168404Spjd 3317168404Spjd VERIFY(nvlist_lookup_nvlist(config, 3318168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3319185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3320168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3321168404Spjd if (nspares != 0) { 3322168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3323168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3324168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3325168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3326168404Spjd 3327168404Spjd /* 3328168404Spjd * Go through and find any spares which have since been 3329168404Spjd * repurposed as an active spare. If this is the case, update 3330168404Spjd * their status appropriately. 
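 * "Repurposed" here means spa_spare_exists() reports a non-zero
 * owning pool guid; such spares are marked VDEV_STATE_CANT_OPEN with
 * aux state VDEV_AUX_SPARED so status reporting shows them as in use.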
3331168404Spjd */ 3332168404Spjd for (i = 0; i < nspares; i++) { 3333168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 3334168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3335185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 3336185029Spjd pool != 0ULL) { 3337168404Spjd VERIFY(nvlist_lookup_uint64_array( 3338219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 3339168404Spjd (uint64_t **)&vs, &vsc) == 0); 3340168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 3341168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 3342168404Spjd } 3343168404Spjd } 3344168404Spjd } 3345168404Spjd} 3346168404Spjd 3347185029Spjd/* 3348185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 3349185029Spjd */ 3350185029Spjdstatic void 3351185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 3352185029Spjd{ 3353185029Spjd nvlist_t **l2cache; 3354185029Spjd uint_t i, j, nl2cache; 3355185029Spjd nvlist_t *nvroot; 3356185029Spjd uint64_t guid; 3357185029Spjd vdev_t *vd; 3358185029Spjd vdev_stat_t *vs; 3359185029Spjd uint_t vsc; 3360185029Spjd 3361209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3362209962Smm 3363185029Spjd if (spa->spa_l2cache.sav_count == 0) 3364185029Spjd return; 3365185029Spjd 3366185029Spjd VERIFY(nvlist_lookup_nvlist(config, 3367185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3368185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3369185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3370185029Spjd if (nl2cache != 0) { 3371185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3372185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3373185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3374185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3375185029Spjd 3376185029Spjd /* 3377185029Spjd * Update level 2 cache device stats. 3378185029Spjd */ 3379185029Spjd 3380185029Spjd for (i = 0; i < nl2cache; i++) { 3381185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 3382185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3383185029Spjd 3384185029Spjd vd = NULL; 3385185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3386185029Spjd if (guid == 3387185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3388185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 3389185029Spjd break; 3390185029Spjd } 3391185029Spjd } 3392185029Spjd ASSERT(vd != NULL); 3393185029Spjd 3394185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3395219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3396219089Spjd == 0); 3397185029Spjd vdev_get_stats(vd, vs); 3398185029Spjd } 3399185029Spjd } 3400185029Spjd} 3401185029Spjd 3402236884Smmstatic void 3403236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 3404236884Smm{ 3405236884Smm nvlist_t *features; 3406236884Smm zap_cursor_t zc; 3407236884Smm zap_attribute_t za; 3408236884Smm 3409236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3410236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3411236884Smm 3412253993Smav /* We may be unable to read features if pool is suspended. 
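 * In that case skip the ZAP walks entirely and attach an empty
 * feature nvlist instead of issuing reads against a suspended pool.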
*/ 3413253993Smav if (spa_suspended(spa)) 3414253993Smav goto out; 3415253993Smav 3416236884Smm if (spa->spa_feat_for_read_obj != 0) { 3417236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3418236884Smm spa->spa_feat_for_read_obj); 3419236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3420236884Smm zap_cursor_advance(&zc)) { 3421236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3422236884Smm za.za_num_integers == 1); 3423236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3424236884Smm za.za_first_integer)); 3425236884Smm } 3426236884Smm zap_cursor_fini(&zc); 3427236884Smm } 3428236884Smm 3429236884Smm if (spa->spa_feat_for_write_obj != 0) { 3430236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3431236884Smm spa->spa_feat_for_write_obj); 3432236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3433236884Smm zap_cursor_advance(&zc)) { 3434236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3435236884Smm za.za_num_integers == 1); 3436236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3437236884Smm za.za_first_integer)); 3438236884Smm } 3439236884Smm zap_cursor_fini(&zc); 3440236884Smm } 3441236884Smm 3442253993Smavout: 3443236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3444236884Smm features) == 0); 3445236884Smm nvlist_free(features); 3446236884Smm} 3447236884Smm 3448168404Spjdint 3449236884Smmspa_get_stats(const char *name, nvlist_t **config, 3450236884Smm char *altroot, size_t buflen) 3451168404Spjd{ 3452168404Spjd int error; 3453168404Spjd spa_t *spa; 3454168404Spjd 3455168404Spjd *config = NULL; 3456219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 3457168404Spjd 3458209962Smm if (spa != NULL) { 3459209962Smm /* 3460209962Smm * This still leaves a window of inconsistency where the spares 3461209962Smm * or l2cache devices could change and the config would be 3462209962Smm * self-inconsistent. 3463209962Smm */ 3464209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3465168404Spjd 3466209962Smm if (*config != NULL) { 3467219089Spjd uint64_t loadtimes[2]; 3468219089Spjd 3469219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3470219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3471219089Spjd VERIFY(nvlist_add_uint64_array(*config, 3472219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3473219089Spjd 3474185029Spjd VERIFY(nvlist_add_uint64(*config, 3475209962Smm ZPOOL_CONFIG_ERRCOUNT, 3476209962Smm spa_get_errlog_size(spa)) == 0); 3477185029Spjd 3478209962Smm if (spa_suspended(spa)) 3479209962Smm VERIFY(nvlist_add_uint64(*config, 3480209962Smm ZPOOL_CONFIG_SUSPENDED, 3481209962Smm spa->spa_failmode) == 0); 3482209962Smm 3483209962Smm spa_add_spares(spa, *config); 3484209962Smm spa_add_l2cache(spa, *config); 3485236884Smm spa_add_feature_stats(spa, *config); 3486209962Smm } 3487168404Spjd } 3488168404Spjd 3489168404Spjd /* 3490168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 3491168404Spjd * and call spa_lookup() directly. 
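 * spa_lookup() must be called with the namespace lock held, so take
 * it here just long enough to copy out the altroot (or return an
 * empty string if the pool does not exist at all).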
3492168404Spjd */ 3493168404Spjd if (altroot) { 3494168404Spjd if (spa == NULL) { 3495168404Spjd mutex_enter(&spa_namespace_lock); 3496168404Spjd spa = spa_lookup(name); 3497168404Spjd if (spa) 3498168404Spjd spa_altroot(spa, altroot, buflen); 3499168404Spjd else 3500168404Spjd altroot[0] = '\0'; 3501168404Spjd spa = NULL; 3502168404Spjd mutex_exit(&spa_namespace_lock); 3503168404Spjd } else { 3504168404Spjd spa_altroot(spa, altroot, buflen); 3505168404Spjd } 3506168404Spjd } 3507168404Spjd 3508209962Smm if (spa != NULL) { 3509209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 3510168404Spjd spa_close(spa, FTAG); 3511209962Smm } 3512168404Spjd 3513168404Spjd return (error); 3514168404Spjd} 3515168404Spjd 3516168404Spjd/* 3517185029Spjd * Validate that the auxiliary device array is well formed. We must have an 3518185029Spjd * array of nvlists, each which describes a valid leaf vdev. If this is an 3519185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3520185029Spjd * specified, as long as they are well-formed. 3521168404Spjd */ 3522168404Spjdstatic int 3523185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3524185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 3525185029Spjd vdev_labeltype_t label) 3526168404Spjd{ 3527185029Spjd nvlist_t **dev; 3528185029Spjd uint_t i, ndev; 3529168404Spjd vdev_t *vd; 3530168404Spjd int error; 3531168404Spjd 3532185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3533185029Spjd 3534168404Spjd /* 3535185029Spjd * It's acceptable to have no devs specified. 3536168404Spjd */ 3537185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3538168404Spjd return (0); 3539168404Spjd 3540185029Spjd if (ndev == 0) 3541249195Smm return (SET_ERROR(EINVAL)); 3542168404Spjd 3543168404Spjd /* 3544185029Spjd * Make sure the pool is formatted with a version that supports this 3545185029Spjd * device type. 3546168404Spjd */ 3547185029Spjd if (spa_version(spa) < version) 3548249195Smm return (SET_ERROR(ENOTSUP)); 3549168404Spjd 3550168404Spjd /* 3551185029Spjd * Set the pending device list so we correctly handle device in-use 3552168404Spjd * checking. 3553168404Spjd */ 3554185029Spjd sav->sav_pending = dev; 3555185029Spjd sav->sav_npending = ndev; 3556168404Spjd 3557185029Spjd for (i = 0; i < ndev; i++) { 3558185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3559168404Spjd mode)) != 0) 3560168404Spjd goto out; 3561168404Spjd 3562168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 3563168404Spjd vdev_free(vd); 3564249195Smm error = SET_ERROR(EINVAL); 3565168404Spjd goto out; 3566168404Spjd } 3567168404Spjd 3568185029Spjd /* 3569185029Spjd * The L2ARC currently only supports disk devices in 3570185029Spjd * kernel context. For user-level testing, we allow it. 
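 * Hence the check below is compiled only under _KERNEL and rejects
 * any cache vdev whose type is not VDEV_TYPE_DISK with ENOTBLK.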
3571185029Spjd */ 3572185029Spjd#ifdef _KERNEL 3573185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3574185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3575249195Smm error = SET_ERROR(ENOTBLK); 3576230514Smm vdev_free(vd); 3577185029Spjd goto out; 3578185029Spjd } 3579185029Spjd#endif 3580168404Spjd vd->vdev_top = vd; 3581168404Spjd 3582168404Spjd if ((error = vdev_open(vd)) == 0 && 3583185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 3584185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3585168404Spjd vd->vdev_guid) == 0); 3586168404Spjd } 3587168404Spjd 3588168404Spjd vdev_free(vd); 3589168404Spjd 3590185029Spjd if (error && 3591185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3592168404Spjd goto out; 3593168404Spjd else 3594168404Spjd error = 0; 3595168404Spjd } 3596168404Spjd 3597168404Spjdout: 3598185029Spjd sav->sav_pending = NULL; 3599185029Spjd sav->sav_npending = 0; 3600168404Spjd return (error); 3601168404Spjd} 3602168404Spjd 3603185029Spjdstatic int 3604185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3605185029Spjd{ 3606185029Spjd int error; 3607185029Spjd 3608185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3609185029Spjd 3610185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3611185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3612185029Spjd VDEV_LABEL_SPARE)) != 0) { 3613185029Spjd return (error); 3614185029Spjd } 3615185029Spjd 3616185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3617185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3618185029Spjd VDEV_LABEL_L2CACHE)); 3619185029Spjd} 3620185029Spjd 3621185029Spjdstatic void 3622185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3623185029Spjd const char *config) 3624185029Spjd{ 3625185029Spjd int i; 3626185029Spjd 3627185029Spjd if (sav->sav_config != NULL) { 3628185029Spjd nvlist_t **olddevs; 3629185029Spjd uint_t oldndevs; 3630185029Spjd nvlist_t **newdevs; 3631185029Spjd 3632185029Spjd /* 3633185029Spjd * Generate new dev list by concatentating with the 3634185029Spjd * current dev list. 3635185029Spjd */ 3636185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3637185029Spjd &olddevs, &oldndevs) == 0); 3638185029Spjd 3639185029Spjd newdevs = kmem_alloc(sizeof (void *) * 3640185029Spjd (ndevs + oldndevs), KM_SLEEP); 3641185029Spjd for (i = 0; i < oldndevs; i++) 3642185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3643185029Spjd KM_SLEEP) == 0); 3644185029Spjd for (i = 0; i < ndevs; i++) 3645185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3646185029Spjd KM_SLEEP) == 0); 3647185029Spjd 3648185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 3649185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 3650185029Spjd 3651185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3652185029Spjd config, newdevs, ndevs + oldndevs) == 0); 3653185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 3654185029Spjd nvlist_free(newdevs[i]); 3655185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3656185029Spjd } else { 3657185029Spjd /* 3658185029Spjd * Generate a new dev list. 
3659185029Spjd */ 3660185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3661185029Spjd KM_SLEEP) == 0); 3662185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3663185029Spjd devs, ndevs) == 0); 3664185029Spjd } 3665185029Spjd} 3666185029Spjd 3667168404Spjd/* 3668185029Spjd * Stop and drop level 2 ARC devices 3669185029Spjd */ 3670185029Spjdvoid 3671185029Spjdspa_l2cache_drop(spa_t *spa) 3672185029Spjd{ 3673185029Spjd vdev_t *vd; 3674185029Spjd int i; 3675185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 3676185029Spjd 3677185029Spjd for (i = 0; i < sav->sav_count; i++) { 3678185029Spjd uint64_t pool; 3679185029Spjd 3680185029Spjd vd = sav->sav_vdevs[i]; 3681185029Spjd ASSERT(vd != NULL); 3682185029Spjd 3683209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3684209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 3685185029Spjd l2arc_remove_vdev(vd); 3686185029Spjd } 3687185029Spjd} 3688185029Spjd 3689185029Spjd/* 3690168404Spjd * Pool Creation 3691168404Spjd */ 3692168404Spjdint 3693185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3694248571Smm nvlist_t *zplprops) 3695168404Spjd{ 3696168404Spjd spa_t *spa; 3697185029Spjd char *altroot = NULL; 3698168404Spjd vdev_t *rvd; 3699168404Spjd dsl_pool_t *dp; 3700168404Spjd dmu_tx_t *tx; 3701219089Spjd int error = 0; 3702168404Spjd uint64_t txg = TXG_INITIAL; 3703185029Spjd nvlist_t **spares, **l2cache; 3704185029Spjd uint_t nspares, nl2cache; 3705219089Spjd uint64_t version, obj; 3706236884Smm boolean_t has_features; 3707168404Spjd 3708168404Spjd /* 3709168404Spjd * If this pool already exists, return failure. 3710168404Spjd */ 3711168404Spjd mutex_enter(&spa_namespace_lock); 3712168404Spjd if (spa_lookup(pool) != NULL) { 3713168404Spjd mutex_exit(&spa_namespace_lock); 3714249195Smm return (SET_ERROR(EEXIST)); 3715168404Spjd } 3716168404Spjd 3717168404Spjd /* 3718168404Spjd * Allocate a new spa_t structure. 
3719168404Spjd */ 3720185029Spjd (void) nvlist_lookup_string(props, 3721185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3722219089Spjd spa = spa_add(pool, NULL, altroot); 3723209962Smm spa_activate(spa, spa_mode_global); 3724168404Spjd 3725185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 3726185029Spjd spa_deactivate(spa); 3727185029Spjd spa_remove(spa); 3728185029Spjd mutex_exit(&spa_namespace_lock); 3729185029Spjd return (error); 3730185029Spjd } 3731185029Spjd 3732236884Smm has_features = B_FALSE; 3733236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3734236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3735236884Smm if (zpool_prop_feature(nvpair_name(elem))) 3736236884Smm has_features = B_TRUE; 3737236884Smm } 3738236884Smm 3739236884Smm if (has_features || nvlist_lookup_uint64(props, 3740236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3741185029Spjd version = SPA_VERSION; 3742236884Smm } 3743236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3744219089Spjd 3745219089Spjd spa->spa_first_txg = txg; 3746219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 3747185029Spjd spa->spa_uberblock.ub_version = version; 3748168404Spjd spa->spa_ubsync = spa->spa_uberblock; 3749307277Smav spa->spa_load_state = SPA_LOAD_CREATE; 3750168404Spjd 3751168404Spjd /* 3752209962Smm * Create "The Godfather" zio to hold all async IOs 3753209962Smm */ 3754272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3755272598Sdelphij KM_SLEEP); 3756272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 3757272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3758272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3759272598Sdelphij ZIO_FLAG_GODFATHER); 3760272598Sdelphij } 3761209962Smm 3762209962Smm /* 3763168404Spjd * Create the root vdev. 3764168404Spjd */ 3765185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3766168404Spjd 3767168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3768168404Spjd 3769168404Spjd ASSERT(error != 0 || rvd != NULL); 3770168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3771168404Spjd 3772185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 3773249195Smm error = SET_ERROR(EINVAL); 3774168404Spjd 3775168404Spjd if (error == 0 && 3776168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3777185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 3778168404Spjd VDEV_ALLOC_ADD)) == 0) { 3779219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 3780254591Sgibbs vdev_ashift_optimize(rvd->vdev_child[c]); 3781219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 3782219089Spjd vdev_expand(rvd->vdev_child[c], txg); 3783219089Spjd } 3784168404Spjd } 3785168404Spjd 3786185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3787168404Spjd 3788168404Spjd if (error != 0) { 3789168404Spjd spa_unload(spa); 3790168404Spjd spa_deactivate(spa); 3791168404Spjd spa_remove(spa); 3792168404Spjd mutex_exit(&spa_namespace_lock); 3793168404Spjd return (error); 3794168404Spjd } 3795168404Spjd 3796168404Spjd /* 3797168404Spjd * Get the list of spares, if specified. 
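 * The caller-supplied spare list is copied into sav_config, the
 * spares are opened via spa_load_spares() under SCL_ALL, and sav_sync
 * is set so the list is written out with the new pool.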
3798168404Spjd */ 3799168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3800168404Spjd &spares, &nspares) == 0) { 3801185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3802168404Spjd KM_SLEEP) == 0); 3803185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3804168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3805185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3806168404Spjd spa_load_spares(spa); 3807185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3808185029Spjd spa->spa_spares.sav_sync = B_TRUE; 3809168404Spjd } 3810168404Spjd 3811185029Spjd /* 3812185029Spjd * Get the list of level 2 cache devices, if specified. 3813185029Spjd */ 3814185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3815185029Spjd &l2cache, &nl2cache) == 0) { 3816185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3817185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 3818185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3819185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3820185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3821185029Spjd spa_load_l2cache(spa); 3822185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3823185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3824185029Spjd } 3825185029Spjd 3826236884Smm spa->spa_is_initializing = B_TRUE; 3827185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3828168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 3829236884Smm spa->spa_is_initializing = B_FALSE; 3830168404Spjd 3831219089Spjd /* 3832219089Spjd * Create DDTs (dedup tables). 3833219089Spjd */ 3834219089Spjd ddt_create(spa); 3835219089Spjd 3836219089Spjd spa_update_dspace(spa); 3837219089Spjd 3838168404Spjd tx = dmu_tx_create_assigned(dp, txg); 3839168404Spjd 3840168404Spjd /* 3841168404Spjd * Create the pool config object. 3842168404Spjd */ 3843168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3844185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3845168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3846168404Spjd 3847168404Spjd if (zap_add(spa->spa_meta_objset, 3848168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3849168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3850168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 3851168404Spjd } 3852168404Spjd 3853236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 3854236884Smm spa_feature_create_zap_objects(spa, tx); 3855236884Smm 3856219089Spjd if (zap_add(spa->spa_meta_objset, 3857219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3858219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 3859219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 3860219089Spjd } 3861219089Spjd 3862185029Spjd /* Newly created pools with the right version are always deflated. */ 3863185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3864185029Spjd spa->spa_deflate = TRUE; 3865185029Spjd if (zap_add(spa->spa_meta_objset, 3866185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3867185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3868185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 3869185029Spjd } 3870168404Spjd } 3871168404Spjd 3872168404Spjd /* 3873219089Spjd * Create the deferred-free bpobj. Turn off compression 3874168404Spjd * because sync-to-convergence takes longer if the blocksize 3875168404Spjd * keeps changing. 
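 * The bpobj is allocated with a fixed 16K block size, recorded in the
 * MOS directory under DMU_POOL_SYNC_BPOBJ, and opened as
 * spa_deferred_bpobj for the sync path to append deferred frees to.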
3876168404Spjd */ 3877219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3878219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj, 3879168404Spjd ZIO_COMPRESS_OFF, tx); 3880168404Spjd if (zap_add(spa->spa_meta_objset, 3881219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3882219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) { 3883219089Spjd cmn_err(CE_PANIC, "failed to add bpobj"); 3884168404Spjd } 3885219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3886219089Spjd spa->spa_meta_objset, obj)); 3887168404Spjd 3888168404Spjd /* 3889168404Spjd * Create the pool's history object. 3890168404Spjd */ 3891185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 3892185029Spjd spa_history_create_obj(spa, tx); 3893168404Spjd 3894185029Spjd /* 3895289422Smav * Generate some random noise for salted checksums to operate on. 3896289422Smav */ 3897289422Smav (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 3898289422Smav sizeof (spa->spa_cksum_salt.zcs_bytes)); 3899289422Smav 3900289422Smav /* 3901185029Spjd * Set pool properties. 3902185029Spjd */ 3903185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3904185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3905185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3906219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3907219089Spjd 3908209962Smm if (props != NULL) { 3909209962Smm spa_configfile_set(spa, props, B_FALSE); 3910248571Smm spa_sync_props(props, tx); 3911209962Smm } 3912185029Spjd 3913168404Spjd dmu_tx_commit(tx); 3914168404Spjd 3915168404Spjd spa->spa_sync_on = B_TRUE; 3916168404Spjd txg_sync_start(spa->spa_dsl_pool); 3917168404Spjd 3918168404Spjd /* 3919168404Spjd * We explicitly wait for the first transaction to complete so that our 3920168404Spjd * bean counters are appropriately updated. 3921168404Spjd */ 3922168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 3923168404Spjd 3924185029Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 3925331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 3926168404Spjd 3927248571Smm spa_history_log_version(spa, "create"); 3928185029Spjd 3929286575Smav /* 3930286575Smav * Don't count references from objsets that are already closed 3931286575Smav * and are making their way through the eviction process. 3932286575Smav */ 3933286575Smav spa_evicting_os_wait(spa); 3934208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 3935307277Smav spa->spa_load_state = SPA_LOAD_NONE; 3936208442Smm 3937168404Spjd mutex_exit(&spa_namespace_lock); 3938168404Spjd 3939168404Spjd return (0); 3940168404Spjd} 3941168404Spjd 3942241286Savg#ifdef _KERNEL 3943277300Ssmh#ifdef illumos 3944185029Spjd/* 3945219089Spjd * Get the root pool information from the root disk, then import the root pool 3946219089Spjd * during the system boot up time. 3947185029Spjd */ 3948219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3949219089Spjd 3950219089Spjdstatic nvlist_t * 3951219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3952185029Spjd{ 3953219089Spjd nvlist_t *config; 3954185029Spjd nvlist_t *nvtop, *nvroot; 3955185029Spjd uint64_t pgid; 3956185029Spjd 3957219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3958219089Spjd return (NULL); 3959219089Spjd 3960168404Spjd /* 3961185029Spjd * Add this top-level vdev to the child array. 
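 * The label's vdev tree becomes the single child of a synthesized
 * root vdev whose guid is the pool guid read from that same label.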
3962168404Spjd */ 3963219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3964219089Spjd &nvtop) == 0); 3965219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3966219089Spjd &pgid) == 0); 3967219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3968168404Spjd 3969185029Spjd /* 3970185029Spjd * Put this pool's top-level vdevs into a root vdev. 3971185029Spjd */ 3972185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3973219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3974219089Spjd VDEV_TYPE_ROOT) == 0); 3975185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3976185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3977185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3978185029Spjd &nvtop, 1) == 0); 3979168404Spjd 3980168404Spjd /* 3981185029Spjd * Replace the existing vdev_tree with the new root vdev in 3982185029Spjd * this pool's configuration (remove the old, add the new). 3983168404Spjd */ 3984185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3985185029Spjd nvlist_free(nvroot); 3986219089Spjd return (config); 3987185029Spjd} 3988168404Spjd 3989185029Spjd/* 3990219089Spjd * Walk the vdev tree and see if we can find a device with "better" 3991219089Spjd * configuration. A configuration is "better" if the label on that 3992219089Spjd * device has a more recent txg. 3993185029Spjd */ 3994219089Spjdstatic void 3995219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3996185029Spjd{ 3997219089Spjd for (int c = 0; c < vd->vdev_children; c++) 3998219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3999185029Spjd 4000219089Spjd if (vd->vdev_ops->vdev_op_leaf) { 4001219089Spjd nvlist_t *label; 4002219089Spjd uint64_t label_txg; 4003185029Spjd 4004219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 4005219089Spjd &label) != 0) 4006219089Spjd return; 4007185029Spjd 4008219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 4009219089Spjd &label_txg) == 0); 4010168404Spjd 4011219089Spjd /* 4012219089Spjd * Do we have a better boot device? 4013219089Spjd */ 4014219089Spjd if (label_txg > *txg) { 4015219089Spjd *txg = label_txg; 4016219089Spjd *avd = vd; 4017185029Spjd } 4018219089Spjd nvlist_free(label); 4019185029Spjd } 4020185029Spjd} 4021185029Spjd 4022185029Spjd/* 4023185029Spjd * Import a root pool. 4024185029Spjd * 4025185029Spjd * For x86. devpath_list will consist of devid and/or physpath name of 4026185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 4027185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 4028185029Spjd * 4029185029Spjd * For Sparc, devpath_list consists the physpath name of the booting device 4030185029Spjd * no matter the rootpool is a single device pool or a mirrored pool. 4031185029Spjd * e.g. 4032185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 4033185029Spjd */ 4034185029Spjdint 4035185029Spjdspa_import_rootpool(char *devpath, char *devid) 4036185029Spjd{ 4037219089Spjd spa_t *spa; 4038219089Spjd vdev_t *rvd, *bvd, *avd = NULL; 4039219089Spjd nvlist_t *config, *nvtop; 4040219089Spjd uint64_t guid, txg; 4041185029Spjd char *pname; 4042185029Spjd int error; 4043185029Spjd 4044185029Spjd /* 4045219089Spjd * Read the label from the boot device and generate a configuration. 
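 * On OBP kernels there is one retry: if the first read fails and the
 * device path looks like an iSCSI boot disk ("/iscsi/ssd"), the path
 * is fixed up via get_iscsi_bootpath_phy() and read again before
 * giving up with EIO.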
4046185029Spjd */ 4047219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 4048219089Spjd#if defined(_OBP) && defined(_KERNEL) 4049219089Spjd if (config == NULL) { 4050219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) { 4051219089Spjd /* iscsi boot */ 4052219089Spjd get_iscsi_bootpath_phy(devpath); 4053219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 4054219089Spjd } 4055219089Spjd } 4056219089Spjd#endif 4057219089Spjd if (config == NULL) { 4058236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 4059219089Spjd devpath); 4060249195Smm return (SET_ERROR(EIO)); 4061219089Spjd } 4062185029Spjd 4063219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4064219089Spjd &pname) == 0); 4065219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 4066185029Spjd 4067209962Smm mutex_enter(&spa_namespace_lock); 4068209962Smm if ((spa = spa_lookup(pname)) != NULL) { 4069209962Smm /* 4070209962Smm * Remove the existing root pool from the namespace so that we 4071209962Smm * can replace it with the correct config we just read in. 4072209962Smm */ 4073209962Smm spa_remove(spa); 4074209962Smm } 4075185029Spjd 4076219089Spjd spa = spa_add(pname, config, NULL); 4077209962Smm spa->spa_is_root = B_TRUE; 4078219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4079209962Smm 4080219089Spjd /* 4081219089Spjd * Build up a vdev tree based on the boot device's label config. 4082219089Spjd */ 4083219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4084219089Spjd &nvtop) == 0); 4085219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4086219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4087219089Spjd VDEV_ALLOC_ROOTPOOL); 4088219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4089219089Spjd if (error) { 4090209962Smm mutex_exit(&spa_namespace_lock); 4091219089Spjd nvlist_free(config); 4092219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4093219089Spjd pname); 4094219089Spjd return (error); 4095209962Smm } 4096209962Smm 4097219089Spjd /* 4098219089Spjd * Get the boot vdev. 4099219089Spjd */ 4100219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 4101219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 4102219089Spjd (u_longlong_t)guid); 4103249195Smm error = SET_ERROR(ENOENT); 4104219089Spjd goto out; 4105219089Spjd } 4106209962Smm 4107219089Spjd /* 4108219089Spjd * Determine if there is a better boot device. 4109219089Spjd */ 4110219089Spjd avd = bvd; 4111219089Spjd spa_alt_rootvdev(rvd, &avd, &txg); 4112219089Spjd if (avd != bvd) { 4113219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 4114219089Spjd "try booting from '%s'", avd->vdev_path); 4115249195Smm error = SET_ERROR(EINVAL); 4116219089Spjd goto out; 4117219089Spjd } 4118209962Smm 4119219089Spjd /* 4120219089Spjd * If the boot device is part of a spare vdev then ensure that 4121219089Spjd * we're booting off the active spare. 4122219089Spjd */ 4123219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 4124219089Spjd !bvd->vdev_isspare) { 4125219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 4126219089Spjd "try booting from '%s'", 4127219089Spjd bvd->vdev_parent-> 4128219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 4129249195Smm error = SET_ERROR(EINVAL); 4130219089Spjd goto out; 4131219089Spjd } 4132209962Smm 4133219089Spjd error = 0; 4134219089Spjdout: 4135219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4136219089Spjd vdev_free(rvd); 4137219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4138209962Smm mutex_exit(&spa_namespace_lock); 4139209962Smm 4140219089Spjd nvlist_free(config); 4141219089Spjd return (error); 4142185029Spjd} 4143185029Spjd 4144277300Ssmh#else /* !illumos */ 4145241286Savg 4146243502Savgextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 4147243502Savg uint64_t *count); 4148241286Savg 4149241286Savgstatic nvlist_t * 4150241286Savgspa_generate_rootconf(const char *name) 4151241286Savg{ 4152243502Savg nvlist_t **configs, **tops; 4153241286Savg nvlist_t *config; 4154243502Savg nvlist_t *best_cfg, *nvtop, *nvroot; 4155243502Savg uint64_t *holes; 4156243502Savg uint64_t best_txg; 4157243213Savg uint64_t nchildren; 4158241286Savg uint64_t pgid; 4159243502Savg uint64_t count; 4160243502Savg uint64_t i; 4161243502Savg uint_t nholes; 4162241286Savg 4163243502Savg if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 4164241286Savg return (NULL); 4165241286Savg 4166243502Savg ASSERT3U(count, !=, 0); 4167243502Savg best_txg = 0; 4168243502Savg for (i = 0; i < count; i++) { 4169243502Savg uint64_t txg; 4170243502Savg 4171243502Savg VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 4172243502Savg &txg) == 0); 4173243502Savg if (txg > best_txg) { 4174243502Savg best_txg = txg; 4175243502Savg best_cfg = configs[i]; 4176243502Savg } 4177243502Savg } 4178243502Savg 4179245945Savg nchildren = 1; 4180245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 4181243502Savg holes = NULL; 4182243502Savg nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 4183243502Savg &holes, &nholes); 4184243502Savg 4185244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 4186243502Savg for (i = 0; i < nchildren; i++) { 4187243502Savg if (i >= count) 4188243502Savg break; 4189243502Savg if (configs[i] == NULL) 4190243502Savg continue; 4191243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 4192243502Savg &nvtop) == 0); 4193243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP); 4194243213Savg } 4195243502Savg for (i = 0; holes != NULL && i < nholes; i++) { 4196243502Savg if (i >= nchildren) 4197243502Savg continue; 4198243502Savg if (tops[holes[i]] != NULL) 4199243502Savg continue; 4200243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 4201243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 4202243502Savg VDEV_TYPE_HOLE) == 0); 4203243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 4204243502Savg holes[i]) == 0); 4205243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 4206243502Savg 0) == 0); 4207243502Savg } 4208243502Savg for (i = 0; i < nchildren; i++) { 4209243502Savg if (tops[i] != NULL) 4210243502Savg continue; 4211243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 4212243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 4213243502Savg VDEV_TYPE_MISSING) == 0); 4214243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 4215243502Savg i) == 0); 4216243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 4217243502Savg 0) == 0); 
4218243502Savg } 4219243213Savg 4220243213Savg /* 4221243502Savg * Create pool config based on the best vdev config. 4222241286Savg */ 4223243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 4224241286Savg 4225241286Savg /* 4226241286Savg * Put this pool's top-level vdevs into a root vdev. 4227241286Savg */ 4228243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4229243502Savg &pgid) == 0); 4230241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4231241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 4232241286Savg VDEV_TYPE_ROOT) == 0); 4233241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 4234241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 4235241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 4236243502Savg tops, nchildren) == 0); 4237241286Savg 4238241286Savg /* 4239241286Savg * Replace the existing vdev_tree with the new root vdev in 4240241286Savg * this pool's configuration (remove the old, add the new). 4241241286Savg */ 4242241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 4243243502Savg 4244243502Savg /* 4245243502Savg * Drop vdev config elements that should not be present at pool level. 4246243502Savg */ 4247243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 4248243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 4249243502Savg 4250243502Savg for (i = 0; i < count; i++) 4251243502Savg nvlist_free(configs[i]); 4252243502Savg kmem_free(configs, count * sizeof(void *)); 4253243502Savg for (i = 0; i < nchildren; i++) 4254243502Savg nvlist_free(tops[i]); 4255243502Savg kmem_free(tops, nchildren * sizeof(void *)); 4256241286Savg nvlist_free(nvroot); 4257241286Savg return (config); 4258241286Savg} 4259241286Savg 4260241286Savgint 4261241286Savgspa_import_rootpool(const char *name) 4262241286Savg{ 4263241286Savg spa_t *spa; 4264241286Savg vdev_t *rvd, *bvd, *avd = NULL; 4265241286Savg nvlist_t *config, *nvtop; 4266241286Savg uint64_t txg; 4267241286Savg char *pname; 4268241286Savg int error; 4269241286Savg 4270241286Savg /* 4271241286Savg * Read the label from the boot device and generate a configuration. 4272241286Savg */ 4273241286Savg config = spa_generate_rootconf(name); 4274243213Savg 4275243213Savg mutex_enter(&spa_namespace_lock); 4276243213Savg if (config != NULL) { 4277243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4278243213Savg &pname) == 0 && strcmp(name, pname) == 0); 4279243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 4280243213Savg == 0); 4281243213Savg 4282243213Savg if ((spa = spa_lookup(pname)) != NULL) { 4283243213Savg /* 4284323746Savg * The pool could already be imported, 4285323746Savg * e.g., after reboot -r. 4286323746Savg */ 4287323746Savg if (spa->spa_state == POOL_STATE_ACTIVE) { 4288323746Savg mutex_exit(&spa_namespace_lock); 4289323746Savg nvlist_free(config); 4290323746Savg return (0); 4291323746Savg } 4292323746Savg 4293323746Savg /* 4294243213Savg * Remove the existing root pool from the namespace so 4295243213Savg * that we can replace it with the correct config 4296243213Savg * we just read in. 4297243213Savg */ 4298243213Savg spa_remove(spa); 4299243213Savg } 4300243213Savg spa = spa_add(pname, config, NULL); 4301243501Savg 4302243501Savg /* 4303243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 4304243501Savg * via spa_version(). 
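 * If the label config carries no ZPOOL_CONFIG_VERSION, fall back to
 * SPA_VERSION_INITIAL rather than leaving the field uninitialized.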
4305243501Savg */ 4306243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4307243501Savg &spa->spa_ubsync.ub_version) != 0) 4308243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4309243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 4310287100Savg mutex_exit(&spa_namespace_lock); 4311287100Savg nvlist_free(config); 4312241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 4313241286Savg name); 4314241286Savg return (EIO); 4315243213Savg } else { 4316243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 4317241286Savg } 4318241286Savg spa->spa_is_root = B_TRUE; 4319241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4320241286Savg 4321241286Savg /* 4322241286Savg * Build up a vdev tree based on the boot device's label config. 4323241286Savg */ 4324241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4325241286Savg &nvtop) == 0); 4326241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4327241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4328241286Savg VDEV_ALLOC_ROOTPOOL); 4329241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4330241286Savg if (error) { 4331241286Savg mutex_exit(&spa_namespace_lock); 4332241286Savg nvlist_free(config); 4333241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4334241286Savg pname); 4335241286Savg return (error); 4336241286Savg } 4337241286Savg 4338241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4339241286Savg vdev_free(rvd); 4340241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4341241286Savg mutex_exit(&spa_namespace_lock); 4342241286Savg 4343243213Savg nvlist_free(config); 4344243213Savg return (0); 4345241286Savg} 4346241286Savg 4347277300Ssmh#endif /* illumos */ 4348277300Ssmh#endif /* _KERNEL */ 4349219089Spjd 4350209962Smm/* 4351209962Smm * Import a non-root pool into the system. 4352209962Smm */ 4353185029Spjdint 4354219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4355185029Spjd{ 4356209962Smm spa_t *spa; 4357209962Smm char *altroot = NULL; 4358219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 4359219089Spjd zpool_rewind_policy_t policy; 4360219089Spjd uint64_t mode = spa_mode_global; 4361219089Spjd uint64_t readonly = B_FALSE; 4362209962Smm int error; 4363209962Smm nvlist_t *nvroot; 4364209962Smm nvlist_t **spares, **l2cache; 4365209962Smm uint_t nspares, nl2cache; 4366209962Smm 4367209962Smm /* 4368209962Smm * If a pool with this name exists, return failure. 4369209962Smm */ 4370209962Smm mutex_enter(&spa_namespace_lock); 4371219089Spjd if (spa_lookup(pool) != NULL) { 4372209962Smm mutex_exit(&spa_namespace_lock); 4373249195Smm return (SET_ERROR(EEXIST)); 4374209962Smm } 4375209962Smm 4376209962Smm /* 4377209962Smm * Create and initialize the spa structure. 4378209962Smm */ 4379209962Smm (void) nvlist_lookup_string(props, 4380209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4381219089Spjd (void) nvlist_lookup_uint64(props, 4382219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4383219089Spjd if (readonly) 4384219089Spjd mode = FREAD; 4385219089Spjd spa = spa_add(pool, config, altroot); 4386219089Spjd spa->spa_import_flags = flags; 4387209962Smm 4388209962Smm /* 4389219089Spjd * Verbatim import - Take a pool and insert it into the namespace 4390219089Spjd * as if it had been loaded at boot. 
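 * No spa_load() happens on this path: the supplied config is trusted
 * as-is, the cache file is synced, a pool-import event is posted, and
 * we return before the pool is ever activated.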
4391219089Spjd */ 4392219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4393219089Spjd if (props != NULL) 4394219089Spjd spa_configfile_set(spa, props, B_FALSE); 4395219089Spjd 4396219089Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 4397331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 4398219089Spjd 4399219089Spjd mutex_exit(&spa_namespace_lock); 4400219089Spjd return (0); 4401219089Spjd } 4402219089Spjd 4403219089Spjd spa_activate(spa, mode); 4404219089Spjd 4405219089Spjd /* 4406209962Smm * Don't start async tasks until we know everything is healthy. 4407209962Smm */ 4408209962Smm spa_async_suspend(spa); 4409209962Smm 4410219089Spjd zpool_get_rewind_policy(config, &policy); 4411219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 4412219089Spjd state = SPA_LOAD_RECOVER; 4413219089Spjd 4414209962Smm /* 4415209962Smm * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4416209962Smm * because the user-supplied config is actually the one to trust when 4417209962Smm * doing an import. 4418209962Smm */ 4419219089Spjd if (state != SPA_LOAD_RECOVER) 4420219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4421209962Smm 4422219089Spjd error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4423219089Spjd policy.zrp_request); 4424219089Spjd 4425219089Spjd /* 4426219089Spjd * Propagate anything learned while loading the pool and pass it 4427219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 4428219089Spjd */ 4429219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4430219089Spjd spa->spa_load_info) == 0); 4431219089Spjd 4432209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4433209962Smm /* 4434209962Smm * Toss any existing sparelist, as it doesn't have any validity 4435209962Smm * anymore, and conflicts with spa_has_spare(). 4436209962Smm */ 4437209962Smm if (spa->spa_spares.sav_config) { 4438209962Smm nvlist_free(spa->spa_spares.sav_config); 4439209962Smm spa->spa_spares.sav_config = NULL; 4440209962Smm spa_load_spares(spa); 4441209962Smm } 4442209962Smm if (spa->spa_l2cache.sav_config) { 4443209962Smm nvlist_free(spa->spa_l2cache.sav_config); 4444209962Smm spa->spa_l2cache.sav_config = NULL; 4445209962Smm spa_load_l2cache(spa); 4446209962Smm } 4447209962Smm 4448209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4449209962Smm &nvroot) == 0); 4450209962Smm if (error == 0) 4451209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4452209962Smm VDEV_ALLOC_SPARE); 4453209962Smm if (error == 0) 4454209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4455209962Smm VDEV_ALLOC_L2CACHE); 4456209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4457209962Smm 4458209962Smm if (props != NULL) 4459209962Smm spa_configfile_set(spa, props, B_FALSE); 4460209962Smm 4461209962Smm if (error != 0 || (props && spa_writeable(spa) && 4462209962Smm (error = spa_prop_set(spa, props)))) { 4463209962Smm spa_unload(spa); 4464209962Smm spa_deactivate(spa); 4465209962Smm spa_remove(spa); 4466209962Smm mutex_exit(&spa_namespace_lock); 4467209962Smm return (error); 4468209962Smm } 4469209962Smm 4470209962Smm spa_async_resume(spa); 4471209962Smm 4472209962Smm /* 4473209962Smm * Override any spares and level 2 cache devices as specified by 4474209962Smm * the user, as these may have correct device names/devids, etc. 
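	 *
	 * (Illustrative sketch, not part of the original source: the vdev tree
	 * supplied by the caller is expected to carry these devices as nvlist
	 * arrays, roughly
	 *
	 *	nvroot
	 *	    ZPOOL_CONFIG_SPARES  -> array of spare vdev configs
	 *	    ZPOOL_CONFIG_L2CACHE -> array of cache vdev configs
	 *
	 * and each array that is present simply replaces the corresponding
	 * sav_config before spa_load_spares()/spa_load_l2cache() re-parse it.)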
4475209962Smm */ 4476209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4477209962Smm &spares, &nspares) == 0) { 4478209962Smm if (spa->spa_spares.sav_config) 4479209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4480209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4481209962Smm else 4482209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4483209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4484209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4485209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4486209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4487209962Smm spa_load_spares(spa); 4488209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4489209962Smm spa->spa_spares.sav_sync = B_TRUE; 4490209962Smm } 4491209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4492209962Smm &l2cache, &nl2cache) == 0) { 4493209962Smm if (spa->spa_l2cache.sav_config) 4494209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4495209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4496209962Smm else 4497209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4498209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4499209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4500209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4501209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4502209962Smm spa_load_l2cache(spa); 4503209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4504209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 4505209962Smm } 4506209962Smm 4507219089Spjd /* 4508219089Spjd * Check for any removed devices. 4509219089Spjd */ 4510219089Spjd if (spa->spa_autoreplace) { 4511219089Spjd spa_aux_check_removed(&spa->spa_spares); 4512219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 4513219089Spjd } 4514219089Spjd 4515209962Smm if (spa_writeable(spa)) { 4516209962Smm /* 4517209962Smm * Update the config cache to include the newly-imported pool. 4518209962Smm */ 4519209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4520209962Smm } 4521209962Smm 4522219089Spjd /* 4523219089Spjd * It's possible that the pool was expanded while it was exported. 4524219089Spjd * We kick off an async task to handle this for us. 4525219089Spjd */ 4526219089Spjd spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4527219089Spjd 4528248571Smm spa_history_log_version(spa, "import"); 4529209962Smm 4530331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 4531287745Sdelphij 4532287745Sdelphij mutex_exit(&spa_namespace_lock); 4533287745Sdelphij 4534219089Spjd#ifdef __FreeBSD__ 4535219089Spjd#ifdef _KERNEL 4536219089Spjd zvol_create_minors(pool); 4537219089Spjd#endif 4538219089Spjd#endif 4539209962Smm return (0); 4540185029Spjd} 4541185029Spjd 4542168404Spjdnvlist_t * 4543168404Spjdspa_tryimport(nvlist_t *tryconfig) 4544168404Spjd{ 4545168404Spjd nvlist_t *config = NULL; 4546168404Spjd char *poolname; 4547168404Spjd spa_t *spa; 4548168404Spjd uint64_t state; 4549208443Smm int error; 4550168404Spjd 4551168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4552168404Spjd return (NULL); 4553168404Spjd 4554168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4555168404Spjd return (NULL); 4556168404Spjd 4557168404Spjd /* 4558168404Spjd * Create and initialize the spa structure. 
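	 *
	 * (Illustrative note, not part of the original source: the temporary
	 * pool is added under the reserved TRYIMPORT_NAME and activated
	 * read-only (FREAD).  Whatever spa_load() reports, it is unloaded,
	 * deactivated and removed again before this function returns, so a
	 * try-import never leaves pool state behind -- only the generated
	 * config nvlist is handed back to the caller.)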
4559168404Spjd */ 4560168404Spjd mutex_enter(&spa_namespace_lock); 4561219089Spjd spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4562209962Smm spa_activate(spa, FREAD); 4563168404Spjd 4564168404Spjd /* 4565168404Spjd * Pass off the heavy lifting to spa_load(). 4566168404Spjd * Pass TRUE for mosconfig because the user-supplied config 4567168404Spjd * is actually the one to trust when doing an import. 4568168404Spjd */ 4569219089Spjd error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4570168404Spjd 4571168404Spjd /* 4572168404Spjd * If 'tryconfig' was at least parsable, return the current config. 4573168404Spjd */ 4574168404Spjd if (spa->spa_root_vdev != NULL) { 4575168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4576168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4577168404Spjd poolname) == 0); 4578168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4579168404Spjd state) == 0); 4580168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4581168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 4582236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4583236884Smm spa->spa_load_info) == 0); 4584168404Spjd 4585168404Spjd /* 4586185029Spjd * If the bootfs property exists on this pool then we 4587185029Spjd * copy it out so that external consumers can tell which 4588185029Spjd * pools are bootable. 4589168404Spjd */ 4590208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 4591185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4592185029Spjd 4593185029Spjd /* 4594185029Spjd * We have to play games with the name since the 4595185029Spjd * pool was opened as TRYIMPORT_NAME. 4596185029Spjd */ 4597185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 4598185029Spjd spa->spa_bootfs, tmpname) == 0) { 4599185029Spjd char *cp; 4600185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4601185029Spjd 4602185029Spjd cp = strchr(tmpname, '/'); 4603185029Spjd if (cp == NULL) { 4604185029Spjd (void) strlcpy(dsname, tmpname, 4605185029Spjd MAXPATHLEN); 4606185029Spjd } else { 4607185029Spjd (void) snprintf(dsname, MAXPATHLEN, 4608185029Spjd "%s/%s", poolname, ++cp); 4609185029Spjd } 4610185029Spjd VERIFY(nvlist_add_string(config, 4611185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4612185029Spjd kmem_free(dsname, MAXPATHLEN); 4613185029Spjd } 4614185029Spjd kmem_free(tmpname, MAXPATHLEN); 4615185029Spjd } 4616185029Spjd 4617185029Spjd /* 4618185029Spjd * Add the list of hot spares and level 2 cache devices. 4619185029Spjd */ 4620209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4621168404Spjd spa_add_spares(spa, config); 4622185029Spjd spa_add_l2cache(spa, config); 4623209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 4624168404Spjd } 4625168404Spjd 4626168404Spjd spa_unload(spa); 4627168404Spjd spa_deactivate(spa); 4628168404Spjd spa_remove(spa); 4629168404Spjd mutex_exit(&spa_namespace_lock); 4630168404Spjd 4631168404Spjd return (config); 4632168404Spjd} 4633168404Spjd 4634168404Spjd/* 4635168404Spjd * Pool export/destroy 4636168404Spjd * 4637168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 4638168404Spjd * is no more pending I/O and any references to the pool are gone. Then, we 4639168404Spjd * update the pool state and sync all the labels to disk, removing the 4640207670Smm * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4641207670Smm * we don't sync the labels or remove the configuration cache. 
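 *
 * (Editorial summary, not part of the original source -- the thin wrappers
 * defined further below map onto spa_export_common() as follows:
 *
 *	spa_destroy(pool)  -> new_state = POOL_STATE_DESTROYED
 *	spa_export(pool)   -> new_state = POOL_STATE_EXPORTED
 *	spa_reset(pool)    -> new_state = POOL_STATE_UNINITIALIZED
 *
 * Only spa_export() lets the caller choose 'force' and 'hardforce'; the
 * other two always pass B_FALSE for both.)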
4642168404Spjd */ 4643168404Spjdstatic int 4644185029Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4645207670Smm boolean_t force, boolean_t hardforce) 4646168404Spjd{ 4647168404Spjd spa_t *spa; 4648168404Spjd 4649168404Spjd if (oldconfig) 4650168404Spjd *oldconfig = NULL; 4651168404Spjd 4652209962Smm if (!(spa_mode_global & FWRITE)) 4653249195Smm return (SET_ERROR(EROFS)); 4654168404Spjd 4655168404Spjd mutex_enter(&spa_namespace_lock); 4656168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 4657168404Spjd mutex_exit(&spa_namespace_lock); 4658249195Smm return (SET_ERROR(ENOENT)); 4659168404Spjd } 4660168404Spjd 4661168404Spjd /* 4662168404Spjd * Put a hold on the pool, drop the namespace lock, stop async tasks, 4663168404Spjd * reacquire the namespace lock, and see if we can export. 4664168404Spjd */ 4665168404Spjd spa_open_ref(spa, FTAG); 4666168404Spjd mutex_exit(&spa_namespace_lock); 4667168404Spjd spa_async_suspend(spa); 4668168404Spjd mutex_enter(&spa_namespace_lock); 4669168404Spjd spa_close(spa, FTAG); 4670168404Spjd 4671168404Spjd /* 4672168404Spjd * The pool will be in core if it's openable, 4673168404Spjd * in which case we can modify its state. 4674168404Spjd */ 4675168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4676168404Spjd /* 4677168404Spjd * Objsets may be open only because they're dirty, so we 4678168404Spjd * have to force it to sync before checking spa_refcnt. 4679168404Spjd */ 4680168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 4681286575Smav spa_evicting_os_wait(spa); 4682168404Spjd 4683168404Spjd /* 4684168404Spjd * A pool cannot be exported or destroyed if there are active 4685168404Spjd * references. If we are resetting a pool, allow references by 4686168404Spjd * fault injection handlers. 4687168404Spjd */ 4688168404Spjd if (!spa_refcount_zero(spa) || 4689168404Spjd (spa->spa_inject_ref != 0 && 4690168404Spjd new_state != POOL_STATE_UNINITIALIZED)) { 4691168404Spjd spa_async_resume(spa); 4692168404Spjd mutex_exit(&spa_namespace_lock); 4693249195Smm return (SET_ERROR(EBUSY)); 4694168404Spjd } 4695168404Spjd 4696185029Spjd /* 4697185029Spjd * A pool cannot be exported if it has an active shared spare. 4698185029Spjd * This is to prevent other pools stealing the active spare 4699185029Spjd * from an exported pool. At user's own will, such pool can 4700185029Spjd * be forcedly exported. 4701185029Spjd */ 4702185029Spjd if (!force && new_state == POOL_STATE_EXPORTED && 4703185029Spjd spa_has_active_shared_spare(spa)) { 4704185029Spjd spa_async_resume(spa); 4705185029Spjd mutex_exit(&spa_namespace_lock); 4706249195Smm return (SET_ERROR(EXDEV)); 4707185029Spjd } 4708168404Spjd 4709168404Spjd /* 4710168404Spjd * We want this to be reflected on every label, 4711168404Spjd * so mark them all dirty. spa_unload() will do the 4712168404Spjd * final sync that pushes these changes out. 
4713168404Spjd */ 4714207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4715185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4716168404Spjd spa->spa_state = new_state; 4717219089Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 4718219089Spjd TXG_DEFER_SIZE + 1; 4719168404Spjd vdev_config_dirty(spa->spa_root_vdev); 4720185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4721168404Spjd } 4722168404Spjd } 4723168404Spjd 4724331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 4725185029Spjd 4726168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4727168404Spjd spa_unload(spa); 4728168404Spjd spa_deactivate(spa); 4729168404Spjd } 4730168404Spjd 4731168404Spjd if (oldconfig && spa->spa_config) 4732168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4733168404Spjd 4734168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 4735207670Smm if (!hardforce) 4736207670Smm spa_config_sync(spa, B_TRUE, B_TRUE); 4737168404Spjd spa_remove(spa); 4738168404Spjd } 4739168404Spjd mutex_exit(&spa_namespace_lock); 4740168404Spjd 4741168404Spjd return (0); 4742168404Spjd} 4743168404Spjd 4744168404Spjd/* 4745168404Spjd * Destroy a storage pool. 4746168404Spjd */ 4747168404Spjdint 4748168404Spjdspa_destroy(char *pool) 4749168404Spjd{ 4750207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4751207670Smm B_FALSE, B_FALSE)); 4752168404Spjd} 4753168404Spjd 4754168404Spjd/* 4755168404Spjd * Export a storage pool. 4756168404Spjd */ 4757168404Spjdint 4758207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4759207670Smm boolean_t hardforce) 4760168404Spjd{ 4761207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4762207670Smm force, hardforce)); 4763168404Spjd} 4764168404Spjd 4765168404Spjd/* 4766168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 4767168404Spjd * from the namespace in any way. 4768168404Spjd */ 4769168404Spjdint 4770168404Spjdspa_reset(char *pool) 4771168404Spjd{ 4772185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4773207670Smm B_FALSE, B_FALSE)); 4774168404Spjd} 4775168404Spjd 4776168404Spjd/* 4777168404Spjd * ========================================================================== 4778168404Spjd * Device manipulation 4779168404Spjd * ========================================================================== 4780168404Spjd */ 4781168404Spjd 4782168404Spjd/* 4783185029Spjd * Add a device to a storage pool. 
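 *
 * (Illustrative note, not part of the original source: the caller-supplied
 * 'nvroot' is a root-type vdev nvlist that may carry any mix of
 *
 *	ZPOOL_CONFIG_CHILDREN -> new top-level vdevs
 *	ZPOOL_CONFIG_SPARES   -> new hot spares
 *	ZPOOL_CONFIG_L2CACHE  -> new cache devices
 *
 * At least one of the three must be non-empty, otherwise the call fails
 * with EINVAL once the tree has been parsed.)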
4784168404Spjd */ 4785168404Spjdint 4786168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4787168404Spjd{ 4788219089Spjd uint64_t txg, id; 4789209962Smm int error; 4790168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4791168404Spjd vdev_t *vd, *tvd; 4792185029Spjd nvlist_t **spares, **l2cache; 4793185029Spjd uint_t nspares, nl2cache; 4794168404Spjd 4795219089Spjd ASSERT(spa_writeable(spa)); 4796219089Spjd 4797168404Spjd txg = spa_vdev_enter(spa); 4798168404Spjd 4799168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4800168404Spjd VDEV_ALLOC_ADD)) != 0) 4801168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 4802168404Spjd 4803185029Spjd spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4804168404Spjd 4805185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4806185029Spjd &nspares) != 0) 4807168404Spjd nspares = 0; 4808168404Spjd 4809185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4810185029Spjd &nl2cache) != 0) 4811185029Spjd nl2cache = 0; 4812185029Spjd 4813185029Spjd if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4814168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4815168404Spjd 4816185029Spjd if (vd->vdev_children != 0 && 4817185029Spjd (error = vdev_create(vd, txg, B_FALSE)) != 0) 4818185029Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4819168404Spjd 4820168404Spjd /* 4821185029Spjd * We must validate the spares and l2cache devices after checking the 4822185029Spjd * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4823168404Spjd */ 4824185029Spjd if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4825168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4826168404Spjd 4827168404Spjd /* 4828168404Spjd * Transfer each new top-level vdev from vd to rvd. 4829168404Spjd */ 4830209962Smm for (int c = 0; c < vd->vdev_children; c++) { 4831219089Spjd 4832219089Spjd /* 4833219089Spjd * Set the vdev id to the first hole, if one exists. 4834219089Spjd */ 4835219089Spjd for (id = 0; id < rvd->vdev_children; id++) { 4836219089Spjd if (rvd->vdev_child[id]->vdev_ishole) { 4837219089Spjd vdev_free(rvd->vdev_child[id]); 4838219089Spjd break; 4839219089Spjd } 4840219089Spjd } 4841168404Spjd tvd = vd->vdev_child[c]; 4842168404Spjd vdev_remove_child(vd, tvd); 4843219089Spjd tvd->vdev_id = id; 4844168404Spjd vdev_add_child(rvd, tvd); 4845168404Spjd vdev_config_dirty(tvd); 4846168404Spjd } 4847168404Spjd 4848168404Spjd if (nspares != 0) { 4849185029Spjd spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4850185029Spjd ZPOOL_CONFIG_SPARES); 4851168404Spjd spa_load_spares(spa); 4852185029Spjd spa->spa_spares.sav_sync = B_TRUE; 4853168404Spjd } 4854168404Spjd 4855185029Spjd if (nl2cache != 0) { 4856185029Spjd spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4857185029Spjd ZPOOL_CONFIG_L2CACHE); 4858185029Spjd spa_load_l2cache(spa); 4859185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 4860185029Spjd } 4861185029Spjd 4862168404Spjd /* 4863168404Spjd * We have to be careful when adding new vdevs to an existing pool. 4864168404Spjd * If other threads start allocating from these vdevs before we 4865168404Spjd * sync the config cache, and we lose power, then upon reboot we may 4866168404Spjd * fail to open the pool because there are DVAs that the config cache 4867168404Spjd * can't translate. 
Therefore, we first add the vdevs without 4868168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4869168404Spjd * and then let spa_config_update() initialize the new metaslabs. 4870168404Spjd * 4871168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 4872168404Spjd * if we lose power at any point in this sequence, the remaining 4873168404Spjd * steps will be completed the next time we load the pool. 4874168404Spjd */ 4875168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 4876168404Spjd 4877168404Spjd mutex_enter(&spa_namespace_lock); 4878168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4879331397Smav spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 4880168404Spjd mutex_exit(&spa_namespace_lock); 4881168404Spjd 4882168404Spjd return (0); 4883168404Spjd} 4884168404Spjd 4885168404Spjd/* 4886168404Spjd * Attach a device to a mirror. The arguments are the path to any device 4887168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 4888168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 4889168404Spjd * 4890168404Spjd * If 'replacing' is specified, the new device is intended to replace the 4891168404Spjd * existing device; in this case the two devices are made into their own 4892185029Spjd * mirror using the 'replacing' vdev, which is functionally identical to 4893168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 4894168404Spjd * extra rules: you can't attach to it after it's been created, and upon 4895168404Spjd * completion of resilvering, the first disk (the one being replaced) 4896168404Spjd * is automatically detached. 4897168404Spjd */ 4898168404Spjdint 4899168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4900168404Spjd{ 4901219089Spjd uint64_t txg, dtl_max_txg; 4902168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4903168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4904168404Spjd vdev_ops_t *pvops; 4905185029Spjd char *oldvdpath, *newvdpath; 4906185029Spjd int newvd_isspare; 4907185029Spjd int error; 4908168404Spjd 4909219089Spjd ASSERT(spa_writeable(spa)); 4910219089Spjd 4911168404Spjd txg = spa_vdev_enter(spa); 4912168404Spjd 4913185029Spjd oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4914168404Spjd 4915168404Spjd if (oldvd == NULL) 4916168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4917168404Spjd 4918168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 4919168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4920168404Spjd 4921168404Spjd pvd = oldvd->vdev_parent; 4922168404Spjd 4923168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4924230514Smm VDEV_ALLOC_ATTACH)) != 0) 4925185029Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4926185029Spjd 4927185029Spjd if (newrootvd->vdev_children != 1) 4928168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4929168404Spjd 4930168404Spjd newvd = newrootvd->vdev_child[0]; 4931168404Spjd 4932168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 4933168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4934168404Spjd 4935168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4936168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 4937168404Spjd 4938185029Spjd /* 4939185029Spjd * Spares can't replace logs 4940185029Spjd */ 4941185029Spjd if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4942185029Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 
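	/*
	 * (Editorial note, illustrative only and not part of the original
	 * source: the branch below picks the parent vdev type that ends up
	 * above 'oldvd' --
	 *
	 *	attach  (!replacing): parent must already be a mirror or the
	 *	                      root vdev; pvops = vdev_mirror_ops
	 *	replace (replacing) : pvops = vdev_spare_ops when the new
	 *	                      device is a hot spare, otherwise
	 *	                      vdev_replacing_ops
	 *
	 * subject to the spare and multi-replace checks performed there.)
	 */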
4943185029Spjd 4944168404Spjd if (!replacing) { 4945168404Spjd /* 4946168404Spjd * For attach, the only allowable parent is a mirror or the root 4947168404Spjd * vdev. 4948168404Spjd */ 4949168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 4950168404Spjd pvd->vdev_ops != &vdev_root_ops) 4951168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4952168404Spjd 4953168404Spjd pvops = &vdev_mirror_ops; 4954168404Spjd } else { 4955168404Spjd /* 4956168404Spjd * Active hot spares can only be replaced by inactive hot 4957168404Spjd * spares. 4958168404Spjd */ 4959168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4960219089Spjd oldvd->vdev_isspare && 4961168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 4962168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4963168404Spjd 4964168404Spjd /* 4965168404Spjd * If the source is a hot spare, and the parent isn't already a 4966168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 4967168404Spjd * want to create a replacing vdev. The user is not allowed to 4968168404Spjd * attach to a spared vdev child unless the 'isspare' state is 4969168404Spjd * the same (spare replaces spare, non-spare replaces 4970168404Spjd * non-spare). 4971168404Spjd */ 4972219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && 4973219089Spjd spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4974168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4975219089Spjd } else if (pvd->vdev_ops == &vdev_spare_ops && 4976219089Spjd newvd->vdev_isspare != oldvd->vdev_isspare) { 4977168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4978219089Spjd } 4979219089Spjd 4980219089Spjd if (newvd->vdev_isspare) 4981168404Spjd pvops = &vdev_spare_ops; 4982168404Spjd else 4983168404Spjd pvops = &vdev_replacing_ops; 4984168404Spjd } 4985168404Spjd 4986168404Spjd /* 4987219089Spjd * Make sure the new device is big enough. 4988168404Spjd */ 4989219089Spjd if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4990168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4991168404Spjd 4992168404Spjd /* 4993168404Spjd * The new device cannot have a higher alignment requirement 4994168404Spjd * than the top-level vdev. 4995168404Spjd */ 4996168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4997168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4998168404Spjd 4999168404Spjd /* 5000168404Spjd * If this is an in-place replacement, update oldvd's path and devid 5001168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 5002168404Spjd */ 5003168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 5004168404Spjd spa_strfree(oldvd->vdev_path); 5005168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 5006168404Spjd KM_SLEEP); 5007168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 5008168404Spjd newvd->vdev_path, "old"); 5009168404Spjd if (oldvd->vdev_devid != NULL) { 5010168404Spjd spa_strfree(oldvd->vdev_devid); 5011168404Spjd oldvd->vdev_devid = NULL; 5012168404Spjd } 5013168404Spjd } 5014168404Spjd 5015219089Spjd /* mark the device being resilvered */ 5016254112Sdelphij newvd->vdev_resilver_txg = txg; 5017219089Spjd 5018168404Spjd /* 5019168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 5020168404Spjd * mirror/replacing/spare vdev above oldvd. 
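	 *
	 * (Illustrative example, not part of the original source, using two
	 * hypothetical disks A and B: attaching B to a plain disk A yields
	 * mirror(A, B); replacing A with B yields replacing(A, B), whose
	 * first child is detached automatically once resilvering completes,
	 * as described in the comment above spa_vdev_attach().)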
5021168404Spjd */ 5022168404Spjd if (pvd->vdev_ops != pvops) 5023168404Spjd pvd = vdev_add_parent(oldvd, pvops); 5024168404Spjd 5025168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 5026168404Spjd ASSERT(pvd->vdev_ops == pvops); 5027168404Spjd ASSERT(oldvd->vdev_parent == pvd); 5028168404Spjd 5029168404Spjd /* 5030168404Spjd * Extract the new device from its root and add it to pvd. 5031168404Spjd */ 5032168404Spjd vdev_remove_child(newrootvd, newvd); 5033168404Spjd newvd->vdev_id = pvd->vdev_children; 5034219089Spjd newvd->vdev_crtxg = oldvd->vdev_crtxg; 5035168404Spjd vdev_add_child(pvd, newvd); 5036168404Spjd 5037168404Spjd tvd = newvd->vdev_top; 5038168404Spjd ASSERT(pvd->vdev_top == tvd); 5039168404Spjd ASSERT(tvd->vdev_parent == rvd); 5040168404Spjd 5041168404Spjd vdev_config_dirty(tvd); 5042168404Spjd 5043168404Spjd /* 5044219089Spjd * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 5045219089Spjd * for any dmu_sync-ed blocks. It will propagate upward when 5046219089Spjd * spa_vdev_exit() calls vdev_dtl_reassess(). 5047168404Spjd */ 5048219089Spjd dtl_max_txg = txg + TXG_CONCURRENT_STATES; 5049168404Spjd 5050219089Spjd vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 5051219089Spjd dtl_max_txg - TXG_INITIAL); 5052168404Spjd 5053209962Smm if (newvd->vdev_isspare) { 5054168404Spjd spa_spare_activate(newvd); 5055331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 5056209962Smm } 5057209962Smm 5058185029Spjd oldvdpath = spa_strdup(oldvd->vdev_path); 5059185029Spjd newvdpath = spa_strdup(newvd->vdev_path); 5060185029Spjd newvd_isspare = newvd->vdev_isspare; 5061168404Spjd 5062168404Spjd /* 5063168404Spjd * Mark newvd's DTL dirty in this txg. 5064168404Spjd */ 5065168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 5066168404Spjd 5067219089Spjd /* 5068258717Savg * Schedule the resilver to restart in the future. We do this to 5069258717Savg * ensure that dmu_sync-ed blocks have been stitched into the 5070258717Savg * respective datasets. 5071219089Spjd */ 5072219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 5073168404Spjd 5074287745Sdelphij if (spa->spa_bootfs) 5075331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 5076287745Sdelphij 5077331397Smav spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 5078287745Sdelphij 5079219089Spjd /* 5080219089Spjd * Commit the config 5081219089Spjd */ 5082219089Spjd (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 5083185029Spjd 5084248571Smm spa_history_log_internal(spa, "vdev attach", NULL, 5085219089Spjd "%s vdev=%s %s vdev=%s", 5086219089Spjd replacing && newvd_isspare ? "spare in" : 5087219089Spjd replacing ? "replace" : "attach", newvdpath, 5088219089Spjd replacing ? "for" : "to", oldvdpath); 5089219089Spjd 5090185029Spjd spa_strfree(oldvdpath); 5091185029Spjd spa_strfree(newvdpath); 5092185029Spjd 5093168404Spjd return (0); 5094168404Spjd} 5095168404Spjd 5096168404Spjd/* 5097168404Spjd * Detach a device from a mirror or replacing vdev. 5098251631Sdelphij * 5099168404Spjd * If 'replace_done' is specified, only detach if the parent 5100168404Spjd * is a replacing vdev. 
5101168404Spjd */ 5102168404Spjdint 5103209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 5104168404Spjd{ 5105168404Spjd uint64_t txg; 5106209962Smm int error; 5107168404Spjd vdev_t *rvd = spa->spa_root_vdev; 5108168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 5109168404Spjd boolean_t unspare = B_FALSE; 5110247187Smm uint64_t unspare_guid = 0; 5111219089Spjd char *vdpath; 5112168404Spjd 5113219089Spjd ASSERT(spa_writeable(spa)); 5114219089Spjd 5115168404Spjd txg = spa_vdev_enter(spa); 5116168404Spjd 5117185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5118168404Spjd 5119168404Spjd if (vd == NULL) 5120168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 5121168404Spjd 5122168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 5123168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5124168404Spjd 5125168404Spjd pvd = vd->vdev_parent; 5126168404Spjd 5127168404Spjd /* 5128209962Smm * If the parent/child relationship is not as expected, don't do it. 5129209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 5130209962Smm * vdev that's replacing B with C. The user's intent in replacing 5131209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 5132209962Smm * the replace by detaching C, the expected behavior is to end up 5133209962Smm * M(A,B). But suppose that right after deciding to detach C, 5134209962Smm * the replacement of B completes. We would have M(A,C), and then 5135209962Smm * ask to detach C, which would leave us with just A -- not what 5136209962Smm * the user wanted. To prevent this, we make sure that the 5137209962Smm * parent/child relationship hasn't changed -- in this example, 5138209962Smm * that C's parent is still the replacing vdev R. 5139209962Smm */ 5140209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 5141209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5142209962Smm 5143209962Smm /* 5144219089Spjd * Only 'replacing' or 'spare' vdevs can be replaced. 5145168404Spjd */ 5146219089Spjd if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 5147219089Spjd pvd->vdev_ops != &vdev_spare_ops) 5148219089Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5149168404Spjd 5150168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 5151185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 5152168404Spjd 5153168404Spjd /* 5154168404Spjd * Only mirror, replacing, and spare vdevs support detach. 5155168404Spjd */ 5156168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 5157168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 5158168404Spjd pvd->vdev_ops != &vdev_spare_ops) 5159168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5160168404Spjd 5161168404Spjd /* 5162209962Smm * If this device has the only valid copy of some data, 5163209962Smm * we cannot safely detach it. 5164168404Spjd */ 5165209962Smm if (vdev_dtl_required(vd)) 5166168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5167168404Spjd 5168209962Smm ASSERT(pvd->vdev_children >= 2); 5169168404Spjd 5170168404Spjd /* 5171185029Spjd * If we are detaching the second disk from a replacing vdev, then 5172185029Spjd * check to see if we changed the original vdev's path to have "/old" 5173185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 
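	 *
	 * (Illustrative example, not part of the original source, with a
	 * hypothetical device name: an in-place replacement renames the old
	 * vdev's path from /dev/da0 to /dev/da0/old so the two can be told
	 * apart.  If the new disk is later detached -- e.g. the replacement
	 * is cancelled -- the loop below finds the sibling still named
	 * /dev/da0/old and renames it back to /dev/da0.)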
5174168404Spjd */ 5175219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 5176219089Spjd vd->vdev_path != NULL) { 5177219089Spjd size_t len = strlen(vd->vdev_path); 5178219089Spjd 5179219089Spjd for (int c = 0; c < pvd->vdev_children; c++) { 5180219089Spjd cvd = pvd->vdev_child[c]; 5181219089Spjd 5182219089Spjd if (cvd == vd || cvd->vdev_path == NULL) 5183219089Spjd continue; 5184219089Spjd 5185219089Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 5186219089Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 5187219089Spjd spa_strfree(cvd->vdev_path); 5188219089Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 5189219089Spjd break; 5190219089Spjd } 5191185029Spjd } 5192185029Spjd } 5193168404Spjd 5194168404Spjd /* 5195168404Spjd * If we are detaching the original disk from a spare, then it implies 5196168404Spjd * that the spare should become a real disk, and be removed from the 5197168404Spjd * active spare list for the pool. 5198168404Spjd */ 5199168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 5200219089Spjd vd->vdev_id == 0 && 5201219089Spjd pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 5202168404Spjd unspare = B_TRUE; 5203168404Spjd 5204168404Spjd /* 5205168404Spjd * Erase the disk labels so the disk can be used for other things. 5206168404Spjd * This must be done after all other error cases are handled, 5207168404Spjd * but before we disembowel vd (so we can still do I/O to it). 5208168404Spjd * But if we can't do it, don't treat the error as fatal -- 5209168404Spjd * it may be that the unwritability of the disk is the reason 5210168404Spjd * it's being detached! 5211168404Spjd */ 5212168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5213168404Spjd 5214168404Spjd /* 5215168404Spjd * Remove vd from its parent and compact the parent's children. 5216168404Spjd */ 5217168404Spjd vdev_remove_child(pvd, vd); 5218168404Spjd vdev_compact_children(pvd); 5219168404Spjd 5220168404Spjd /* 5221168404Spjd * Remember one of the remaining children so we can get tvd below. 5222168404Spjd */ 5223219089Spjd cvd = pvd->vdev_child[pvd->vdev_children - 1]; 5224168404Spjd 5225168404Spjd /* 5226168404Spjd * If we need to remove the remaining child from the list of hot spares, 5227209962Smm * do it now, marking the vdev as no longer a spare in the process. 5228209962Smm * We must do this before vdev_remove_parent(), because that can 5229209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 5230209962Smm * reason, we must remove the spare now, in the same txg as the detach; 5231209962Smm * otherwise someone could attach a new sibling, change the GUID, and 5232209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 5233168404Spjd */ 5234168404Spjd if (unspare) { 5235168404Spjd ASSERT(cvd->vdev_isspare); 5236168404Spjd spa_spare_remove(cvd); 5237168404Spjd unspare_guid = cvd->vdev_guid; 5238209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 5239219089Spjd cvd->vdev_unspare = B_TRUE; 5240168404Spjd } 5241168404Spjd 5242168404Spjd /* 5243168404Spjd * If the parent mirror/replacing vdev only has one child, 5244168404Spjd * the parent is no longer needed. Remove it from the tree. 
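	 *
	 * (Illustrative note, not part of the original source: detaching one
	 * side of a two-way mirror leaves a single-child mirror, which
	 * vdev_remove_parent() collapses so the surviving disk takes the
	 * mirror's place in the tree -- e.g. mirror(A, B) minus B becomes
	 * just A.)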
5245168404Spjd */ 5246219089Spjd if (pvd->vdev_children == 1) { 5247219089Spjd if (pvd->vdev_ops == &vdev_spare_ops) 5248219089Spjd cvd->vdev_unspare = B_FALSE; 5249168404Spjd vdev_remove_parent(cvd); 5250219089Spjd } 5251168404Spjd 5252219089Spjd 5253168404Spjd /* 5254168404Spjd * We don't set tvd until now because the parent we just removed 5255168404Spjd * may have been the previous top-level vdev. 5256168404Spjd */ 5257168404Spjd tvd = cvd->vdev_top; 5258168404Spjd ASSERT(tvd->vdev_parent == rvd); 5259168404Spjd 5260168404Spjd /* 5261168404Spjd * Reevaluate the parent vdev state. 5262168404Spjd */ 5263185029Spjd vdev_propagate_state(cvd); 5264168404Spjd 5265168404Spjd /* 5266219089Spjd * If the 'autoexpand' property is set on the pool then automatically 5267219089Spjd * try to expand the size of the pool. For example if the device we 5268219089Spjd * just detached was smaller than the others, it may be possible to 5269219089Spjd * add metaslabs (i.e. grow the pool). We need to reopen the vdev 5270219089Spjd * first so that we can obtain the updated sizes of the leaf vdevs. 5271168404Spjd */ 5272219089Spjd if (spa->spa_autoexpand) { 5273219089Spjd vdev_reopen(tvd); 5274219089Spjd vdev_expand(tvd, txg); 5275219089Spjd } 5276168404Spjd 5277168404Spjd vdev_config_dirty(tvd); 5278168404Spjd 5279168404Spjd /* 5280168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 5281168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 5282168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 5283168404Spjd * prevent vd from being accessed after it's freed. 5284168404Spjd */ 5285219089Spjd vdpath = spa_strdup(vd->vdev_path); 5286209962Smm for (int t = 0; t < TXG_SIZE; t++) 5287168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5288168404Spjd vd->vdev_detached = B_TRUE; 5289168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 5290168404Spjd 5291331397Smav spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 5292185029Spjd 5293219089Spjd /* hang on to the spa before we release the lock */ 5294219089Spjd spa_open_ref(spa, FTAG); 5295219089Spjd 5296168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 5297168404Spjd 5298248571Smm spa_history_log_internal(spa, "detach", NULL, 5299219089Spjd "vdev=%s", vdpath); 5300219089Spjd spa_strfree(vdpath); 5301219089Spjd 5302168404Spjd /* 5303168404Spjd * If this was the removal of the original device in a hot spare vdev, 5304168404Spjd * then we want to go through and remove the device from the hot spare 5305168404Spjd * list of every other pool. 
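	 *
	 * (Illustrative note, not part of the original source: hot spares may
	 * be shared between pools, so the loop below walks every other active
	 * pool with spa_next(), taking a reference and dropping the namespace
	 * lock around each spa_vdev_remove() call for the same spare guid.)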
5306168404Spjd */ 5307168404Spjd if (unspare) { 5308219089Spjd spa_t *altspa = NULL; 5309219089Spjd 5310168404Spjd mutex_enter(&spa_namespace_lock); 5311219089Spjd while ((altspa = spa_next(altspa)) != NULL) { 5312219089Spjd if (altspa->spa_state != POOL_STATE_ACTIVE || 5313219089Spjd altspa == spa) 5314168404Spjd continue; 5315219089Spjd 5316219089Spjd spa_open_ref(altspa, FTAG); 5317185029Spjd mutex_exit(&spa_namespace_lock); 5318219089Spjd (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5319185029Spjd mutex_enter(&spa_namespace_lock); 5320219089Spjd spa_close(altspa, FTAG); 5321168404Spjd } 5322168404Spjd mutex_exit(&spa_namespace_lock); 5323219089Spjd 5324219089Spjd /* search the rest of the vdevs for spares to remove */ 5325219089Spjd spa_vdev_resilver_done(spa); 5326168404Spjd } 5327168404Spjd 5328219089Spjd /* all done with the spa; OK to release */ 5329219089Spjd mutex_enter(&spa_namespace_lock); 5330219089Spjd spa_close(spa, FTAG); 5331219089Spjd mutex_exit(&spa_namespace_lock); 5332219089Spjd 5333168404Spjd return (error); 5334168404Spjd} 5335168404Spjd 5336219089Spjd/* 5337219089Spjd * Split a set of devices from their mirrors, and create a new pool from them. 5338219089Spjd */ 5339219089Spjdint 5340219089Spjdspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5341219089Spjd nvlist_t *props, boolean_t exp) 5342219089Spjd{ 5343219089Spjd int error = 0; 5344219089Spjd uint64_t txg, *glist; 5345219089Spjd spa_t *newspa; 5346219089Spjd uint_t c, children, lastlog; 5347219089Spjd nvlist_t **child, *nvl, *tmp; 5348219089Spjd dmu_tx_t *tx; 5349219089Spjd char *altroot = NULL; 5350219089Spjd vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5351219089Spjd boolean_t activate_slog; 5352219089Spjd 5353219089Spjd ASSERT(spa_writeable(spa)); 5354219089Spjd 5355219089Spjd txg = spa_vdev_enter(spa); 5356219089Spjd 5357219089Spjd /* clear the log and flush everything up to now */ 5358219089Spjd activate_slog = spa_passivate_log(spa); 5359219089Spjd (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5360219089Spjd error = spa_offline_log(spa); 5361219089Spjd txg = spa_vdev_config_enter(spa); 5362219089Spjd 5363219089Spjd if (activate_slog) 5364219089Spjd spa_activate_log(spa); 5365219089Spjd 5366219089Spjd if (error != 0) 5367219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5368219089Spjd 5369219089Spjd /* check new spa name before going any further */ 5370219089Spjd if (spa_lookup(newname) != NULL) 5371219089Spjd return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5372219089Spjd 5373219089Spjd /* 5374219089Spjd * scan through all the children to ensure they're all mirrors 5375219089Spjd */ 5376219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5377219089Spjd nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5378219089Spjd &children) != 0) 5379219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5380219089Spjd 5381219089Spjd /* first, check to ensure we've got the right child count */ 5382219089Spjd rvd = spa->spa_root_vdev; 5383219089Spjd lastlog = 0; 5384219089Spjd for (c = 0; c < rvd->vdev_children; c++) { 5385219089Spjd vdev_t *vd = rvd->vdev_child[c]; 5386219089Spjd 5387219089Spjd /* don't count the holes & logs as children */ 5388219089Spjd if (vd->vdev_islog || vd->vdev_ishole) { 5389219089Spjd if (lastlog == 0) 5390219089Spjd lastlog = c; 5391219089Spjd continue; 5392219089Spjd } 5393219089Spjd 5394219089Spjd lastlog = 0; 5395219089Spjd } 5396219089Spjd if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 5397219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5398219089Spjd 5399219089Spjd /* next, ensure no spare or cache devices are part of the split */ 5400219089Spjd if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5401219089Spjd nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5402219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5403219089Spjd 5404219089Spjd vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5405219089Spjd glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5406219089Spjd 5407219089Spjd /* then, loop over each vdev and validate it */ 5408219089Spjd for (c = 0; c < children; c++) { 5409219089Spjd uint64_t is_hole = 0; 5410219089Spjd 5411219089Spjd (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5412219089Spjd &is_hole); 5413219089Spjd 5414219089Spjd if (is_hole != 0) { 5415219089Spjd if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5416219089Spjd spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5417219089Spjd continue; 5418219089Spjd } else { 5419249195Smm error = SET_ERROR(EINVAL); 5420219089Spjd break; 5421219089Spjd } 5422219089Spjd } 5423219089Spjd 5424219089Spjd /* which disk is going to be split? */ 5425219089Spjd if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5426219089Spjd &glist[c]) != 0) { 5427249195Smm error = SET_ERROR(EINVAL); 5428219089Spjd break; 5429219089Spjd } 5430219089Spjd 5431219089Spjd /* look it up in the spa */ 5432219089Spjd vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5433219089Spjd if (vml[c] == NULL) { 5434249195Smm error = SET_ERROR(ENODEV); 5435219089Spjd break; 5436219089Spjd } 5437219089Spjd 5438219089Spjd /* make sure there's nothing stopping the split */ 5439219089Spjd if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5440219089Spjd vml[c]->vdev_islog || 5441219089Spjd vml[c]->vdev_ishole || 5442219089Spjd vml[c]->vdev_isspare || 5443219089Spjd vml[c]->vdev_isl2cache || 5444219089Spjd !vdev_writeable(vml[c]) || 5445219089Spjd vml[c]->vdev_children != 0 || 5446219089Spjd vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5447219089Spjd c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5448249195Smm error = SET_ERROR(EINVAL); 5449219089Spjd break; 5450219089Spjd } 5451219089Spjd 5452219089Spjd if (vdev_dtl_required(vml[c])) { 5453249195Smm error = SET_ERROR(EBUSY); 5454219089Spjd break; 5455219089Spjd } 5456219089Spjd 5457219089Spjd /* we need certain info from the top level */ 5458219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5459219089Spjd vml[c]->vdev_top->vdev_ms_array) == 0); 5460219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5461219089Spjd vml[c]->vdev_top->vdev_ms_shift) == 0); 5462219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5463219089Spjd vml[c]->vdev_top->vdev_asize) == 0); 5464219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5465219089Spjd vml[c]->vdev_top->vdev_ashift) == 0); 5466299441Smav 5467299441Smav /* transfer per-vdev ZAPs */ 5468299441Smav ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 5469299441Smav VERIFY0(nvlist_add_uint64(child[c], 5470299441Smav ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 5471299441Smav 5472299441Smav ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 5473299441Smav VERIFY0(nvlist_add_uint64(child[c], 5474299441Smav ZPOOL_CONFIG_VDEV_TOP_ZAP, 5475299441Smav vml[c]->vdev_parent->vdev_top_zap)); 5476219089Spjd } 5477219089Spjd 5478219089Spjd if (error != 0) { 5479219089Spjd 
kmem_free(vml, children * sizeof (vdev_t *)); 5480219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5481219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5482219089Spjd } 5483219089Spjd 5484219089Spjd /* stop writers from using the disks */ 5485219089Spjd for (c = 0; c < children; c++) { 5486219089Spjd if (vml[c] != NULL) 5487219089Spjd vml[c]->vdev_offline = B_TRUE; 5488219089Spjd } 5489219089Spjd vdev_reopen(spa->spa_root_vdev); 5490219089Spjd 5491219089Spjd /* 5492219089Spjd * Temporarily record the splitting vdevs in the spa config. This 5493219089Spjd * will disappear once the config is regenerated. 5494219089Spjd */ 5495219089Spjd VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5496219089Spjd VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5497219089Spjd glist, children) == 0); 5498219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5499219089Spjd 5500219089Spjd mutex_enter(&spa->spa_props_lock); 5501219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5502219089Spjd nvl) == 0); 5503219089Spjd mutex_exit(&spa->spa_props_lock); 5504219089Spjd spa->spa_config_splitting = nvl; 5505219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5506219089Spjd 5507219089Spjd /* configure and create the new pool */ 5508219089Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5509219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5510219089Spjd exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5511219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5512219089Spjd spa_version(spa)) == 0); 5513219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5514219089Spjd spa->spa_config_txg) == 0); 5515219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5516219089Spjd spa_generate_guid(NULL)) == 0); 5517299441Smav VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 5518219089Spjd (void) nvlist_lookup_string(props, 5519219089Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5520219089Spjd 5521219089Spjd /* add the new pool to the namespace */ 5522219089Spjd newspa = spa_add(newname, config, altroot); 5523299441Smav newspa->spa_avz_action = AVZ_ACTION_REBUILD; 5524219089Spjd newspa->spa_config_txg = spa->spa_config_txg; 5525219089Spjd spa_set_log_state(newspa, SPA_LOG_CLEAR); 5526219089Spjd 5527219089Spjd /* release the spa config lock, retaining the namespace lock */ 5528219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5529219089Spjd 5530219089Spjd if (zio_injection_enabled) 5531219089Spjd zio_handle_panic_injection(spa, FTAG, 1); 5532219089Spjd 5533219089Spjd spa_activate(newspa, spa_mode_global); 5534219089Spjd spa_async_suspend(newspa); 5535219089Spjd 5536277300Ssmh#ifndef illumos 5537219089Spjd /* mark that we are creating new spa by splitting */ 5538219089Spjd newspa->spa_splitting_newspa = B_TRUE; 5539219089Spjd#endif 5540219089Spjd /* create the new pool from the disks of the original pool */ 5541219089Spjd error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5542277300Ssmh#ifndef illumos 5543219089Spjd newspa->spa_splitting_newspa = B_FALSE; 5544219089Spjd#endif 5545219089Spjd if (error) 5546219089Spjd goto out; 5547219089Spjd 5548219089Spjd /* if that worked, generate a real config for the new pool */ 5549219089Spjd if (newspa->spa_root_vdev != NULL) { 5550219089Spjd VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5551219089Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 5552219089Spjd 
VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5553219089Spjd ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5554219089Spjd spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5555219089Spjd B_TRUE)); 5556219089Spjd } 5557219089Spjd 5558219089Spjd /* set the props */ 5559219089Spjd if (props != NULL) { 5560219089Spjd spa_configfile_set(newspa, props, B_FALSE); 5561219089Spjd error = spa_prop_set(newspa, props); 5562219089Spjd if (error) 5563219089Spjd goto out; 5564219089Spjd } 5565219089Spjd 5566219089Spjd /* flush everything */ 5567219089Spjd txg = spa_vdev_config_enter(newspa); 5568219089Spjd vdev_config_dirty(newspa->spa_root_vdev); 5569219089Spjd (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5570219089Spjd 5571219089Spjd if (zio_injection_enabled) 5572219089Spjd zio_handle_panic_injection(spa, FTAG, 2); 5573219089Spjd 5574219089Spjd spa_async_resume(newspa); 5575219089Spjd 5576219089Spjd /* finally, update the original pool's config */ 5577219089Spjd txg = spa_vdev_config_enter(spa); 5578219089Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5579219089Spjd error = dmu_tx_assign(tx, TXG_WAIT); 5580219089Spjd if (error != 0) 5581219089Spjd dmu_tx_abort(tx); 5582219089Spjd for (c = 0; c < children; c++) { 5583219089Spjd if (vml[c] != NULL) { 5584219089Spjd vdev_split(vml[c]); 5585219089Spjd if (error == 0) 5586248571Smm spa_history_log_internal(spa, "detach", tx, 5587248571Smm "vdev=%s", vml[c]->vdev_path); 5588299441Smav 5589219089Spjd vdev_free(vml[c]); 5590219089Spjd } 5591219089Spjd } 5592299441Smav spa->spa_avz_action = AVZ_ACTION_REBUILD; 5593219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5594219089Spjd spa->spa_config_splitting = NULL; 5595219089Spjd nvlist_free(nvl); 5596219089Spjd if (error == 0) 5597219089Spjd dmu_tx_commit(tx); 5598219089Spjd (void) spa_vdev_exit(spa, NULL, txg, 0); 5599219089Spjd 5600219089Spjd if (zio_injection_enabled) 5601219089Spjd zio_handle_panic_injection(spa, FTAG, 3); 5602219089Spjd 5603219089Spjd /* split is complete; log a history record */ 5604248571Smm spa_history_log_internal(newspa, "split", NULL, 5605248571Smm "from pool %s", spa_name(spa)); 5606219089Spjd 5607219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5608219089Spjd 5609219089Spjd /* if we're not going to mount the filesystems in userland, export */ 5610219089Spjd if (exp) 5611219089Spjd error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5612219089Spjd B_FALSE, B_FALSE); 5613219089Spjd 5614219089Spjd return (error); 5615219089Spjd 5616219089Spjdout: 5617219089Spjd spa_unload(newspa); 5618219089Spjd spa_deactivate(newspa); 5619219089Spjd spa_remove(newspa); 5620219089Spjd 5621219089Spjd txg = spa_vdev_config_enter(spa); 5622219089Spjd 5623219089Spjd /* re-online all offlined disks */ 5624219089Spjd for (c = 0; c < children; c++) { 5625219089Spjd if (vml[c] != NULL) 5626219089Spjd vml[c]->vdev_offline = B_FALSE; 5627219089Spjd } 5628219089Spjd vdev_reopen(spa->spa_root_vdev); 5629219089Spjd 5630219089Spjd nvlist_free(spa->spa_config_splitting); 5631219089Spjd spa->spa_config_splitting = NULL; 5632219089Spjd (void) spa_vdev_exit(spa, NULL, txg, error); 5633219089Spjd 5634219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5635219089Spjd return (error); 5636219089Spjd} 5637219089Spjd 5638185029Spjdstatic nvlist_t * 5639185029Spjdspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5640185029Spjd{ 5641185029Spjd for (int i = 0; i < count; i++) { 5642185029Spjd uint64_t guid; 5643185029Spjd 
5644185029Spjd VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5645185029Spjd &guid) == 0); 5646185029Spjd 5647185029Spjd if (guid == target_guid) 5648185029Spjd return (nvpp[i]); 5649185029Spjd } 5650185029Spjd 5651185029Spjd return (NULL); 5652185029Spjd} 5653185029Spjd 5654185029Spjdstatic void 5655185029Spjdspa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5656307277Smav nvlist_t *dev_to_remove) 5657185029Spjd{ 5658185029Spjd nvlist_t **newdev = NULL; 5659185029Spjd 5660185029Spjd if (count > 1) 5661185029Spjd newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5662185029Spjd 5663185029Spjd for (int i = 0, j = 0; i < count; i++) { 5664185029Spjd if (dev[i] == dev_to_remove) 5665185029Spjd continue; 5666185029Spjd VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5667185029Spjd } 5668185029Spjd 5669185029Spjd VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5670185029Spjd VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5671185029Spjd 5672185029Spjd for (int i = 0; i < count - 1; i++) 5673185029Spjd nvlist_free(newdev[i]); 5674185029Spjd 5675185029Spjd if (count > 1) 5676185029Spjd kmem_free(newdev, (count - 1) * sizeof (void *)); 5677185029Spjd} 5678185029Spjd 5679168404Spjd/* 5680219089Spjd * Evacuate the device. 5681219089Spjd */ 5682219089Spjdstatic int 5683219089Spjdspa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5684219089Spjd{ 5685219089Spjd uint64_t txg; 5686219089Spjd int error = 0; 5687219089Spjd 5688219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5689219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5690219089Spjd ASSERT(vd == vd->vdev_top); 5691219089Spjd 5692219089Spjd /* 5693219089Spjd * Evacuate the device. We don't hold the config lock as writer 5694219089Spjd * since we need to do I/O but we do keep the 5695219089Spjd * spa_namespace_lock held. Once this completes the device 5696219089Spjd * should no longer have any blocks allocated on it. 5697219089Spjd */ 5698219089Spjd if (vd->vdev_islog) { 5699219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5700219089Spjd error = spa_offline_log(spa); 5701219089Spjd } else { 5702249195Smm error = SET_ERROR(ENOTSUP); 5703219089Spjd } 5704219089Spjd 5705219089Spjd if (error) 5706219089Spjd return (error); 5707219089Spjd 5708219089Spjd /* 5709219089Spjd * The evacuation succeeded. Remove any remaining MOS metadata 5710219089Spjd * associated with this vdev, and wait for these changes to sync. 5711219089Spjd */ 5712240415Smm ASSERT0(vd->vdev_stat.vs_alloc); 5713219089Spjd txg = spa_vdev_config_enter(spa); 5714219089Spjd vd->vdev_removing = B_TRUE; 5715258717Savg vdev_dirty_leaves(vd, VDD_DTL, txg); 5716219089Spjd vdev_config_dirty(vd); 5717219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5718219089Spjd 5719219089Spjd return (0); 5720219089Spjd} 5721219089Spjd 5722219089Spjd/* 5723219089Spjd * Complete the removal by cleaning up the namespace. 5724219089Spjd */ 5725219089Spjdstatic void 5726219089Spjdspa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5727219089Spjd{ 5728219089Spjd vdev_t *rvd = spa->spa_root_vdev; 5729219089Spjd uint64_t id = vd->vdev_id; 5730219089Spjd boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5731219089Spjd 5732219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5733219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5734219089Spjd ASSERT(vd == vd->vdev_top); 5735219089Spjd 5736219089Spjd /* 5737219089Spjd * Only remove any devices which are empty. 
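	 *
	 * (Illustrative note, not part of the original source: unless the
	 * vdev being removed is the last top-level child, a hole vdev is
	 * allocated into its slot further below so the remaining children
	 * keep their vdev ids; only a trailing slot is actually compacted
	 * away.)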
5738219089Spjd */ 5739219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5740219089Spjd return; 5741219089Spjd 5742219089Spjd (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5743219089Spjd 5744219089Spjd if (list_link_active(&vd->vdev_state_dirty_node)) 5745219089Spjd vdev_state_clean(vd); 5746219089Spjd if (list_link_active(&vd->vdev_config_dirty_node)) 5747219089Spjd vdev_config_clean(vd); 5748219089Spjd 5749219089Spjd vdev_free(vd); 5750219089Spjd 5751219089Spjd if (last_vdev) { 5752219089Spjd vdev_compact_children(rvd); 5753219089Spjd } else { 5754219089Spjd vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5755219089Spjd vdev_add_child(rvd, vd); 5756219089Spjd } 5757219089Spjd vdev_config_dirty(rvd); 5758219089Spjd 5759219089Spjd /* 5760219089Spjd * Reassess the health of our root vdev. 5761219089Spjd */ 5762219089Spjd vdev_reopen(rvd); 5763219089Spjd} 5764219089Spjd 5765219089Spjd/* 5766219089Spjd * Remove a device from the pool - 5767219089Spjd * 5768219089Spjd * Removing a device from the vdev namespace requires several steps 5769219089Spjd * and can take a significant amount of time. As a result we use 5770219089Spjd * the spa_vdev_config_[enter/exit] functions which allow us to 5771219089Spjd * grab and release the spa_config_lock while still holding the namespace 5772219089Spjd * lock. During each step the configuration is synced out. 5773251631Sdelphij * 5774251631Sdelphij * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5775251631Sdelphij * devices. 5776219089Spjd */ 5777168404Spjdint 5778168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5779168404Spjd{ 5780168404Spjd vdev_t *vd; 5781307113Smav sysevent_t *ev = NULL; 5782219089Spjd metaslab_group_t *mg; 5783185029Spjd nvlist_t **spares, **l2cache, *nv; 5784219089Spjd uint64_t txg = 0; 5785185029Spjd uint_t nspares, nl2cache; 5786185029Spjd int error = 0; 5787209962Smm boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5788168404Spjd 5789219089Spjd ASSERT(spa_writeable(spa)); 5790219089Spjd 5791209962Smm if (!locked) 5792209962Smm txg = spa_vdev_enter(spa); 5793168404Spjd 5794185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5795168404Spjd 5796185029Spjd if (spa->spa_spares.sav_vdevs != NULL && 5797185029Spjd nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5798185029Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5799185029Spjd (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5800185029Spjd /* 5801185029Spjd * Only remove the hot spare if it's not currently in use 5802185029Spjd * in this pool. 5803185029Spjd */ 5804185029Spjd if (vd == NULL || unspare) { 5805307113Smav if (vd == NULL) 5806307113Smav vd = spa_lookup_by_guid(spa, guid, B_TRUE); 5807331397Smav ev = spa_event_create(spa, vd, NULL, 5808331397Smav ESC_ZFS_VDEV_REMOVE_AUX); 5809185029Spjd spa_vdev_remove_aux(spa->spa_spares.sav_config, 5810185029Spjd ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5811185029Spjd spa_load_spares(spa); 5812185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5813185029Spjd } else { 5814249195Smm error = SET_ERROR(EBUSY); 5815168404Spjd } 5816185029Spjd } else if (spa->spa_l2cache.sav_vdevs != NULL && 5817185029Spjd nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5818185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5819185029Spjd (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5820185029Spjd /* 5821185029Spjd * Cache devices can always be removed. 
5822185029Spjd */ 5823307113Smav vd = spa_lookup_by_guid(spa, guid, B_TRUE); 5824331397Smav ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); 5825185029Spjd spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5826185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5827185029Spjd spa_load_l2cache(spa); 5828185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5829219089Spjd } else if (vd != NULL && vd->vdev_islog) { 5830219089Spjd ASSERT(!locked); 5831219089Spjd ASSERT(vd == vd->vdev_top); 5832219089Spjd 5833219089Spjd mg = vd->vdev_mg; 5834219089Spjd 5835219089Spjd /* 5836219089Spjd * Stop allocating from this vdev. 5837219089Spjd */ 5838219089Spjd metaslab_group_passivate(mg); 5839219089Spjd 5840219089Spjd /* 5841219089Spjd * Wait for the youngest allocations and frees to sync, 5842219089Spjd * and then wait for the deferral of those frees to finish. 5843219089Spjd */ 5844219089Spjd spa_vdev_config_exit(spa, NULL, 5845219089Spjd txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5846219089Spjd 5847219089Spjd /* 5848219089Spjd * Attempt to evacuate the vdev. 5849219089Spjd */ 5850219089Spjd error = spa_vdev_remove_evacuate(spa, vd); 5851219089Spjd 5852219089Spjd txg = spa_vdev_config_enter(spa); 5853219089Spjd 5854219089Spjd /* 5855219089Spjd * If we couldn't evacuate the vdev, unwind. 5856219089Spjd */ 5857219089Spjd if (error) { 5858219089Spjd metaslab_group_activate(mg); 5859219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5860219089Spjd } 5861219089Spjd 5862219089Spjd /* 5863219089Spjd * Clean up the vdev namespace. 5864219089Spjd */ 5865331397Smav ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); 5866219089Spjd spa_vdev_remove_from_namespace(spa, vd); 5867219089Spjd 5868185029Spjd } else if (vd != NULL) { 5869185029Spjd /* 5870185029Spjd * Normal vdevs cannot be removed (yet). 5871185029Spjd */ 5872249195Smm error = SET_ERROR(ENOTSUP); 5873168404Spjd } else { 5874185029Spjd /* 5875185029Spjd * There is no vdev of any kind with the specified guid. 5876185029Spjd */ 5877249195Smm error = SET_ERROR(ENOENT); 5878168404Spjd } 5879168404Spjd 5880209962Smm if (!locked) 5881307047Smav error = spa_vdev_exit(spa, NULL, txg, error); 5882209962Smm 5883307113Smav if (ev) 5884307113Smav spa_event_post(ev); 5885307113Smav 5886209962Smm return (error); 5887168404Spjd} 5888168404Spjd 5889168404Spjd/* 5890185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 5891251631Sdelphij * currently spared, so we can detach it. 5892168404Spjd */ 5893168404Spjdstatic vdev_t * 5894185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 5895168404Spjd{ 5896168404Spjd vdev_t *newvd, *oldvd; 5897168404Spjd 5898219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5899185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5900168404Spjd if (oldvd != NULL) 5901168404Spjd return (oldvd); 5902168404Spjd } 5903168404Spjd 5904185029Spjd /* 5905219089Spjd * Check for a completed replacement. We always consider the first 5906219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 5907219089Spjd * the newest (see spa_vdev_attach() for how that works). In 5908219089Spjd * the case where the newest vdev is faulted, we will not automatically 5909219089Spjd * remove it after a resilver completes. This is OK as it will require 5910219089Spjd * user intervention to determine which disk the admin wishes to keep. 
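 * A replacement is treated as complete only when the new vdev has no
 * DTL_MISSING or DTL_OUTAGE entries and the old vdev is no longer
 * required (see the checks below).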
5911185029Spjd */ 5912219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 5913219089Spjd ASSERT(vd->vdev_children > 1); 5914219089Spjd 5915219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 5916168404Spjd oldvd = vd->vdev_child[0]; 5917168404Spjd 5918209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 5919219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5920209962Smm !vdev_dtl_required(oldvd)) 5921168404Spjd return (oldvd); 5922168404Spjd } 5923168404Spjd 5924185029Spjd /* 5925185029Spjd * Check for a completed resilver with the 'unspare' flag set. 5926185029Spjd */ 5927219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 5928219089Spjd vdev_t *first = vd->vdev_child[0]; 5929219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5930185029Spjd 5931219089Spjd if (last->vdev_unspare) { 5932219089Spjd oldvd = first; 5933219089Spjd newvd = last; 5934219089Spjd } else if (first->vdev_unspare) { 5935219089Spjd oldvd = last; 5936219089Spjd newvd = first; 5937219089Spjd } else { 5938219089Spjd oldvd = NULL; 5939219089Spjd } 5940219089Spjd 5941219089Spjd if (oldvd != NULL && 5942209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 5943219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5944219089Spjd !vdev_dtl_required(oldvd)) 5945185029Spjd return (oldvd); 5946219089Spjd 5947219089Spjd /* 5948219089Spjd * If there are more than two spares attached to a disk, 5949219089Spjd * and those spares are not required, then we want to 5950219089Spjd * attempt to free them up now so that they can be used 5951219089Spjd * by other pools. Once we're back down to a single 5952219089Spjd * disk+spare, we stop removing them. 5953219089Spjd */ 5954219089Spjd if (vd->vdev_children > 2) { 5955219089Spjd newvd = vd->vdev_child[1]; 5956219089Spjd 5957219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 5958219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 5959219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 5960219089Spjd !vdev_dtl_required(newvd)) 5961219089Spjd return (newvd); 5962185029Spjd } 5963185029Spjd } 5964185029Spjd 5965168404Spjd return (NULL); 5966168404Spjd} 5967168404Spjd 5968168404Spjdstatic void 5969185029Spjdspa_vdev_resilver_done(spa_t *spa) 5970168404Spjd{ 5971209962Smm vdev_t *vd, *pvd, *ppvd; 5972209962Smm uint64_t guid, sguid, pguid, ppguid; 5973168404Spjd 5974209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5975168404Spjd 5976185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5977209962Smm pvd = vd->vdev_parent; 5978209962Smm ppvd = pvd->vdev_parent; 5979168404Spjd guid = vd->vdev_guid; 5980209962Smm pguid = pvd->vdev_guid; 5981209962Smm ppguid = ppvd->vdev_guid; 5982209962Smm sguid = 0; 5983168404Spjd /* 5984168404Spjd * If we have just finished replacing a hot spared device, then 5985168404Spjd * we need to detach the parent's first child (the original hot 5986168404Spjd * spare) as well. 
5987168404Spjd */ 5988219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5989219089Spjd ppvd->vdev_children == 2) { 5990168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5991209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 5992168404Spjd } 5993254112Sdelphij ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5994254112Sdelphij 5995209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5996209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5997168404Spjd return; 5998209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5999168404Spjd return; 6000209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6001168404Spjd } 6002168404Spjd 6003209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 6004168404Spjd} 6005168404Spjd 6006168404Spjd/* 6007219089Spjd * Update the stored path or FRU for this vdev. 6008168404Spjd */ 6009168404Spjdint 6010209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 6011209962Smm boolean_t ispath) 6012168404Spjd{ 6013185029Spjd vdev_t *vd; 6014219089Spjd boolean_t sync = B_FALSE; 6015168404Spjd 6016219089Spjd ASSERT(spa_writeable(spa)); 6017168404Spjd 6018219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 6019219089Spjd 6020209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 6021219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 6022168404Spjd 6023168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 6024219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 6025168404Spjd 6026209962Smm if (ispath) { 6027219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 6028219089Spjd spa_strfree(vd->vdev_path); 6029219089Spjd vd->vdev_path = spa_strdup(value); 6030219089Spjd sync = B_TRUE; 6031219089Spjd } 6032209962Smm } else { 6033219089Spjd if (vd->vdev_fru == NULL) { 6034219089Spjd vd->vdev_fru = spa_strdup(value); 6035219089Spjd sync = B_TRUE; 6036219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 6037209962Smm spa_strfree(vd->vdev_fru); 6038219089Spjd vd->vdev_fru = spa_strdup(value); 6039219089Spjd sync = B_TRUE; 6040219089Spjd } 6041209962Smm } 6042168404Spjd 6043219089Spjd return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 6044168404Spjd} 6045168404Spjd 6046209962Smmint 6047209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 6048209962Smm{ 6049209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 6050209962Smm} 6051209962Smm 6052209962Smmint 6053209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 6054209962Smm{ 6055209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 6056209962Smm} 6057209962Smm 6058168404Spjd/* 6059168404Spjd * ========================================================================== 6060219089Spjd * SPA Scanning 6061168404Spjd * ========================================================================== 6062168404Spjd */ 6063324010Savgint 6064324010Savgspa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 6065324010Savg{ 6066324010Savg ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6067168404Spjd 6068324010Savg if (dsl_scan_resilvering(spa->spa_dsl_pool)) 6069324010Savg return (SET_ERROR(EBUSY)); 6070324010Savg 6071324010Savg return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 6072324010Savg} 6073324010Savg 6074168404Spjdint 6075219089Spjdspa_scan_stop(spa_t *spa) 6076168404Spjd{ 6077185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6078219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 6079249195Smm return (SET_ERROR(EBUSY)); 6080219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 6081219089Spjd} 6082168404Spjd 6083219089Spjdint 6084219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 6085219089Spjd{ 6086219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6087219089Spjd 6088219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 6089249195Smm return (SET_ERROR(ENOTSUP)); 6090168404Spjd 6091168404Spjd /* 6092185029Spjd * If a resilver was requested, but there is no DTL on a 6093185029Spjd * writeable leaf device, we have nothing to do. 6094168404Spjd */ 6095219089Spjd if (func == POOL_SCAN_RESILVER && 6096185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 6097185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 6098168404Spjd return (0); 6099168404Spjd } 6100168404Spjd 6101219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 6102168404Spjd} 6103168404Spjd 6104168404Spjd/* 6105168404Spjd * ========================================================================== 6106168404Spjd * SPA async task processing 6107168404Spjd * ========================================================================== 6108168404Spjd */ 6109168404Spjd 6110168404Spjdstatic void 6111185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 6112168404Spjd{ 6113185029Spjd if (vd->vdev_remove_wanted) { 6114219089Spjd vd->vdev_remove_wanted = B_FALSE; 6115219089Spjd vd->vdev_delayed_close = B_FALSE; 6116185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 6117209962Smm 6118209962Smm /* 6119209962Smm * We want to clear the stats, but we don't want to do a full 6120209962Smm * vdev_clear() as that will cause us to throw away 6121209962Smm * degraded/faulted state as well as attempt to reopen the 6122209962Smm * device, all of which is a waste. 6123209962Smm */ 6124209962Smm vd->vdev_stat.vs_read_errors = 0; 6125209962Smm vd->vdev_stat.vs_write_errors = 0; 6126209962Smm vd->vdev_stat.vs_checksum_errors = 0; 6127209962Smm 6128185029Spjd vdev_state_dirty(vd->vdev_top); 6129294027Sasomers /* Tell userspace that the vdev is gone. 
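 * (zfs_post_remove() posts the corresponding removal event for this vdev.)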
*/ 6130294027Sasomers zfs_post_remove(spa, vd); 6131185029Spjd } 6132168404Spjd 6133185029Spjd for (int c = 0; c < vd->vdev_children; c++) 6134185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 6135185029Spjd} 6136168404Spjd 6137185029Spjdstatic void 6138185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 6139185029Spjd{ 6140185029Spjd if (vd->vdev_probe_wanted) { 6141219089Spjd vd->vdev_probe_wanted = B_FALSE; 6142185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 6143168404Spjd } 6144168404Spjd 6145185029Spjd for (int c = 0; c < vd->vdev_children; c++) 6146185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 6147168404Spjd} 6148168404Spjd 6149168404Spjdstatic void 6150219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 6151219089Spjd{ 6152219089Spjd sysevent_id_t eid; 6153219089Spjd nvlist_t *attr; 6154219089Spjd char *physpath; 6155219089Spjd 6156219089Spjd if (!spa->spa_autoexpand) 6157219089Spjd return; 6158219089Spjd 6159219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 6160219089Spjd vdev_t *cvd = vd->vdev_child[c]; 6161219089Spjd spa_async_autoexpand(spa, cvd); 6162219089Spjd } 6163219089Spjd 6164219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 6165219089Spjd return; 6166219089Spjd 6167219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 6168219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 6169219089Spjd 6170219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6171219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 6172219089Spjd 6173219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 6174219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 6175219089Spjd 6176219089Spjd nvlist_free(attr); 6177219089Spjd kmem_free(physpath, MAXPATHLEN); 6178219089Spjd} 6179219089Spjd 6180219089Spjdstatic void 6181168404Spjdspa_async_thread(void *arg) 6182168404Spjd{ 6183331399Smav spa_t *spa = (spa_t *)arg; 6184168404Spjd int tasks; 6185168404Spjd 6186168404Spjd ASSERT(spa->spa_sync_on); 6187168404Spjd 6188168404Spjd mutex_enter(&spa->spa_async_lock); 6189168404Spjd tasks = spa->spa_async_tasks; 6190253990Smav spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 6191168404Spjd mutex_exit(&spa->spa_async_lock); 6192168404Spjd 6193168404Spjd /* 6194168404Spjd * See if the config needs to be updated. 6195168404Spjd */ 6196168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 6197219089Spjd uint64_t old_space, new_space; 6198219089Spjd 6199168404Spjd mutex_enter(&spa_namespace_lock); 6200219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 6201168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6202219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 6203168404Spjd mutex_exit(&spa_namespace_lock); 6204219089Spjd 6205219089Spjd /* 6206219089Spjd * If the pool grew as a result of the config update, 6207219089Spjd * then log an internal history event. 
6208219089Spjd */ 6209219089Spjd if (new_space != old_space) { 6210248571Smm spa_history_log_internal(spa, "vdev online", NULL, 6211219089Spjd "pool '%s' size: %llu(+%llu)", 6212219089Spjd spa_name(spa), new_space, new_space - old_space); 6213219089Spjd } 6214168404Spjd } 6215168404Spjd 6216219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 6217219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6218219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 6219219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 6220219089Spjd } 6221219089Spjd 6222168404Spjd /* 6223185029Spjd * See if any devices need to be probed. 6224168404Spjd */ 6225185029Spjd if (tasks & SPA_ASYNC_PROBE) { 6226219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 6227185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 6228185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 6229185029Spjd } 6230168404Spjd 6231168404Spjd /* 6232185029Spjd * If any devices are done replacing, detach them. 6233168404Spjd */ 6234185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 6235185029Spjd spa_vdev_resilver_done(spa); 6236168404Spjd 6237168404Spjd /* 6238168404Spjd * Kick off a resilver. 6239168404Spjd */ 6240168404Spjd if (tasks & SPA_ASYNC_RESILVER) 6241219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 6242168404Spjd 6243168404Spjd /* 6244168404Spjd * Let the world know that we're done. 6245168404Spjd */ 6246168404Spjd mutex_enter(&spa->spa_async_lock); 6247168404Spjd spa->spa_async_thread = NULL; 6248168404Spjd cv_broadcast(&spa->spa_async_cv); 6249168404Spjd mutex_exit(&spa->spa_async_lock); 6250168404Spjd thread_exit(); 6251168404Spjd} 6252168404Spjd 6253253990Smavstatic void 6254253990Smavspa_async_thread_vd(void *arg) 6255253990Smav{ 6256253990Smav spa_t *spa = arg; 6257253990Smav int tasks; 6258253990Smav 6259253990Smav mutex_enter(&spa->spa_async_lock); 6260253990Smav tasks = spa->spa_async_tasks; 6261253990Smavretry: 6262253990Smav spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 6263253990Smav mutex_exit(&spa->spa_async_lock); 6264253990Smav 6265253990Smav /* 6266253990Smav * See if any devices need to be marked REMOVED. 6267253990Smav */ 6268253990Smav if (tasks & SPA_ASYNC_REMOVE) { 6269253990Smav spa_vdev_state_enter(spa, SCL_NONE); 6270253990Smav spa_async_remove(spa, spa->spa_root_vdev); 6271253990Smav for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 6272253990Smav spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 6273253990Smav for (int i = 0; i < spa->spa_spares.sav_count; i++) 6274253990Smav spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 6275253990Smav (void) spa_vdev_state_exit(spa, NULL, 0); 6276253990Smav } 6277253990Smav 6278253990Smav /* 6279253990Smav * Let the world know that we're done. 
6280253990Smav */ 6281253990Smav mutex_enter(&spa->spa_async_lock); 6282253990Smav tasks = spa->spa_async_tasks; 6283253990Smav if ((tasks & SPA_ASYNC_REMOVE) != 0) 6284253990Smav goto retry; 6285253990Smav spa->spa_async_thread_vd = NULL; 6286253990Smav cv_broadcast(&spa->spa_async_cv); 6287253990Smav mutex_exit(&spa->spa_async_lock); 6288253990Smav thread_exit(); 6289253990Smav} 6290253990Smav 6291168404Spjdvoid 6292168404Spjdspa_async_suspend(spa_t *spa) 6293168404Spjd{ 6294168404Spjd mutex_enter(&spa->spa_async_lock); 6295168404Spjd spa->spa_async_suspended++; 6296253990Smav while (spa->spa_async_thread != NULL && 6297253990Smav spa->spa_async_thread_vd != NULL) 6298168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 6299168404Spjd mutex_exit(&spa->spa_async_lock); 6300168404Spjd} 6301168404Spjd 6302168404Spjdvoid 6303168404Spjdspa_async_resume(spa_t *spa) 6304168404Spjd{ 6305168404Spjd mutex_enter(&spa->spa_async_lock); 6306168404Spjd ASSERT(spa->spa_async_suspended != 0); 6307168404Spjd spa->spa_async_suspended--; 6308168404Spjd mutex_exit(&spa->spa_async_lock); 6309168404Spjd} 6310168404Spjd 6311251636Sdelphijstatic boolean_t 6312251636Sdelphijspa_async_tasks_pending(spa_t *spa) 6313251636Sdelphij{ 6314251636Sdelphij uint_t non_config_tasks; 6315251636Sdelphij uint_t config_task; 6316251636Sdelphij boolean_t config_task_suspended; 6317251636Sdelphij 6318253990Smav non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 6319253990Smav SPA_ASYNC_REMOVE); 6320251636Sdelphij config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 6321251636Sdelphij if (spa->spa_ccw_fail_time == 0) { 6322251636Sdelphij config_task_suspended = B_FALSE; 6323251636Sdelphij } else { 6324251636Sdelphij config_task_suspended = 6325251636Sdelphij (gethrtime() - spa->spa_ccw_fail_time) < 6326251636Sdelphij (zfs_ccw_retry_interval * NANOSEC); 6327251636Sdelphij } 6328251636Sdelphij 6329251636Sdelphij return (non_config_tasks || (config_task && !config_task_suspended)); 6330251636Sdelphij} 6331251636Sdelphij 6332168404Spjdstatic void 6333168404Spjdspa_async_dispatch(spa_t *spa) 6334168404Spjd{ 6335168404Spjd mutex_enter(&spa->spa_async_lock); 6336251636Sdelphij if (spa_async_tasks_pending(spa) && 6337251636Sdelphij !spa->spa_async_suspended && 6338168404Spjd spa->spa_async_thread == NULL && 6339251636Sdelphij rootdir != NULL) 6340168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 6341168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 6342168404Spjd mutex_exit(&spa->spa_async_lock); 6343168404Spjd} 6344168404Spjd 6345253990Smavstatic void 6346253990Smavspa_async_dispatch_vd(spa_t *spa) 6347253990Smav{ 6348253990Smav mutex_enter(&spa->spa_async_lock); 6349253990Smav if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 6350253990Smav !spa->spa_async_suspended && 6351253990Smav spa->spa_async_thread_vd == NULL && 6352253990Smav rootdir != NULL) 6353253990Smav spa->spa_async_thread_vd = thread_create(NULL, 0, 6354253990Smav spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); 6355253990Smav mutex_exit(&spa->spa_async_lock); 6356253990Smav} 6357253990Smav 6358168404Spjdvoid 6359168404Spjdspa_async_request(spa_t *spa, int task) 6360168404Spjd{ 6361219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 6362168404Spjd mutex_enter(&spa->spa_async_lock); 6363168404Spjd spa->spa_async_tasks |= task; 6364168404Spjd mutex_exit(&spa->spa_async_lock); 6365253990Smav spa_async_dispatch_vd(spa); 6366168404Spjd} 6367168404Spjd 6368168404Spjd/* 6369168404Spjd 
* ========================================================================== 6370168404Spjd * SPA syncing routines 6371168404Spjd * ========================================================================== 6372168404Spjd */ 6373168404Spjd 6374219089Spjdstatic int 6375219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6376168404Spjd{ 6377219089Spjd bpobj_t *bpo = arg; 6378219089Spjd bpobj_enqueue(bpo, bp, tx); 6379219089Spjd return (0); 6380219089Spjd} 6381168404Spjd 6382219089Spjdstatic int 6383219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6384219089Spjd{ 6385219089Spjd zio_t *zio = arg; 6386168404Spjd 6387219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 6388240868Spjd BP_GET_PSIZE(bp), zio->io_flags)); 6389219089Spjd return (0); 6390168404Spjd} 6391168404Spjd 6392258632Savg/* 6393258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6394258632Savg * amount of time spent syncing frees. 6395258632Savg */ 6396168404Spjdstatic void 6397258632Savgspa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6398258632Savg{ 6399258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6400258632Savg bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6401258632Savg VERIFY(zio_wait(zio) == 0); 6402258632Savg} 6403258632Savg 6404258632Savg/* 6405258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6406258632Savg * amount of time spent syncing deferred frees. 6407258632Savg */ 6408258632Savgstatic void 6409258632Savgspa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6410258632Savg{ 6411258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6412258632Savg VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6413258632Savg spa_free_sync_cb, zio, tx), ==, 0); 6414258632Savg VERIFY0(zio_wait(zio)); 6415258632Savg} 6416258632Savg 6417258632Savg 6418258632Savgstatic void 6419168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6420168404Spjd{ 6421168404Spjd char *packed = NULL; 6422185029Spjd size_t bufsize; 6423168404Spjd size_t nvsize = 0; 6424168404Spjd dmu_buf_t *db; 6425168404Spjd 6426168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6427168404Spjd 6428185029Spjd /* 6429185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6430260150Sdelphij * information. This avoids the dmu_buf_will_dirty() path and 6431185029Spjd * saves us a pre-read to get data we don't actually care about. 
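 * The packed nvlist is zero-padded out to a multiple of SPA_CONFIG_BLOCKSIZE
 * and then written in a single dmu_write() call.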
6432185029Spjd */ 6433236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6434185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 6435168404Spjd 6436168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6437168404Spjd KM_SLEEP) == 0); 6438185029Spjd bzero(packed + nvsize, bufsize - nvsize); 6439168404Spjd 6440185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6441168404Spjd 6442185029Spjd kmem_free(packed, bufsize); 6443168404Spjd 6444168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6445168404Spjd dmu_buf_will_dirty(db, tx); 6446168404Spjd *(uint64_t *)db->db_data = nvsize; 6447168404Spjd dmu_buf_rele(db, FTAG); 6448168404Spjd} 6449168404Spjd 6450168404Spjdstatic void 6451185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6452185029Spjd const char *config, const char *entry) 6453168404Spjd{ 6454168404Spjd nvlist_t *nvroot; 6455185029Spjd nvlist_t **list; 6456168404Spjd int i; 6457168404Spjd 6458185029Spjd if (!sav->sav_sync) 6459168404Spjd return; 6460168404Spjd 6461168404Spjd /* 6462185029Spjd * Update the MOS nvlist describing the list of available devices. 6463185029Spjd * spa_validate_aux() will have already made sure this nvlist is 6464185029Spjd * valid and the vdevs are labeled appropriately. 6465168404Spjd */ 6466185029Spjd if (sav->sav_object == 0) { 6467185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6468185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6469185029Spjd sizeof (uint64_t), tx); 6470168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 6471185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6472185029Spjd &sav->sav_object, tx) == 0); 6473168404Spjd } 6474168404Spjd 6475168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6476185029Spjd if (sav->sav_count == 0) { 6477185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6478168404Spjd } else { 6479185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6480185029Spjd for (i = 0; i < sav->sav_count; i++) 6481185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6482219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 6483185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6484185029Spjd sav->sav_count) == 0); 6485185029Spjd for (i = 0; i < sav->sav_count; i++) 6486185029Spjd nvlist_free(list[i]); 6487185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 6488168404Spjd } 6489168404Spjd 6490185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6491168404Spjd nvlist_free(nvroot); 6492168404Spjd 6493185029Spjd sav->sav_sync = B_FALSE; 6494168404Spjd} 6495168404Spjd 6496299441Smav/* 6497299441Smav * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 6498299441Smav * The all-vdev ZAP must be empty. 
6499299441Smav */ 6500168404Spjdstatic void 6501299441Smavspa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 6502299441Smav{ 6503299441Smav spa_t *spa = vd->vdev_spa; 6504299441Smav if (vd->vdev_top_zap != 0) { 6505299441Smav VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6506299441Smav vd->vdev_top_zap, tx)); 6507299441Smav } 6508299441Smav if (vd->vdev_leaf_zap != 0) { 6509299441Smav VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6510299441Smav vd->vdev_leaf_zap, tx)); 6511299441Smav } 6512299441Smav for (uint64_t i = 0; i < vd->vdev_children; i++) { 6513299441Smav spa_avz_build(vd->vdev_child[i], avz, tx); 6514299441Smav } 6515299441Smav} 6516299441Smav 6517299441Smavstatic void 6518168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6519168404Spjd{ 6520168404Spjd nvlist_t *config; 6521168404Spjd 6522299441Smav /* 6523299441Smav * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 6524299441Smav * its config may not be dirty but we still need to build per-vdev ZAPs. 6525299441Smav * Similarly, if the pool is being assembled (e.g. after a split), we 6526299441Smav * need to rebuild the AVZ although the config may not be dirty. 6527299441Smav */ 6528299441Smav if (list_is_empty(&spa->spa_config_dirty_list) && 6529299441Smav spa->spa_avz_action == AVZ_ACTION_NONE) 6530168404Spjd return; 6531168404Spjd 6532185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6533168404Spjd 6534299441Smav ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 6535321540Smav spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 6536299441Smav spa->spa_all_vdev_zaps != 0); 6537299441Smav 6538299441Smav if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 6539299441Smav /* Make and build the new AVZ */ 6540299441Smav uint64_t new_avz = zap_create(spa->spa_meta_objset, 6541299441Smav DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 6542299441Smav spa_avz_build(spa->spa_root_vdev, new_avz, tx); 6543299441Smav 6544299441Smav /* Diff old AVZ with new one */ 6545299441Smav zap_cursor_t zc; 6546299441Smav zap_attribute_t za; 6547299441Smav 6548299441Smav for (zap_cursor_init(&zc, spa->spa_meta_objset, 6549299441Smav spa->spa_all_vdev_zaps); 6550299441Smav zap_cursor_retrieve(&zc, &za) == 0; 6551299441Smav zap_cursor_advance(&zc)) { 6552299441Smav uint64_t vdzap = za.za_first_integer; 6553299441Smav if (zap_lookup_int(spa->spa_meta_objset, new_avz, 6554299441Smav vdzap) == ENOENT) { 6555299441Smav /* 6556299441Smav * ZAP is listed in old AVZ but not in new one; 6557299441Smav * destroy it 6558299441Smav */ 6559299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 6560299441Smav tx)); 6561299441Smav } 6562299441Smav } 6563299441Smav 6564299441Smav zap_cursor_fini(&zc); 6565299441Smav 6566299441Smav /* Destroy the old AVZ */ 6567299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, 6568299441Smav spa->spa_all_vdev_zaps, tx)); 6569299441Smav 6570299441Smav /* Replace the old AVZ in the dir obj with the new one */ 6571299441Smav VERIFY0(zap_update(spa->spa_meta_objset, 6572299441Smav DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 6573299441Smav sizeof (new_avz), 1, &new_avz, tx)); 6574299441Smav 6575299441Smav spa->spa_all_vdev_zaps = new_avz; 6576299441Smav } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 6577299441Smav zap_cursor_t zc; 6578299441Smav zap_attribute_t za; 6579299441Smav 6580299441Smav /* Walk through the AVZ and destroy all listed ZAPs */ 6581299441Smav for (zap_cursor_init(&zc, spa->spa_meta_objset, 6582299441Smav spa->spa_all_vdev_zaps); 6583299441Smav 
zap_cursor_retrieve(&zc, &za) == 0; 6584299441Smav zap_cursor_advance(&zc)) { 6585299441Smav uint64_t zap = za.za_first_integer; 6586299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 6587299441Smav } 6588299441Smav 6589299441Smav zap_cursor_fini(&zc); 6590299441Smav 6591299441Smav /* Destroy and unlink the AVZ itself */ 6592299441Smav VERIFY0(zap_destroy(spa->spa_meta_objset, 6593299441Smav spa->spa_all_vdev_zaps, tx)); 6594299441Smav VERIFY0(zap_remove(spa->spa_meta_objset, 6595299441Smav DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 6596299441Smav spa->spa_all_vdev_zaps = 0; 6597299441Smav } 6598299441Smav 6599299441Smav if (spa->spa_all_vdev_zaps == 0) { 6600299441Smav spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 6601299441Smav DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 6602299441Smav DMU_POOL_VDEV_ZAP_MAP, tx); 6603299441Smav } 6604299441Smav spa->spa_avz_action = AVZ_ACTION_NONE; 6605299441Smav 6606299441Smav /* Create ZAPs for vdevs that don't have them. */ 6607299441Smav vdev_construct_zaps(spa->spa_root_vdev, tx); 6608299441Smav 6609185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 6610185029Spjd dmu_tx_get_txg(tx), B_FALSE); 6611185029Spjd 6612243505Smm /* 6613243505Smm * If we're upgrading the spa version then make sure that 6614243505Smm * the config object gets updated with the correct version. 6615243505Smm */ 6616243505Smm if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6617243505Smm fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6618243505Smm spa->spa_uberblock.ub_version); 6619243505Smm 6620185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6621185029Spjd 6622296528Smav nvlist_free(spa->spa_config_syncing); 6623168404Spjd spa->spa_config_syncing = config; 6624168404Spjd 6625168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6626168404Spjd} 6627168404Spjd 6628236884Smmstatic void 6629248571Smmspa_sync_version(void *arg, dmu_tx_t *tx) 6630236884Smm{ 6631248571Smm uint64_t *versionp = arg; 6632248571Smm uint64_t version = *versionp; 6633248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6634236884Smm 6635236884Smm /* 6636236884Smm * Setting the version is special cased when first creating the pool. 6637236884Smm */ 6638236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 6639236884Smm 6640247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6641236884Smm ASSERT(version >= spa_version(spa)); 6642236884Smm 6643236884Smm spa->spa_uberblock.ub_version = version; 6644236884Smm vdev_config_dirty(spa->spa_root_vdev); 6645248571Smm spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6646236884Smm} 6647236884Smm 6648185029Spjd/* 6649185029Spjd * Set zpool properties. 
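 * This sync task writes the new values into the MOS and updates the
 * in-core spa_t fields.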
 */
static void
spa_sync_props(void *arg, dmu_tx_t *tx)
{
	nvlist_t *nvp = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	nvpair_t *elem = NULL;

	mutex_enter(&spa->spa_props_lock);

	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		uint64_t intval;
		char *strval, *fname;
		zpool_prop_t prop;
		const char *propname;
		zprop_type_t proptype;
		spa_feature_t fid;

		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPOOL_PROP_INVAL:
			/*
			 * We checked this earlier in spa_prop_validate().
			 */
			ASSERT(zpool_prop_feature(nvpair_name(elem)));

			fname = strchr(nvpair_name(elem), '@') + 1;
			VERIFY0(zfeature_lookup_name(fname, &fid));

			spa_feature_enable(spa, fid, tx);
			spa_history_log_internal(spa, "set", tx,
			    "%s=enabled", nvpair_name(elem));
			break;

		case ZPOOL_PROP_VERSION:
			intval = fnvpair_value_uint64(elem);
			/*
			 * The version is synced separately before other
			 * properties and should be correct by now.
			 */
			ASSERT3U(spa_version(spa), >=, intval);
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_READONLY:
		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'readonly' and 'cachefile' are also non-persistent
			 * properties.
			 */
			break;
		case ZPOOL_PROP_COMMENT:
			strval = fnvpair_value_string(elem);
			if (spa->spa_comment != NULL)
				spa_strfree(spa->spa_comment);
			spa->spa_comment = spa_strdup(strval);
			/*
			 * We need to dirty the configuration on all the vdevs
			 * so that their labels get updated. It's unnecessary
			 * to do this for pool creation since the vdev's
			 * configuration has already been dirtied.
			 */
			if (tx->tx_txg != TXG_INITIAL)
				vdev_config_dirty(spa->spa_root_vdev);
			spa_history_log_internal(spa, "set", tx,
			    "%s=%s", nvpair_name(elem), strval);
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
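			 * The object is created the first time a persistent
			 * property is set.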
6727185029Spjd */ 6728185029Spjd if (spa->spa_pool_props_object == 0) { 6729236884Smm spa->spa_pool_props_object = 6730236884Smm zap_create_link(mos, DMU_OT_POOL_PROPS, 6731185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6732236884Smm tx); 6733185029Spjd } 6734185029Spjd 6735185029Spjd /* normalize the property name */ 6736185029Spjd propname = zpool_prop_to_name(prop); 6737185029Spjd proptype = zpool_prop_get_type(prop); 6738185029Spjd 6739185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 6740185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 6741258717Savg strval = fnvpair_value_string(elem); 6742258717Savg VERIFY0(zap_update(mos, 6743185029Spjd spa->spa_pool_props_object, propname, 6744258717Savg 1, strlen(strval) + 1, strval, tx)); 6745248571Smm spa_history_log_internal(spa, "set", tx, 6746248571Smm "%s=%s", nvpair_name(elem), strval); 6747185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6748258717Savg intval = fnvpair_value_uint64(elem); 6749185029Spjd 6750185029Spjd if (proptype == PROP_TYPE_INDEX) { 6751185029Spjd const char *unused; 6752258717Savg VERIFY0(zpool_prop_index_to_string( 6753258717Savg prop, intval, &unused)); 6754185029Spjd } 6755258717Savg VERIFY0(zap_update(mos, 6756185029Spjd spa->spa_pool_props_object, propname, 6757258717Savg 8, 1, &intval, tx)); 6758248571Smm spa_history_log_internal(spa, "set", tx, 6759248571Smm "%s=%lld", nvpair_name(elem), intval); 6760185029Spjd } else { 6761185029Spjd ASSERT(0); /* not allowed */ 6762185029Spjd } 6763185029Spjd 6764185029Spjd switch (prop) { 6765185029Spjd case ZPOOL_PROP_DELEGATION: 6766185029Spjd spa->spa_delegation = intval; 6767185029Spjd break; 6768185029Spjd case ZPOOL_PROP_BOOTFS: 6769185029Spjd spa->spa_bootfs = intval; 6770185029Spjd break; 6771185029Spjd case ZPOOL_PROP_FAILUREMODE: 6772185029Spjd spa->spa_failmode = intval; 6773185029Spjd break; 6774219089Spjd case ZPOOL_PROP_AUTOEXPAND: 6775219089Spjd spa->spa_autoexpand = intval; 6776219089Spjd if (tx->tx_txg != TXG_INITIAL) 6777219089Spjd spa_async_request(spa, 6778219089Spjd SPA_ASYNC_AUTOEXPAND); 6779219089Spjd break; 6780219089Spjd case ZPOOL_PROP_DEDUPDITTO: 6781219089Spjd spa->spa_dedup_ditto = intval; 6782219089Spjd break; 6783185029Spjd default: 6784185029Spjd break; 6785185029Spjd } 6786168404Spjd } 6787185029Spjd 6788168404Spjd } 6789185029Spjd 6790185029Spjd mutex_exit(&spa->spa_props_lock); 6791168404Spjd} 6792168404Spjd 6793168404Spjd/* 6794219089Spjd * Perform one-time upgrade on-disk changes. spa_version() does not 6795219089Spjd * reflect the new version this txg, so there must be no changes this 6796219089Spjd * txg to anything that the upgrade code depends on after it executes. 6797219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync 6798219089Spjd * tasks. 
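 * The upgrades are performed with dp_config_rwlock held as writer.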
 */
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;

	ASSERT(spa->spa_sync_pass == 1);

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
		dsl_pool_upgrade_dir_clones(dp, tx);

		/* Keeping the freedir open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		spa_feature_create_zap_objects(spa, tx);
	}

	/*
	 * The behavior of the LZ4_COMPRESS feature was changed to
	 * activate_on_enable when the ability to use lz4 compression for
	 * metadata was added. Old pools that have this feature enabled
	 * must be upgraded to have it active.
	 */
	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		boolean_t lz4_en = spa_feature_is_enabled(spa,
		    SPA_FEATURE_LZ4_COMPRESS);
		boolean_t lz4_ac = spa_feature_is_active(spa,
		    SPA_FEATURE_LZ4_COMPRESS);

		if (lz4_en && !lz4_ac)
			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
	}

	/*
	 * If we haven't written the salt, do so now. Note that the
	 * feature may not be activated yet, but that's fine since
	 * the presence of this ZAP entry is backwards compatible.
	 */
	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
		VERIFY0(zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
		    sizeof (spa->spa_cksum_salt.zcs_bytes),
		    spa->spa_cksum_salt.zcs_bytes, tx));
	}

	rrw_exit(&dp->dp_config_rwlock, FTAG);
}

/*
 * Sync the specified transaction group. New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
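 * Convergence means the MOS objset is no longer dirty for this txg.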
6870168404Spjd */ 6871168404Spjdvoid 6872168404Spjdspa_sync(spa_t *spa, uint64_t txg) 6873168404Spjd{ 6874168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6875168404Spjd objset_t *mos = spa->spa_meta_objset; 6876219089Spjd bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6877168404Spjd vdev_t *rvd = spa->spa_root_vdev; 6878168404Spjd vdev_t *vd; 6879168404Spjd dmu_tx_t *tx; 6880185029Spjd int error; 6881307277Smav uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 6882307277Smav zfs_vdev_queue_depth_pct / 100; 6883168404Spjd 6884219089Spjd VERIFY(spa_writeable(spa)); 6885219089Spjd 6886168404Spjd /* 6887168404Spjd * Lock out configuration changes. 6888168404Spjd */ 6889185029Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6890168404Spjd 6891168404Spjd spa->spa_syncing_txg = txg; 6892168404Spjd spa->spa_sync_pass = 0; 6893168404Spjd 6894307277Smav mutex_enter(&spa->spa_alloc_lock); 6895307277Smav VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 6896307277Smav mutex_exit(&spa->spa_alloc_lock); 6897307277Smav 6898185029Spjd /* 6899185029Spjd * If there are any pending vdev state changes, convert them 6900185029Spjd * into config changes that go out with this transaction group. 6901185029Spjd */ 6902185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6903209962Smm while (list_head(&spa->spa_state_dirty_list) != NULL) { 6904209962Smm /* 6905209962Smm * We need the write lock here because, for aux vdevs, 6906209962Smm * calling vdev_config_dirty() modifies sav_config. 6907209962Smm * This is ugly and will become unnecessary when we 6908209962Smm * eliminate the aux vdev wart by integrating all vdevs 6909209962Smm * into the root vdev tree. 6910209962Smm */ 6911209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6912209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6913209962Smm while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6914209962Smm vdev_state_clean(vd); 6915209962Smm vdev_config_dirty(vd); 6916209962Smm } 6917209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6918209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6919185029Spjd } 6920185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6921185029Spjd 6922168404Spjd tx = dmu_tx_create_assigned(dp, txg); 6923168404Spjd 6924247265Smm spa->spa_sync_starttime = gethrtime(); 6925247265Smm#ifdef illumos 6926247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6927247265Smm spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6928277300Ssmh#else /* !illumos */ 6929247265Smm#ifdef _KERNEL 6930314665Savg callout_schedule(&spa->spa_deadman_cycid, 6931314665Savg hz * spa->spa_deadman_synctime / NANOSEC); 6932247265Smm#endif 6933277300Ssmh#endif /* illumos */ 6934247265Smm 6935168404Spjd /* 6936185029Spjd * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6937168404Spjd * set spa_deflate if we have no raid-z vdevs. 
6938168404Spjd */ 6939185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6940185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6941168404Spjd int i; 6942168404Spjd 6943168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 6944168404Spjd vd = rvd->vdev_child[i]; 6945168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6946168404Spjd break; 6947168404Spjd } 6948168404Spjd if (i == rvd->vdev_children) { 6949168404Spjd spa->spa_deflate = TRUE; 6950168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 6951168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6952168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6953168404Spjd } 6954168404Spjd } 6955168404Spjd 6956168404Spjd /* 6957307277Smav * Set the top-level vdev's max queue depth. Evaluate each 6958307277Smav * top-level's async write queue depth in case it changed. 6959307277Smav * The max queue depth will not change in the middle of syncing 6960307277Smav * out this txg. 6961307277Smav */ 6962307277Smav uint64_t queue_depth_total = 0; 6963307277Smav for (int c = 0; c < rvd->vdev_children; c++) { 6964307277Smav vdev_t *tvd = rvd->vdev_child[c]; 6965307277Smav metaslab_group_t *mg = tvd->vdev_mg; 6966307277Smav 6967307277Smav if (mg == NULL || mg->mg_class != spa_normal_class(spa) || 6968307277Smav !metaslab_group_initialized(mg)) 6969307277Smav continue; 6970307277Smav 6971307277Smav /* 6972307277Smav * It is safe to do a lock-free check here because only async 6973307277Smav * allocations look at mg_max_alloc_queue_depth, and async 6974307277Smav * allocations all happen from spa_sync(). 6975307277Smav */ 6976307277Smav ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); 6977307277Smav mg->mg_max_alloc_queue_depth = max_queue_depth; 6978307277Smav queue_depth_total += mg->mg_max_alloc_queue_depth; 6979307277Smav } 6980307277Smav metaslab_class_t *mc = spa_normal_class(spa); 6981307277Smav ASSERT0(refcount_count(&mc->mc_alloc_slots)); 6982307277Smav mc->mc_alloc_max_slots = queue_depth_total; 6983307277Smav mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 6984307277Smav 6985307277Smav ASSERT3U(mc->mc_alloc_max_slots, <=, 6986307277Smav max_queue_depth * rvd->vdev_children); 6987307277Smav 6988307277Smav /* 6989168404Spjd * Iterate to convergence. 6990168404Spjd */ 6991168404Spjd do { 6992219089Spjd int pass = ++spa->spa_sync_pass; 6993168404Spjd 6994168404Spjd spa_sync_config_object(spa, tx); 6995185029Spjd spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6996185029Spjd ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6997185029Spjd spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6998185029Spjd ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6999168404Spjd spa_errlog_sync(spa, txg); 7000168404Spjd dsl_pool_sync(dp, txg); 7001168404Spjd 7002243503Smm if (pass < zfs_sync_pass_deferred_free) { 7003258632Savg spa_sync_frees(spa, free_bpl, tx); 7004219089Spjd } else { 7005275781Sdelphij /* 7006275781Sdelphij * We can not defer frees in pass 1, because 7007275781Sdelphij * we sync the deferred frees later in pass 1. 
7008275781Sdelphij */ 7009275781Sdelphij ASSERT3U(pass, >, 1); 7010219089Spjd bplist_iterate(free_bpl, bpobj_enqueue_cb, 7011258632Savg &spa->spa_deferred_bpobj, tx); 7012168404Spjd } 7013168404Spjd 7014219089Spjd ddt_sync(spa, txg); 7015219089Spjd dsl_scan_sync(dp, tx); 7016168404Spjd 7017219089Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 7018219089Spjd vdev_sync(vd, txg); 7019168404Spjd 7020275781Sdelphij if (pass == 1) { 7021219089Spjd spa_sync_upgrades(spa, tx); 7022275781Sdelphij ASSERT3U(txg, >=, 7023275781Sdelphij spa->spa_uberblock.ub_rootbp.blk_birth); 7024275781Sdelphij /* 7025275781Sdelphij * Note: We need to check if the MOS is dirty 7026275781Sdelphij * because we could have marked the MOS dirty 7027275781Sdelphij * without updating the uberblock (e.g. if we 7028275781Sdelphij * have sync tasks but no dirty user data). We 7029275781Sdelphij * need to check the uberblock's rootbp because 7030275781Sdelphij * it is updated if we have synced out dirty 7031275781Sdelphij * data (though in this case the MOS will most 7032275781Sdelphij * likely also be dirty due to second order 7033275781Sdelphij * effects, we don't want to rely on that here). 7034275781Sdelphij */ 7035275781Sdelphij if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 7036275781Sdelphij !dmu_objset_is_dirty(mos, txg)) { 7037275781Sdelphij /* 7038275781Sdelphij * Nothing changed on the first pass, 7039275781Sdelphij * therefore this TXG is a no-op. Avoid 7040275781Sdelphij * syncing deferred frees, so that we 7041275781Sdelphij * can keep this TXG as a no-op. 7042275781Sdelphij */ 7043275781Sdelphij ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 7044275781Sdelphij txg)); 7045275781Sdelphij ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 7046275781Sdelphij ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 7047275781Sdelphij break; 7048275781Sdelphij } 7049275781Sdelphij spa_sync_deferred_frees(spa, tx); 7050275781Sdelphij } 7051168404Spjd 7052219089Spjd } while (dmu_objset_is_dirty(mos, txg)); 7053219089Spjd 7054299441Smav if (!list_is_empty(&spa->spa_config_dirty_list)) { 7055299441Smav /* 7056299441Smav * Make sure that the number of ZAPs for all the vdevs matches 7057299441Smav * the number of ZAPs in the per-vdev ZAP list. This only gets 7058299441Smav * called if the config is dirty; otherwise there may be 7059299441Smav * outstanding AVZ operations that weren't completed in 7060299441Smav * spa_sync_config_object. 7061299441Smav */ 7062299441Smav uint64_t all_vdev_zap_entry_count; 7063299441Smav ASSERT0(zap_count(spa->spa_meta_objset, 7064299441Smav spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 7065299441Smav ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 7066299441Smav all_vdev_zap_entry_count); 7067299441Smav } 7068299441Smav 7069168404Spjd /* 7070168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 7071168404Spjd * to commit the transaction group. 7072168404Spjd * 7073185029Spjd * If there are no dirty vdevs, we sync the uberblock to a few 7074185029Spjd * random top-level vdevs that are known to be visible in the 7075185029Spjd * config cache (see spa_vdev_add() for a complete description). 7076185029Spjd * If there *are* dirty vdevs, sync the uberblock to all vdevs. 7077168404Spjd */ 7078185029Spjd for (;;) { 7079185029Spjd /* 7080185029Spjd * We hold SCL_STATE to prevent vdev open/close/etc. 7081185029Spjd * while we're attempting to write the vdev labels. 
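 * If the label writes fail we suspend the ZIO pipeline, wait for it to
 * resume, and retry (see the bottom of this loop).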
7082185029Spjd */ 7083185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 7084168404Spjd 7085185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) { 7086185029Spjd vdev_t *svd[SPA_DVAS_PER_BP]; 7087185029Spjd int svdcount = 0; 7088185029Spjd int children = rvd->vdev_children; 7089185029Spjd int c0 = spa_get_random(children); 7090185029Spjd 7091219089Spjd for (int c = 0; c < children; c++) { 7092185029Spjd vd = rvd->vdev_child[(c0 + c) % children]; 7093185029Spjd if (vd->vdev_ms_array == 0 || vd->vdev_islog) 7094185029Spjd continue; 7095185029Spjd svd[svdcount++] = vd; 7096185029Spjd if (svdcount == SPA_DVAS_PER_BP) 7097185029Spjd break; 7098185029Spjd } 7099294811Smav error = vdev_config_sync(svd, svdcount, txg); 7100185029Spjd } else { 7101185029Spjd error = vdev_config_sync(rvd->vdev_child, 7102294811Smav rvd->vdev_children, txg); 7103168404Spjd } 7104185029Spjd 7105239620Smm if (error == 0) 7106239620Smm spa->spa_last_synced_guid = rvd->vdev_guid; 7107239620Smm 7108185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 7109185029Spjd 7110185029Spjd if (error == 0) 7111185029Spjd break; 7112185029Spjd zio_suspend(spa, NULL); 7113185029Spjd zio_resume_wait(spa); 7114168404Spjd } 7115168404Spjd dmu_tx_commit(tx); 7116168404Spjd 7117247265Smm#ifdef illumos 7118247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 7119277300Ssmh#else /* !illumos */ 7120247265Smm#ifdef _KERNEL 7121247265Smm callout_drain(&spa->spa_deadman_cycid); 7122247265Smm#endif 7123277300Ssmh#endif /* illumos */ 7124247265Smm 7125168404Spjd /* 7126168404Spjd * Clear the dirty config list. 7127168404Spjd */ 7128185029Spjd while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 7129168404Spjd vdev_config_clean(vd); 7130168404Spjd 7131168404Spjd /* 7132168404Spjd * Now that the new config has synced transactionally, 7133168404Spjd * let it become visible to the config cache. 7134168404Spjd */ 7135168404Spjd if (spa->spa_config_syncing != NULL) { 7136168404Spjd spa_config_set(spa, spa->spa_config_syncing); 7137168404Spjd spa->spa_config_txg = txg; 7138168404Spjd spa->spa_config_syncing = NULL; 7139168404Spjd } 7140168404Spjd 7141219089Spjd dsl_pool_sync_done(dp, txg); 7142168404Spjd 7143307277Smav mutex_enter(&spa->spa_alloc_lock); 7144307277Smav VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 7145307277Smav mutex_exit(&spa->spa_alloc_lock); 7146307277Smav 7147168404Spjd /* 7148168404Spjd * Update usable space statistics. 7149168404Spjd */ 7150168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 7151168404Spjd vdev_sync_done(vd, txg); 7152168404Spjd 7153219089Spjd spa_update_dspace(spa); 7154219089Spjd 7155168404Spjd /* 7156168404Spjd * It had better be the case that we didn't dirty anything 7157168404Spjd * since vdev_config_sync(). 7158168404Spjd */ 7159168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 7160168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 7161168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 7162168404Spjd 7163219089Spjd spa->spa_sync_pass = 0; 7164219089Spjd 7165310515Savg /* 7166310515Savg * Update the last synced uberblock here. We want to do this at 7167310515Savg * the end of spa_sync() so that consumers of spa_last_synced_txg() 7168310515Savg * will be guaranteed that all the processing associated with 7169310515Savg * that txg has been completed. 
7170310515Savg */ 7171310515Savg spa->spa_ubsync = spa->spa_uberblock; 7172185029Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 7173168404Spjd 7174219089Spjd spa_handle_ignored_writes(spa); 7175219089Spjd 7176168404Spjd /* 7177168404Spjd * If any async tasks have been requested, kick them off. 7178168404Spjd */ 7179168404Spjd spa_async_dispatch(spa); 7180253990Smav spa_async_dispatch_vd(spa); 7181168404Spjd} 7182168404Spjd 7183168404Spjd/* 7184168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 7185168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 7186168404Spjd * sync. 7187168404Spjd */ 7188168404Spjdvoid 7189168404Spjdspa_sync_allpools(void) 7190168404Spjd{ 7191168404Spjd spa_t *spa = NULL; 7192168404Spjd mutex_enter(&spa_namespace_lock); 7193168404Spjd while ((spa = spa_next(spa)) != NULL) { 7194219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 7195219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 7196168404Spjd continue; 7197168404Spjd spa_open_ref(spa, FTAG); 7198168404Spjd mutex_exit(&spa_namespace_lock); 7199168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 7200168404Spjd mutex_enter(&spa_namespace_lock); 7201168404Spjd spa_close(spa, FTAG); 7202168404Spjd } 7203168404Spjd mutex_exit(&spa_namespace_lock); 7204168404Spjd} 7205168404Spjd 7206168404Spjd/* 7207168404Spjd * ========================================================================== 7208168404Spjd * Miscellaneous routines 7209168404Spjd * ========================================================================== 7210168404Spjd */ 7211168404Spjd 7212168404Spjd/* 7213168404Spjd * Remove all pools in the system. 7214168404Spjd */ 7215168404Spjdvoid 7216168404Spjdspa_evict_all(void) 7217168404Spjd{ 7218168404Spjd spa_t *spa; 7219168404Spjd 7220168404Spjd /* 7221168404Spjd * Remove all cached state. All pools should be closed now, 7222168404Spjd * so every spa in the AVL tree should be unreferenced. 7223168404Spjd */ 7224168404Spjd mutex_enter(&spa_namespace_lock); 7225168404Spjd while ((spa = spa_next(NULL)) != NULL) { 7226168404Spjd /* 7227168404Spjd * Stop async tasks. The async thread may need to detach 7228168404Spjd * a device that's been replaced, which requires grabbing 7229168404Spjd * spa_namespace_lock, so we must drop it here. 
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * Find a vdev in the pool by guid.  If 'aux' is set, the L2ARC cache and
 * hot spare auxiliary vdev lists are searched as well.
 */
vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

/*
 * Raise the pool's on-disk version, dirty the vdev config, and wait for the
 * change to be synced out.
 */
void
spa_upgrade(spa_t *spa, uint64_t version)
{
	ASSERT(spa_writeable(spa));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

/*
 * Check whether the given guid refers to one of this pool's hot spares,
 * either already configured or still pending addition.
 */
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2, as a spare and as a
 * replacement.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Allocate a sysevent describing the given pool and, optionally, a vdev and a
 * history nvlist.  Returns NULL in userland builds.
 */
static sysevent_t *
spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	sysevent_t *ev = NULL;
#ifdef _KERNEL
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);
	ASSERT(ev != NULL);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (hist_nvl != NULL) {
		fnvlist_merge((nvlist_t *)attr, hist_nvl);
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

done:
	if (attr)
		sysevent_free_attr(attr);

#endif
	return (ev);
}

/*
 * Log a previously created sysevent and free it.  No-op in userland builds.
 */
static void
spa_event_post(sysevent_t *ev)
{
#ifdef _KERNEL
	sysevent_id_t eid;

	(void) log_sysevent(ev, SE_SLEEP, &eid);
	sysevent_free(ev);
#endif
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev and history nvl.  This
 * doesn't do anything in the userland libzpool, as we don't want consumers to
 * misinterpret ztest or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
}
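/*
 * Editorial illustration (not part of the original source): a minimal sketch
 * of how spa_event_notify() is typically invoked.  The event names below are
 * assumed to be among the EC_ZFS definitions in sys/sysevent/eventdefs.h.
 *
 * A pool-scoped event passes no vdev, so only the pool name and guid appear
 * in the payload:
 *
 *	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 *
 * A vdev-scoped event passes the affected vdev so that its guid (and path,
 * if set) are added to the payload as well:
 *
 *	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
 */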