spa.c revision 286575
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
    "Check hostid on import?");
TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
    &zfs_ccw_retry_interval, 0,
    "Configuration cache file write, retry after failure, interval (seconds)");

typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL },	/* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) },	/* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* IOCTL */
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

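/*
 * The nvlist built by spa_prop_add_list() nests each property under its own
 * name; an illustrative sketch of the resulting layout:
 *
 *	"<propname>" -> {
 *		ZPROP_SOURCE -> <src>
 *		ZPROP_VALUE  -> <strval, or intval when strval is NULL>
 *	}
 */
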
/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools before this version, freedir will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

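/*
 * spa_prop_get() allocates *nvp itself, so a caller passes a bare pointer
 * and frees the result; an illustrative sketch:
 *
 *	nvlist_t *props = NULL;
 *	if (spa_prop_get(spa, &props) == 0) {
 *		... consume props ...
 *		nvlist_free(props);
 *	}
 */
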
/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t propval;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/*
				 * Must be ZPL, and its property settings
				 * must be supported by GRUB (compression
				 * is not gzip, and large blocks are not used).
				 */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &propval)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(propval)) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
				    &propval)) == 0 &&
				    propval > SPA_OLD_MAXBLOCKSIZE) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check. For this kernel check, we merely
				 * check ASCII apart from DEL. Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

/*
 * Record the requested cache file for this pool: an empty string means the
 * default cache file, "none" means no cache file. Optionally request an
 * async config update to rewrite the cache file(s).
 */
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

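/*
 * spa_change_guid_check() and spa_change_guid_sync() are the check/sync
 * callback pair that spa_change_guid() below hands to dsl_sync_task(): the
 * check rejects the change unless the root vdev is healthy, and the sync
 * rewrites the root vdev guid and guid sum in syncing context.
 */
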
/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

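/*
 * spa_error_entry_compare() is the AVL comparator for the per-pool error
 * lists (spa_errlist_scrub and spa_errlist_last): entries are ordered by a
 * raw bcmp() of their bookmarks, which only needs to yield a consistent
 * total order, not a semantically meaningful one.
 */
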
/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

#ifdef SYSDC
		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
#endif
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly lower priority
			 * than the other taskqs.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
				pri--;

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
#ifdef SYSDC
		}
#endif

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT0(tqs->stqs_count);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
#ifdef _KERNEL
		tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
#else
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	spa_evicting_os_wait(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

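/*
 * Rough call flow (illustrative): spa_open() and spa_import() activate a
 * pool with spa_activate() and then load it, building the vdev tree via
 * spa_config_parse() below; export, destroy, and failed loads unwind
 * through spa_unload() and spa_deactivate().
 */
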
/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
1559185029Spjd */ 1560185029Spjd spa_l2cache_add(vd); 1561185029Spjd 1562185029Spjd vd->vdev_top = vd; 1563185029Spjd vd->vdev_aux = sav; 1564185029Spjd 1565185029Spjd spa_l2cache_activate(vd); 1566185029Spjd 1567185029Spjd if (vdev_open(vd) != 0) 1568185029Spjd continue; 1569185029Spjd 1570185029Spjd (void) vdev_validate_aux(vd); 1571185029Spjd 1572219089Spjd if (!vdev_is_dead(vd)) 1573219089Spjd l2arc_add_vdev(spa, vd); 1574185029Spjd } 1575185029Spjd } 1576185029Spjd 1577185029Spjd /* 1578185029Spjd * Purge vdevs that were dropped 1579185029Spjd */ 1580185029Spjd for (i = 0; i < oldnvdevs; i++) { 1581185029Spjd uint64_t pool; 1582185029Spjd 1583185029Spjd vd = oldvdevs[i]; 1584185029Spjd if (vd != NULL) { 1585230514Smm ASSERT(vd->vdev_isl2cache); 1586230514Smm 1587209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1588209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1589185029Spjd l2arc_remove_vdev(vd); 1590230514Smm vdev_clear_stats(vd); 1591230514Smm vdev_free(vd); 1592185029Spjd } 1593185029Spjd } 1594185029Spjd 1595185029Spjd if (oldvdevs) 1596185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1597185029Spjd 1598185029Spjd if (sav->sav_config == NULL) 1599185029Spjd goto out; 1600185029Spjd 1601185029Spjd sav->sav_vdevs = newvdevs; 1602185029Spjd sav->sav_count = (int)nl2cache; 1603185029Spjd 1604185029Spjd /* 1605185029Spjd * Recompute the stashed list of l2cache devices, with status 1606185029Spjd * information this time. 1607185029Spjd */ 1608185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1609185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1610185029Spjd 1611185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1612185029Spjd for (i = 0; i < sav->sav_count; i++) 1613185029Spjd l2cache[i] = vdev_config_generate(spa, 1614219089Spjd sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1615185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1616185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1617185029Spjdout: 1618185029Spjd for (i = 0; i < sav->sav_count; i++) 1619185029Spjd nvlist_free(l2cache[i]); 1620185029Spjd if (sav->sav_count) 1621185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1622185029Spjd} 1623185029Spjd 1624168404Spjdstatic int 1625168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1626168404Spjd{ 1627168404Spjd dmu_buf_t *db; 1628168404Spjd char *packed = NULL; 1629168404Spjd size_t nvsize = 0; 1630168404Spjd int error; 1631168404Spjd *value = NULL; 1632168404Spjd 1633262676Sdelphij error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 1634262676Sdelphij if (error != 0) 1635262676Sdelphij return (error); 1636168404Spjd nvsize = *(uint64_t *)db->db_data; 1637168404Spjd dmu_buf_rele(db, FTAG); 1638168404Spjd 1639168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1640209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1641209962Smm DMU_READ_PREFETCH); 1642168404Spjd if (error == 0) 1643168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1644168404Spjd kmem_free(packed, nvsize); 1645168404Spjd 1646168404Spjd return (error); 1647168404Spjd} 1648168404Spjd 1649168404Spjd/* 1650185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1651185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 
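 * The check recurses through all children, so a single call on a top-level
 * vdev covers every leaf vdev below it.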
1652185029Spjd */ 1653185029Spjdstatic void 1654185029Spjdspa_check_removed(vdev_t *vd) 1655185029Spjd{ 1656219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1657185029Spjd spa_check_removed(vd->vdev_child[c]); 1658185029Spjd 1659249188Smm if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 1660249188Smm !vd->vdev_ishole) { 1661185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1662185029Spjd spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1663185029Spjd } 1664185029Spjd} 1665185029Spjd 1666185029Spjd/* 1667219089Spjd * Validate the current config against the MOS config 1668213197Smm */ 1669219089Spjdstatic boolean_t 1670219089Spjdspa_config_valid(spa_t *spa, nvlist_t *config) 1671213197Smm{ 1672219089Spjd vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1673219089Spjd nvlist_t *nv; 1674213197Smm 1675219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1676213197Smm 1677219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1678219089Spjd VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1679219089Spjd 1680219089Spjd ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1681219089Spjd 1682219089Spjd /* 1683219089Spjd * If we're doing a normal import, then build up any additional 1684219089Spjd * diagnostic information about missing devices in this config. 1685219089Spjd * We'll pass this up to the user for further processing. 1686219089Spjd */ 1687219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1688219089Spjd nvlist_t **child, *nv; 1689219089Spjd uint64_t idx = 0; 1690219089Spjd 1691219089Spjd child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1692219089Spjd KM_SLEEP); 1693219089Spjd VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1694219089Spjd 1695219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1696219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1697219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1698219089Spjd 1699219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1700219089Spjd mtvd->vdev_ops != &vdev_missing_ops && 1701219089Spjd mtvd->vdev_islog) 1702219089Spjd child[idx++] = vdev_config_generate(spa, mtvd, 1703219089Spjd B_FALSE, 0); 1704219089Spjd } 1705219089Spjd 1706219089Spjd if (idx) { 1707219089Spjd VERIFY(nvlist_add_nvlist_array(nv, 1708219089Spjd ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1709219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1710219089Spjd ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1711219089Spjd 1712219089Spjd for (int i = 0; i < idx; i++) 1713219089Spjd nvlist_free(child[i]); 1714219089Spjd } 1715219089Spjd nvlist_free(nv); 1716219089Spjd kmem_free(child, rvd->vdev_children * sizeof (char **)); 1717219089Spjd } 1718219089Spjd 1719219089Spjd /* 1720219089Spjd * Compare the root vdev tree with the information we have 1721219089Spjd * from the MOS config (mrvd). Check each top-level vdev 1722219089Spjd * with the corresponding MOS config top-level (mtvd). 1723219089Spjd */ 1724219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1725213197Smm vdev_t *tvd = rvd->vdev_child[c]; 1726219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1727213197Smm 1728219089Spjd /* 1729219089Spjd * Resolve any "missing" vdevs in the current configuration. 1730219089Spjd * If we find that the MOS config has more accurate information 1731219089Spjd * about the top-level vdev then use that vdev instead. 
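		 * Note that only a missing log device is swapped in here; a
		 * missing data device is left alone (see the XXX note below).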
1732219089Spjd */ 1733219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1734219089Spjd mtvd->vdev_ops != &vdev_missing_ops) { 1735219089Spjd 1736219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1737219089Spjd continue; 1738219089Spjd 1739219089Spjd /* 1740219089Spjd * Device specific actions. 1741219089Spjd */ 1742219089Spjd if (mtvd->vdev_islog) { 1743219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 1744219089Spjd } else { 1745219089Spjd /* 1746219089Spjd * XXX - once we have 'readonly' pool 1747219089Spjd * support we should be able to handle 1748219089Spjd * missing data devices by transitioning 1749219089Spjd * the pool to readonly. 1750219089Spjd */ 1751219089Spjd continue; 1752219089Spjd } 1753219089Spjd 1754219089Spjd /* 1755219089Spjd * Swap the missing vdev with the data we were 1756219089Spjd * able to obtain from the MOS config. 1757219089Spjd */ 1758219089Spjd vdev_remove_child(rvd, tvd); 1759219089Spjd vdev_remove_child(mrvd, mtvd); 1760219089Spjd 1761219089Spjd vdev_add_child(rvd, mtvd); 1762219089Spjd vdev_add_child(mrvd, tvd); 1763219089Spjd 1764219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1765219089Spjd vdev_load(mtvd); 1766219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1767219089Spjd 1768219089Spjd vdev_reopen(rvd); 1769219089Spjd } else if (mtvd->vdev_islog) { 1770219089Spjd /* 1771219089Spjd * Load the slog device's state from the MOS config 1772219089Spjd * since it's possible that the label does not 1773219089Spjd * contain the most up-to-date information. 1774219089Spjd */ 1775219089Spjd vdev_load_log_state(tvd, mtvd); 1776219089Spjd vdev_reopen(tvd); 1777219089Spjd } 1778213197Smm } 1779219089Spjd vdev_free(mrvd); 1780219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1781219089Spjd 1782219089Spjd /* 1783219089Spjd * Ensure we were able to validate the config. 
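	 * A guid sum that still disagrees with the uberblock means at least
	 * one device is missing or misidentified.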
1784219089Spjd */ 1785219089Spjd return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1786213197Smm} 1787213197Smm 1788213197Smm/* 1789185029Spjd * Check for missing log devices 1790185029Spjd */ 1791248571Smmstatic boolean_t 1792185029Spjdspa_check_logs(spa_t *spa) 1793185029Spjd{ 1794248571Smm boolean_t rv = B_FALSE; 1795248571Smm 1796185029Spjd switch (spa->spa_log_state) { 1797185029Spjd case SPA_LOG_MISSING: 1798185029Spjd /* need to recheck in case slog has been restored */ 1799185029Spjd case SPA_LOG_UNKNOWN: 1800248571Smm rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain, 1801248571Smm NULL, DS_FIND_CHILDREN) != 0); 1802248571Smm if (rv) 1803219089Spjd spa_set_log_state(spa, SPA_LOG_MISSING); 1804185029Spjd break; 1805185029Spjd } 1806248571Smm return (rv); 1807185029Spjd} 1808185029Spjd 1809219089Spjdstatic boolean_t 1810219089Spjdspa_passivate_log(spa_t *spa) 1811219089Spjd{ 1812219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1813219089Spjd boolean_t slog_found = B_FALSE; 1814219089Spjd 1815219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1816219089Spjd 1817219089Spjd if (!spa_has_slogs(spa)) 1818219089Spjd return (B_FALSE); 1819219089Spjd 1820219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1821219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1822219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1823219089Spjd 1824219089Spjd if (tvd->vdev_islog) { 1825219089Spjd metaslab_group_passivate(mg); 1826219089Spjd slog_found = B_TRUE; 1827219089Spjd } 1828219089Spjd } 1829219089Spjd 1830219089Spjd return (slog_found); 1831219089Spjd} 1832219089Spjd 1833219089Spjdstatic void 1834219089Spjdspa_activate_log(spa_t *spa) 1835219089Spjd{ 1836219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1837219089Spjd 1838219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1839219089Spjd 1840219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1841219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1842219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1843219089Spjd 1844219089Spjd if (tvd->vdev_islog) 1845219089Spjd metaslab_group_activate(mg); 1846219089Spjd } 1847219089Spjd} 1848219089Spjd 1849219089Spjdint 1850219089Spjdspa_offline_log(spa_t *spa) 1851219089Spjd{ 1852248571Smm int error; 1853219089Spjd 1854248571Smm error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1855248571Smm NULL, DS_FIND_CHILDREN); 1856248571Smm if (error == 0) { 1857219089Spjd /* 1858219089Spjd * We successfully offlined the log device, sync out the 1859219089Spjd * current txg so that the "stubby" block can be removed 1860219089Spjd * by zil_sync(). 
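		 * The txg_wait_synced() call below does not return until
		 * that txg is safely on disk.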
1861219089Spjd */ 1862219089Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1863219089Spjd } 1864219089Spjd return (error); 1865219089Spjd} 1866219089Spjd 1867219089Spjdstatic void 1868219089Spjdspa_aux_check_removed(spa_aux_vdev_t *sav) 1869219089Spjd{ 1870219089Spjd int i; 1871219089Spjd 1872219089Spjd for (i = 0; i < sav->sav_count; i++) 1873219089Spjd spa_check_removed(sav->sav_vdevs[i]); 1874219089Spjd} 1875219089Spjd 1876219089Spjdvoid 1877219089Spjdspa_claim_notify(zio_t *zio) 1878219089Spjd{ 1879219089Spjd spa_t *spa = zio->io_spa; 1880219089Spjd 1881219089Spjd if (zio->io_error) 1882219089Spjd return; 1883219089Spjd 1884219089Spjd mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1885219089Spjd if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1886219089Spjd spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1887219089Spjd mutex_exit(&spa->spa_props_lock); 1888219089Spjd} 1889219089Spjd 1890219089Spjdtypedef struct spa_load_error { 1891219089Spjd uint64_t sle_meta_count; 1892219089Spjd uint64_t sle_data_count; 1893219089Spjd} spa_load_error_t; 1894219089Spjd 1895219089Spjdstatic void 1896219089Spjdspa_load_verify_done(zio_t *zio) 1897219089Spjd{ 1898219089Spjd blkptr_t *bp = zio->io_bp; 1899219089Spjd spa_load_error_t *sle = zio->io_private; 1900219089Spjd dmu_object_type_t type = BP_GET_TYPE(bp); 1901219089Spjd int error = zio->io_error; 1902268720Sdelphij spa_t *spa = zio->io_spa; 1903219089Spjd 1904219089Spjd if (error) { 1905236884Smm if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 1906219089Spjd type != DMU_OT_INTENT_LOG) 1907270247Sdelphij atomic_inc_64(&sle->sle_meta_count); 1908219089Spjd else 1909270247Sdelphij atomic_inc_64(&sle->sle_data_count); 1910219089Spjd } 1911219089Spjd zio_data_buf_free(zio->io_data, zio->io_size); 1912268720Sdelphij 1913268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 1914268720Sdelphij spa->spa_scrub_inflight--; 1915268720Sdelphij cv_broadcast(&spa->spa_scrub_io_cv); 1916268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 1917219089Spjd} 1918219089Spjd 1919268720Sdelphij/* 1920268720Sdelphij * Maximum number of concurrent scrub i/os to create while verifying 1921268720Sdelphij * a pool while importing it. 
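 *
 * The limit can be adjusted at runtime through the sysctl declared below,
 * e.g. (illustrative value only):
 *
 *	sysctl vfs.zfs.spa_load_verify_maxinflight=4096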
1922268720Sdelphij */ 1923268720Sdelphijint spa_load_verify_maxinflight = 10000; 1924268720Sdelphijboolean_t spa_load_verify_metadata = B_TRUE; 1925268720Sdelphijboolean_t spa_load_verify_data = B_TRUE; 1926268720Sdelphij 1927268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, 1928268720Sdelphij &spa_load_verify_maxinflight, 0, 1929268720Sdelphij "Maximum number of concurrent scrub I/Os to create while verifying a " 1930268720Sdelphij "pool while importing it"); 1931268720Sdelphij 1932268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, 1933268720Sdelphij &spa_load_verify_metadata, 0, 1934268720Sdelphij "Check metadata on import?"); 1935268720Sdelphij 1936268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, 1937268720Sdelphij &spa_load_verify_data, 0, 1938268720Sdelphij "Check user data on import?"); 1939268720Sdelphij 1940219089Spjd/*ARGSUSED*/ 1941219089Spjdstatic int 1942219089Spjdspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1943268123Sdelphij const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 1944219089Spjd{ 1945268720Sdelphij if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 1946268720Sdelphij return (0); 1947268720Sdelphij /* 1948268720Sdelphij * Note: normally this routine will not be called if 1949268720Sdelphij * spa_load_verify_metadata is not set. However, it may be useful 1950268720Sdelphij * to manually set the flag after the traversal has begun. 1951268720Sdelphij */ 1952268720Sdelphij if (!spa_load_verify_metadata) 1953268720Sdelphij return (0); 1954268720Sdelphij if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) 1955268720Sdelphij return (0); 1956219089Spjd 1957268720Sdelphij zio_t *rio = arg; 1958268720Sdelphij size_t size = BP_GET_PSIZE(bp); 1959268720Sdelphij void *data = zio_data_buf_alloc(size); 1960268720Sdelphij 1961268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 1962268720Sdelphij while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) 1963268720Sdelphij cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1964268720Sdelphij spa->spa_scrub_inflight++; 1965268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 1966268720Sdelphij 1967268720Sdelphij zio_nowait(zio_read(rio, spa, bp, data, size, 1968268720Sdelphij spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1969268720Sdelphij ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1970268720Sdelphij ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1971219089Spjd return (0); 1972219089Spjd} 1973219089Spjd 1974219089Spjdstatic int 1975219089Spjdspa_load_verify(spa_t *spa) 1976219089Spjd{ 1977219089Spjd zio_t *rio; 1978219089Spjd spa_load_error_t sle = { 0 }; 1979219089Spjd zpool_rewind_policy_t policy; 1980219089Spjd boolean_t verify_ok = B_FALSE; 1981268720Sdelphij int error = 0; 1982219089Spjd 1983219089Spjd zpool_get_rewind_policy(spa->spa_config, &policy); 1984219089Spjd 1985219089Spjd if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1986219089Spjd return (0); 1987219089Spjd 1988219089Spjd rio = zio_root(spa, NULL, &sle, 1989219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1990219089Spjd 1991268720Sdelphij if (spa_load_verify_metadata) { 1992268720Sdelphij error = traverse_pool(spa, spa->spa_verify_min_txg, 1993268720Sdelphij TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, 1994268720Sdelphij spa_load_verify_cb, rio); 1995268720Sdelphij } 1996219089Spjd 1997219089Spjd (void) zio_wait(rio); 1998219089Spjd 1999219089Spjd spa->spa_load_meta_errors = sle.sle_meta_count; 2000219089Spjd 
spa->spa_load_data_errors = sle.sle_data_count; 2001219089Spjd 2002219089Spjd if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 2003219089Spjd sle.sle_data_count <= policy.zrp_maxdata) { 2004219089Spjd int64_t loss = 0; 2005219089Spjd 2006219089Spjd verify_ok = B_TRUE; 2007219089Spjd spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2008219089Spjd spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2009219089Spjd 2010219089Spjd loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2011219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 2012219089Spjd ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 2013219089Spjd VERIFY(nvlist_add_int64(spa->spa_load_info, 2014219089Spjd ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 2015219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 2016219089Spjd ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 2017219089Spjd } else { 2018219089Spjd spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2019219089Spjd } 2020219089Spjd 2021219089Spjd if (error) { 2022219089Spjd if (error != ENXIO && error != EIO) 2023249195Smm error = SET_ERROR(EIO); 2024219089Spjd return (error); 2025219089Spjd } 2026219089Spjd 2027219089Spjd return (verify_ok ? 0 : EIO); 2028219089Spjd} 2029219089Spjd 2030185029Spjd/* 2031219089Spjd * Find a value in the pool props object. 2032168404Spjd */ 2033219089Spjdstatic void 2034219089Spjdspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2035219089Spjd{ 2036219089Spjd (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2037219089Spjd zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2038219089Spjd} 2039219089Spjd 2040219089Spjd/* 2041219089Spjd * Find a value in the pool directory object. 2042219089Spjd */ 2043168404Spjdstatic int 2044219089Spjdspa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 2045168404Spjd{ 2046219089Spjd return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2047219089Spjd name, sizeof (uint64_t), 1, val)); 2048219089Spjd} 2049168404Spjd 2050219089Spjdstatic int 2051219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2052219089Spjd{ 2053219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2054219089Spjd return (err); 2055219089Spjd} 2056219089Spjd 2057219089Spjd/* 2058219089Spjd * Fix up config after a partly-completed split. This is done with the 2059219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2060219089Spjd * pool have that entry in their config, but only the splitting one contains 2061219089Spjd * a list of all the guids of the vdevs that are being split off. 2062219089Spjd * 2063219089Spjd * This function determines what to do with that list: either rejoin 2064219089Spjd * all the disks to the pool, or complete the splitting process. To attempt 2065219089Spjd * the rejoin, each disk that is offlined is marked online again, and 2066219089Spjd * we do a reopen() call. If the vdev label for every disk that was 2067219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2068219089Spjd * then we call vdev_split() on each disk, and complete the split. 2069219089Spjd * 2070219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 2071219089Spjd * the original pool. 
2072219089Spjd */ 2073219089Spjdstatic void 2074219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 2075219089Spjd{ 2076219089Spjd uint_t extracted; 2077219089Spjd uint64_t *glist; 2078219089Spjd uint_t i, gcount; 2079219089Spjd nvlist_t *nvl; 2080219089Spjd vdev_t **vd; 2081219089Spjd boolean_t attempt_reopen; 2082219089Spjd 2083219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2084219089Spjd return; 2085219089Spjd 2086219089Spjd /* check that the config is complete */ 2087219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2088219089Spjd &glist, &gcount) != 0) 2089219089Spjd return; 2090219089Spjd 2091219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2092219089Spjd 2093219089Spjd /* attempt to online all the vdevs & validate */ 2094219089Spjd attempt_reopen = B_TRUE; 2095219089Spjd for (i = 0; i < gcount; i++) { 2096219089Spjd if (glist[i] == 0) /* vdev is hole */ 2097219089Spjd continue; 2098219089Spjd 2099219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2100219089Spjd if (vd[i] == NULL) { 2101219089Spjd /* 2102219089Spjd * Don't bother attempting to reopen the disks; 2103219089Spjd * just do the split. 2104219089Spjd */ 2105219089Spjd attempt_reopen = B_FALSE; 2106219089Spjd } else { 2107219089Spjd /* attempt to re-online it */ 2108219089Spjd vd[i]->vdev_offline = B_FALSE; 2109219089Spjd } 2110219089Spjd } 2111219089Spjd 2112219089Spjd if (attempt_reopen) { 2113219089Spjd vdev_reopen(spa->spa_root_vdev); 2114219089Spjd 2115219089Spjd /* check each device to see what state it's in */ 2116219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 2117219089Spjd if (vd[i] != NULL && 2118219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2119219089Spjd break; 2120219089Spjd ++extracted; 2121219089Spjd } 2122219089Spjd } 2123219089Spjd 2124209962Smm /* 2125219089Spjd * If every disk has been moved to the new pool, or if we never 2126219089Spjd * even attempted to look at them, then we split them off for 2127219089Spjd * good. 2128209962Smm */ 2129219089Spjd if (!attempt_reopen || gcount == extracted) { 2130219089Spjd for (i = 0; i < gcount; i++) 2131219089Spjd if (vd[i] != NULL) 2132219089Spjd vdev_split(vd[i]); 2133219089Spjd vdev_reopen(spa->spa_root_vdev); 2134219089Spjd } 2135209962Smm 2136219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 2137219089Spjd} 2138185029Spjd 2139219089Spjdstatic int 2140219089Spjdspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2141219089Spjd boolean_t mosconfig) 2142219089Spjd{ 2143219089Spjd nvlist_t *config = spa->spa_config; 2144219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 2145228103Smm char *comment; 2146219089Spjd int error; 2147219089Spjd uint64_t pool_guid; 2148219089Spjd nvlist_t *nvl; 2149168404Spjd 2150219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2151249195Smm return (SET_ERROR(EINVAL)); 2152168404Spjd 2153228103Smm ASSERT(spa->spa_comment == NULL); 2154228103Smm if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2155228103Smm spa->spa_comment = spa_strdup(comment); 2156228103Smm 2157168404Spjd /* 2158168404Spjd * Versioning wasn't explicitly added to the label until later, so if 2159168404Spjd * it's not present treat it as the initial version. 
2160168404Spjd */ 2161219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2162219089Spjd &spa->spa_ubsync.ub_version) != 0) 2163219089Spjd spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2164168404Spjd 2165168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2166168404Spjd &spa->spa_config_txg); 2167168404Spjd 2168168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2169168404Spjd spa_guid_exists(pool_guid, 0)) { 2170249195Smm error = SET_ERROR(EEXIST); 2171219089Spjd } else { 2172228103Smm spa->spa_config_guid = pool_guid; 2173219089Spjd 2174219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2175219089Spjd &nvl) == 0) { 2176219089Spjd VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2177219089Spjd KM_SLEEP) == 0); 2178219089Spjd } 2179219089Spjd 2180236884Smm nvlist_free(spa->spa_load_info); 2181236884Smm spa->spa_load_info = fnvlist_alloc(); 2182236884Smm 2183219089Spjd gethrestime(&spa->spa_loaded_ts); 2184219089Spjd error = spa_load_impl(spa, pool_guid, config, state, type, 2185219089Spjd mosconfig, &ereport); 2186168404Spjd } 2187168404Spjd 2188286575Smav /* 2189286575Smav * Don't count references from objsets that are already closed 2190286575Smav * and are making their way through the eviction process. 2191286575Smav */ 2192286575Smav spa_evicting_os_wait(spa); 2193219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 2194219089Spjd if (error) { 2195219089Spjd if (error != EEXIST) { 2196219089Spjd spa->spa_loaded_ts.tv_sec = 0; 2197219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 2198219089Spjd } 2199219089Spjd if (error != EBADF) { 2200219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2201219089Spjd } 2202219089Spjd } 2203219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2204219089Spjd spa->spa_ena = 0; 2205168404Spjd 2206219089Spjd return (error); 2207219089Spjd} 2208219089Spjd 2209219089Spjd/* 2210219089Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 2211219089Spjd * source of configuration information. 2212219089Spjd */ 2213219089Spjdstatic int 2214219089Spjdspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2215219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2216219089Spjd char **ereport) 2217219089Spjd{ 2218219089Spjd int error = 0; 2219219089Spjd nvlist_t *nvroot = NULL; 2220236884Smm nvlist_t *label; 2221219089Spjd vdev_t *rvd; 2222219089Spjd uberblock_t *ub = &spa->spa_uberblock; 2223219089Spjd uint64_t children, config_cache_txg = spa->spa_config_txg; 2224219089Spjd int orig_mode = spa->spa_mode; 2225219089Spjd int parse; 2226219089Spjd uint64_t obj; 2227236884Smm boolean_t missing_feat_write = B_FALSE; 2228219089Spjd 2229168404Spjd /* 2230219089Spjd * If this is an untrusted config, access the pool in read-only mode. 2231219089Spjd * This prevents things like resilvering recently removed devices. 2232219089Spjd */ 2233219089Spjd if (!mosconfig) 2234219089Spjd spa->spa_mode = FREAD; 2235219089Spjd 2236219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2237219089Spjd 2238219089Spjd spa->spa_load_state = state; 2239219089Spjd 2240219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2241249195Smm return (SET_ERROR(EINVAL)); 2242219089Spjd 2243219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 
2244219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2245219089Spjd 2246219089Spjd /* 2247209962Smm * Create "The Godfather" zio to hold all async IOs 2248209962Smm */ 2249272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2250272598Sdelphij KM_SLEEP); 2251272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 2252272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2253272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2254272598Sdelphij ZIO_FLAG_GODFATHER); 2255272598Sdelphij } 2256209962Smm 2257209962Smm /* 2258168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2259168404Spjd * value that will be returned by spa_version() since parsing the 2260168404Spjd * configuration requires knowing the version number. 2261168404Spjd */ 2262185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2263219089Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2264185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2265168404Spjd 2266168404Spjd if (error != 0) 2267219089Spjd return (error); 2268168404Spjd 2269168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2270284304Savg ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2271284304Savg ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2272168404Spjd 2273219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2274219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2275219089Spjd } 2276219089Spjd 2277168404Spjd /* 2278168404Spjd * Try to open all vdevs, loading each label in the process. 2279168404Spjd */ 2280185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2281168926Spjd error = vdev_open(rvd); 2282185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2283168926Spjd if (error != 0) 2284219089Spjd return (error); 2285168404Spjd 2286168404Spjd /* 2287209962Smm * We need to validate the vdev labels against the configuration that 2288209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 2289209962Smm * mosconfig is true then we're validating the vdev labels based on 2290219089Spjd * that config. Otherwise, we're validating against the cached config 2291209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 2292209962Smm * later we will recursively call spa_load() and validate against 2293209962Smm * the vdev config. 2294219089Spjd * 2295219089Spjd * If we're assembling a new pool that's been split off from an 2296219089Spjd * existing pool, the labels haven't yet been updated so we skip 2297219089Spjd * validation for now. 2298168404Spjd */ 2299219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2300219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2301230514Smm error = vdev_validate(rvd, mosconfig); 2302219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2303168404Spjd 2304219089Spjd if (error != 0) 2305219089Spjd return (error); 2306219089Spjd 2307219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2308249195Smm return (SET_ERROR(ENXIO)); 2309168404Spjd } 2310168404Spjd 2311168404Spjd /* 2312168404Spjd * Find the best uberblock. 2313168404Spjd */ 2314236884Smm vdev_uberblock_load(rvd, ub, &label); 2315168404Spjd 2316168404Spjd /* 2317168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2318168404Spjd */ 2319236884Smm if (ub->ub_txg == 0) { 2320236884Smm nvlist_free(label); 2321219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2322236884Smm } 2323168404Spjd 2324168404Spjd /* 2325236884Smm * If the pool has an unsupported version we can't open it. 
2326168404Spjd */ 2327236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2328236884Smm nvlist_free(label); 2329219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2330236884Smm } 2331168404Spjd 2332236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2333236884Smm nvlist_t *features; 2334236884Smm 2335236884Smm /* 2336236884Smm * If we weren't able to find what's necessary for reading the 2337236884Smm * MOS in the label, return failure. 2338236884Smm */ 2339236884Smm if (label == NULL || nvlist_lookup_nvlist(label, 2340236884Smm ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2341236884Smm nvlist_free(label); 2342236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2343236884Smm ENXIO)); 2344236884Smm } 2345236884Smm 2346236884Smm /* 2347236884Smm * Update our in-core representation with the definitive values 2348236884Smm * from the label. 2349236884Smm */ 2350236884Smm nvlist_free(spa->spa_label_features); 2351236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2352236884Smm } 2353236884Smm 2354236884Smm nvlist_free(label); 2355236884Smm 2356168404Spjd /* 2357236884Smm * Look through entries in the label nvlist's features_for_read. If 2358236884Smm * there is a feature listed there which we don't understand then we 2359236884Smm * cannot open a pool. 2360236884Smm */ 2361236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2362236884Smm nvlist_t *unsup_feat; 2363236884Smm 2364236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2365236884Smm 0); 2366236884Smm 2367236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2368236884Smm NULL); nvp != NULL; 2369236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2370236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2371236884Smm VERIFY(nvlist_add_string(unsup_feat, 2372236884Smm nvpair_name(nvp), "") == 0); 2373236884Smm } 2374236884Smm } 2375236884Smm 2376236884Smm if (!nvlist_empty(unsup_feat)) { 2377236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2378236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2379236884Smm nvlist_free(unsup_feat); 2380236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2381236884Smm ENOTSUP)); 2382236884Smm } 2383236884Smm 2384236884Smm nvlist_free(unsup_feat); 2385236884Smm } 2386236884Smm 2387236884Smm /* 2388168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 2389219089Spjd * incomplete configuration. We first check to see if the pool 2390219089Spjd * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2391219089Spjd * If it is, defer the vdev_guid_sum check till later so we 2392219089Spjd * can handle missing vdevs. 2393168404Spjd */ 2394219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2395219089Spjd &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2396219089Spjd rvd->vdev_guid_sum != ub->ub_guid_sum) 2397219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2398219089Spjd 2399219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2400219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2401219089Spjd spa_try_repair(spa, config); 2402219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2403219089Spjd nvlist_free(spa->spa_config_splitting); 2404219089Spjd spa->spa_config_splitting = NULL; 2405168404Spjd } 2406168404Spjd 2407168404Spjd /* 2408168404Spjd * Initialize internal SPA structures. 
2409168404Spjd */ 2410168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 2411168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2412219089Spjd spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2413219089Spjd TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2414219089Spjd spa->spa_first_txg = spa->spa_last_ubsync_txg ? 2415219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2416219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2417219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2418219089Spjd 2419236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2420219089Spjd if (error) 2421219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2422168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2423168404Spjd 2424219089Spjd if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2425219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2426168404Spjd 2427236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2428236884Smm boolean_t missing_feat_read = B_FALSE; 2429238926Smm nvlist_t *unsup_feat, *enabled_feat; 2430236884Smm 2431236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2432236884Smm &spa->spa_feat_for_read_obj) != 0) { 2433236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2434236884Smm } 2435236884Smm 2436236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2437236884Smm &spa->spa_feat_for_write_obj) != 0) { 2438236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2439236884Smm } 2440236884Smm 2441236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2442236884Smm &spa->spa_feat_desc_obj) != 0) { 2443236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2444236884Smm } 2445236884Smm 2446238926Smm enabled_feat = fnvlist_alloc(); 2447238926Smm unsup_feat = fnvlist_alloc(); 2448236884Smm 2449259813Sdelphij if (!spa_features_check(spa, B_FALSE, 2450238926Smm unsup_feat, enabled_feat)) 2451236884Smm missing_feat_read = B_TRUE; 2452236884Smm 2453236884Smm if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2454259813Sdelphij if (!spa_features_check(spa, B_TRUE, 2455238926Smm unsup_feat, enabled_feat)) { 2456236884Smm missing_feat_write = B_TRUE; 2457238926Smm } 2458236884Smm } 2459236884Smm 2460238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2461238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2462238926Smm 2463236884Smm if (!nvlist_empty(unsup_feat)) { 2464238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2465238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2466236884Smm } 2467236884Smm 2468238926Smm fnvlist_free(enabled_feat); 2469238926Smm fnvlist_free(unsup_feat); 2470236884Smm 2471236884Smm if (!missing_feat_read) { 2472236884Smm fnvlist_add_boolean(spa->spa_load_info, 2473236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2474236884Smm } 2475236884Smm 2476236884Smm /* 2477236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2478236884Smm * twofold: to determine whether the pool is available for 2479236884Smm * import in read-write mode and (if it is not) whether the 2480236884Smm * pool is available for import in read-only mode. If the pool 2481236884Smm * is available for import in read-write mode, it is displayed 2482236884Smm * as available in userland; if it is not available for import 2483236884Smm * in read-only mode, it is displayed as unavailable in 2484236884Smm * userland. 
If the pool is available for import in read-only 2485236884Smm * mode but not read-write mode, it is displayed as unavailable 2486236884Smm * in userland with a special note that the pool is actually 2487236884Smm * available for open in read-only mode. 2488236884Smm * 2489236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2490236884Smm * missing a feature for write, we must first determine whether 2491236884Smm * the pool can be opened read-only before returning to 2492236884Smm * userland in order to know whether to display the 2493236884Smm * abovementioned note. 2494236884Smm */ 2495236884Smm if (missing_feat_read || (missing_feat_write && 2496236884Smm spa_writeable(spa))) { 2497236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2498236884Smm ENOTSUP)); 2499236884Smm } 2500260150Sdelphij 2501260150Sdelphij /* 2502260150Sdelphij * Load refcounts for ZFS features from disk into an in-memory 2503260150Sdelphij * cache during SPA initialization. 2504260150Sdelphij */ 2505260150Sdelphij for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2506260150Sdelphij uint64_t refcount; 2507260150Sdelphij 2508260150Sdelphij error = feature_get_refcount_from_disk(spa, 2509260150Sdelphij &spa_feature_table[i], &refcount); 2510260150Sdelphij if (error == 0) { 2511260150Sdelphij spa->spa_feat_refcount_cache[i] = refcount; 2512260150Sdelphij } else if (error == ENOTSUP) { 2513260150Sdelphij spa->spa_feat_refcount_cache[i] = 2514260150Sdelphij SPA_FEATURE_DISABLED; 2515260150Sdelphij } else { 2516260150Sdelphij return (spa_vdev_err(rvd, 2517260150Sdelphij VDEV_AUX_CORRUPT_DATA, EIO)); 2518260150Sdelphij } 2519260150Sdelphij } 2520236884Smm } 2521236884Smm 2522260150Sdelphij if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2523260150Sdelphij if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2524268075Sdelphij &spa->spa_feat_enabled_txg_obj) != 0) 2525260150Sdelphij return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2526260150Sdelphij } 2527260150Sdelphij 2528236884Smm spa->spa_is_initializing = B_TRUE; 2529236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2530236884Smm spa->spa_is_initializing = B_FALSE; 2531236884Smm if (error != 0) 2532236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2533236884Smm 2534168404Spjd if (!mosconfig) { 2535168498Spjd uint64_t hostid; 2536219089Spjd nvlist_t *policy = NULL, *nvconfig; 2537168404Spjd 2538219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2539219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2540168404Spjd 2541219089Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2542185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2543168498Spjd char *hostname; 2544168498Spjd unsigned long myhostid = 0; 2545168498Spjd 2546219089Spjd VERIFY(nvlist_lookup_string(nvconfig, 2547168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2548168498Spjd 2549219089Spjd#ifdef _KERNEL 2550219089Spjd myhostid = zone_get_hostid(NULL); 2551219089Spjd#else /* _KERNEL */ 2552219089Spjd /* 2553219089Spjd * We're emulating the system's hostid in userland, so 2554219089Spjd * we can't use zone_get_hostid(). 
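			 * Fall back to parsing hw_serial below instead.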
2555219089Spjd */ 2556168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2557219089Spjd#endif /* _KERNEL */ 2558204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 2559219089Spjd hostid != myhostid) { 2560219089Spjd nvlist_free(nvconfig); 2561168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 2562168498Spjd "loaded as it was last accessed by " 2563185029Spjd "another system (host: %s hostid: 0x%lx). " 2564236146Smm "See: http://illumos.org/msg/ZFS-8000-EY", 2565185029Spjd spa_name(spa), hostname, 2566168498Spjd (unsigned long)hostid); 2567249195Smm return (SET_ERROR(EBADF)); 2568168498Spjd } 2569168498Spjd } 2570219089Spjd if (nvlist_lookup_nvlist(spa->spa_config, 2571219089Spjd ZPOOL_REWIND_POLICY, &policy) == 0) 2572219089Spjd VERIFY(nvlist_add_nvlist(nvconfig, 2573219089Spjd ZPOOL_REWIND_POLICY, policy) == 0); 2574168498Spjd 2575219089Spjd spa_config_set(spa, nvconfig); 2576168404Spjd spa_unload(spa); 2577168404Spjd spa_deactivate(spa); 2578209962Smm spa_activate(spa, orig_mode); 2579168404Spjd 2580219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2581168404Spjd } 2582168404Spjd 2583219089Spjd if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2584219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2585219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2586219089Spjd if (error != 0) 2587219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2588168404Spjd 2589168404Spjd /* 2590168404Spjd * Load the bit that tells us to use the new accounting function 2591168404Spjd * (raid-z deflation). If we have an older pool, this will not 2592168404Spjd * be present. 2593168404Spjd */ 2594219089Spjd error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2595219089Spjd if (error != 0 && error != ENOENT) 2596219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2597168404Spjd 2598219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2599219089Spjd &spa->spa_creation_version); 2600219089Spjd if (error != 0 && error != ENOENT) 2601219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2602219089Spjd 2603168404Spjd /* 2604168404Spjd * Load the persistent error log. If we have an older pool, this will 2605168404Spjd * not be present. 2606168404Spjd */ 2607219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2608219089Spjd if (error != 0 && error != ENOENT) 2609219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2610168404Spjd 2611219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2612219089Spjd &spa->spa_errlog_scrub); 2613219089Spjd if (error != 0 && error != ENOENT) 2614219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2615168404Spjd 2616168404Spjd /* 2617168404Spjd * Load the history object. If we have an older pool, this 2618168404Spjd * will not be present. 2619168404Spjd */ 2620219089Spjd error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2621219089Spjd if (error != 0 && error != ENOENT) 2622219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2623168404Spjd 2624168404Spjd /* 2625219089Spjd * If we're assembling the pool from the split-off vdevs of 2626219089Spjd * an existing pool, we don't want to attach the spares & cache 2627219089Spjd * devices. 2628219089Spjd */ 2629219089Spjd 2630219089Spjd /* 2631168404Spjd * Load any hot spares for this pool. 
2632168404Spjd */ 2633219089Spjd error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2634219089Spjd if (error != 0 && error != ENOENT) 2635219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2636219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2637185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2638185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 2639219089Spjd &spa->spa_spares.sav_config) != 0) 2640219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2641168404Spjd 2642185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2643168404Spjd spa_load_spares(spa); 2644185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2645219089Spjd } else if (error == 0) { 2646219089Spjd spa->spa_spares.sav_sync = B_TRUE; 2647168404Spjd } 2648168404Spjd 2649185029Spjd /* 2650185029Spjd * Load any level 2 ARC devices for this pool. 2651185029Spjd */ 2652219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2653185029Spjd &spa->spa_l2cache.sav_object); 2654219089Spjd if (error != 0 && error != ENOENT) 2655219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2656219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2657185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2658185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2659219089Spjd &spa->spa_l2cache.sav_config) != 0) 2660219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2661185029Spjd 2662185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2663185029Spjd spa_load_l2cache(spa); 2664185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2665219089Spjd } else if (error == 0) { 2666219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2667185029Spjd } 2668185029Spjd 2669219089Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2670213197Smm 2671219089Spjd error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2672219089Spjd if (error && error != ENOENT) 2673219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2674185029Spjd 2675219089Spjd if (error == 0) { 2676219089Spjd uint64_t autoreplace; 2677185029Spjd 2678219089Spjd spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2679219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2680219089Spjd spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2681219089Spjd spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2682219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2683219089Spjd spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2684219089Spjd &spa->spa_dedup_ditto); 2685185029Spjd 2686219089Spjd spa->spa_autoreplace = (autoreplace != 0); 2687168404Spjd } 2688168404Spjd 2689168404Spjd /* 2690185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 2691185029Spjd * the ZFS DE that it should not issue any faults for unopenable 2692185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 2693185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 2694185029Spjd * over. 2695185029Spjd */ 2696219089Spjd if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2697185029Spjd spa_check_removed(spa->spa_root_vdev); 2698219089Spjd /* 2699219089Spjd * For the import case, this is done in spa_import(), because 2700219089Spjd * at this point we're using the spare definitions from 2701219089Spjd * the MOS config, not necessarily from the userland config. 
2702219089Spjd */ 2703219089Spjd if (state != SPA_LOAD_IMPORT) { 2704219089Spjd spa_aux_check_removed(&spa->spa_spares); 2705219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 2706219089Spjd } 2707219089Spjd } 2708185029Spjd 2709185029Spjd /* 2710168404Spjd * Load the vdev state for all toplevel vdevs. 2711168404Spjd */ 2712168404Spjd vdev_load(rvd); 2713168404Spjd 2714168404Spjd /* 2715168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 2716168404Spjd */ 2717185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2718168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2719185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2720168404Spjd 2721168404Spjd /* 2722219089Spjd * Load the DDTs (dedup tables). 2723168404Spjd */ 2724219089Spjd error = ddt_load(spa); 2725219089Spjd if (error != 0) 2726219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2727219089Spjd 2728219089Spjd spa_update_dspace(spa); 2729219089Spjd 2730219089Spjd /* 2731219089Spjd * Validate the config, using the MOS config to fill in any 2732219089Spjd * information which might be missing. If we fail to validate 2733219089Spjd * the config then declare the pool unfit for use. If we're 2734219089Spjd * assembling a pool from a split, the log is not transferred 2735219089Spjd * over. 2736219089Spjd */ 2737219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2738219089Spjd nvlist_t *nvconfig; 2739219089Spjd 2740219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2741219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2742219089Spjd 2743219089Spjd if (!spa_config_valid(spa, nvconfig)) { 2744219089Spjd nvlist_free(nvconfig); 2745219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2746219089Spjd ENXIO)); 2747219089Spjd } 2748219089Spjd nvlist_free(nvconfig); 2749219089Spjd 2750219089Spjd /* 2751236884Smm * Now that we've validated the config, check the state of the 2752219089Spjd * root vdev. If it can't be opened, it indicates one or 2753219089Spjd * more toplevel vdevs are faulted. 2754219089Spjd */ 2755219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2756249195Smm return (SET_ERROR(ENXIO)); 2757219089Spjd 2758219089Spjd if (spa_check_logs(spa)) { 2759219089Spjd *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2760219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2761219089Spjd } 2762168404Spjd } 2763168404Spjd 2764236884Smm if (missing_feat_write) { 2765236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 2766236884Smm 2767236884Smm /* 2768236884Smm * At this point, we know that we can open the pool in 2769236884Smm * read-only mode but not read-write mode. We now have enough 2770236884Smm * information and can return to userland. 2771236884Smm */ 2772236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2773236884Smm } 2774236884Smm 2775219089Spjd /* 2776219089Spjd * We've successfully opened the pool, verify that we're ready 2777219089Spjd * to start pushing transactions. 
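	 * spa_load_verify() traverses the pool's metadata (and, optionally,
	 * user data) and applies the caller's rewind policy.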
2778219089Spjd	 */
2779219089Spjd	if (state != SPA_LOAD_TRYIMPORT) {
2780219089Spjd		if (error = spa_load_verify(spa))
2781219089Spjd			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2782219089Spjd			    error));
2783219089Spjd	}
2784219089Spjd
2785219089Spjd	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2786219089Spjd	    spa->spa_load_max_txg == UINT64_MAX)) {
2787168404Spjd		dmu_tx_t *tx;
2788168404Spjd		int need_update = B_FALSE;
2789168404Spjd
2790209962Smm		ASSERT(state != SPA_LOAD_TRYIMPORT);
2791209962Smm
2792168404Spjd		/*
2793168404Spjd		 * Claim log blocks that haven't been committed yet.
2794168404Spjd		 * This must all happen in a single txg.
2795219089Spjd		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2796219089Spjd		 * invoked from zil_claim_log_block()'s i/o done callback.
2797219089Spjd		 * Price of rollback is that we abandon the log.
2798168404Spjd		 */
2799219089Spjd		spa->spa_claiming = B_TRUE;
2800219089Spjd
2801168404Spjd		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2802168404Spjd		    spa_first_txg(spa));
2803185029Spjd		(void) dmu_objset_find(spa_name(spa),
2804168404Spjd		    zil_claim, tx, DS_FIND_CHILDREN);
2805168404Spjd		dmu_tx_commit(tx);
2806168404Spjd
2807219089Spjd		spa->spa_claiming = B_FALSE;
2808219089Spjd
2809219089Spjd		spa_set_log_state(spa, SPA_LOG_GOOD);
2810168404Spjd		spa->spa_sync_on = B_TRUE;
2811168404Spjd		txg_sync_start(spa->spa_dsl_pool);
2812168404Spjd
2813168404Spjd		/*
2814219089Spjd		 * Wait for all claims to sync. We sync up to the highest
2815219089Spjd		 * claimed log block birth time so that claimed log blocks
2816219089Spjd		 * don't appear to be from the future. spa_claim_max_txg
2817219089Spjd		 * will have been set for us by either zil_check_log_chain()
2818219089Spjd		 * (invoked from spa_check_logs()) or zil_claim() above.
2819168404Spjd		 */
2820219089Spjd		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2821168404Spjd
2822168404Spjd		/*
2823168404Spjd		 * If the config cache is stale, or we have uninitialized
2824168404Spjd		 * metaslabs (see spa_vdev_add()), then update the config.
2825209962Smm		 *
2826219089Spjd		 * If this is a verbatim import, trust the current
2827209962Smm		 * in-core spa_config and update the disk labels.
2828168404Spjd		 */
2829168404Spjd		if (config_cache_txg != spa->spa_config_txg ||
2830219089Spjd		    state == SPA_LOAD_IMPORT ||
2831219089Spjd		    state == SPA_LOAD_RECOVER ||
2832219089Spjd		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2833168404Spjd			need_update = B_TRUE;
2834168404Spjd
2835209962Smm		for (int c = 0; c < rvd->vdev_children; c++)
2836168404Spjd			if (rvd->vdev_child[c]->vdev_ms_array == 0)
2837168404Spjd				need_update = B_TRUE;
2838168404Spjd
2839168404Spjd		/*
2840168404Spjd		 * Update the config cache asynchronously in case we're the
2841168404Spjd		 * root pool, in which case the config cache isn't writable yet.
2842168404Spjd		 */
2843168404Spjd		if (need_update)
2844168404Spjd			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2845208683Spjd
2846208683Spjd		/*
2847208683Spjd		 * Check all DTLs to see if anything needs resilvering.
2848208683Spjd		 */
2849219089Spjd		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2850219089Spjd		    vdev_resilver_needed(rvd, NULL, NULL))
2851208683Spjd			spa_async_request(spa, SPA_ASYNC_RESILVER);
2852219089Spjd
2853219089Spjd		/*
2854248571Smm		 * Log the fact that we booted up (so that we can detect if
2855248571Smm		 * we rebooted in the middle of an operation).
2856248571Smm		 */
2857248571Smm		spa_history_log_version(spa, "open");
2858248571Smm
2859248571Smm		/*
2860219089Spjd		 * Delete any inconsistent datasets.
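		 * (e.g. datasets left behind by an interrupted receive).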
2861219089Spjd */ 2862219089Spjd (void) dmu_objset_find(spa_name(spa), 2863219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2864219089Spjd 2865219089Spjd /* 2866219089Spjd * Clean up any stale temporary dataset userrefs. 2867219089Spjd */ 2868219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2869168404Spjd } 2870168404Spjd 2871219089Spjd return (0); 2872219089Spjd} 2873168404Spjd 2874219089Spjdstatic int 2875219089Spjdspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2876219089Spjd{ 2877219089Spjd int mode = spa->spa_mode; 2878219089Spjd 2879219089Spjd spa_unload(spa); 2880219089Spjd spa_deactivate(spa); 2881219089Spjd 2882268720Sdelphij spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 2883219089Spjd 2884219089Spjd spa_activate(spa, mode); 2885219089Spjd spa_async_suspend(spa); 2886219089Spjd 2887219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2888168404Spjd} 2889168404Spjd 2890236884Smm/* 2891236884Smm * If spa_load() fails this function will try loading prior txg's. If 2892236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2893236884Smm * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2894236884Smm * function will not rewind the pool and will return the same error as 2895236884Smm * spa_load(). 2896236884Smm */ 2897219089Spjdstatic int 2898219089Spjdspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2899219089Spjd uint64_t max_request, int rewind_flags) 2900219089Spjd{ 2901236884Smm nvlist_t *loadinfo = NULL; 2902219089Spjd nvlist_t *config = NULL; 2903219089Spjd int load_error, rewind_error; 2904219089Spjd uint64_t safe_rewind_txg; 2905219089Spjd uint64_t min_txg; 2906219089Spjd 2907219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2908219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 2909219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2910219089Spjd } else { 2911219089Spjd spa->spa_load_max_txg = max_request; 2912268720Sdelphij if (max_request != UINT64_MAX) 2913268720Sdelphij spa->spa_extreme_rewind = B_TRUE; 2914219089Spjd } 2915219089Spjd 2916219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2917219089Spjd mosconfig); 2918219089Spjd if (load_error == 0) 2919219089Spjd return (0); 2920219089Spjd 2921219089Spjd if (spa->spa_root_vdev != NULL) 2922219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2923219089Spjd 2924219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2925219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2926219089Spjd 2927219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 2928219089Spjd nvlist_free(config); 2929219089Spjd return (load_error); 2930219089Spjd } 2931219089Spjd 2932236884Smm if (state == SPA_LOAD_RECOVER) { 2933236884Smm /* Price of rolling back is discarding txgs, including log */ 2934219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2935236884Smm } else { 2936236884Smm /* 2937236884Smm * If we aren't rolling back save the load info from our first 2938236884Smm * import attempt so that we can restore it after attempting 2939236884Smm * to rewind. 2940236884Smm */ 2941236884Smm loadinfo = spa->spa_load_info; 2942236884Smm spa->spa_load_info = fnvlist_alloc(); 2943236884Smm } 2944219089Spjd 2945219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2946219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2947219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
2948219089Spjd TXG_INITIAL : safe_rewind_txg; 2949219089Spjd 2950219089Spjd /* 2951219089Spjd * Continue as long as we're finding errors, we're still within 2952219089Spjd * the acceptable rewind range, and we're still finding uberblocks 2953219089Spjd */ 2954219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2955219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2956219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 2957219089Spjd spa->spa_extreme_rewind = B_TRUE; 2958219089Spjd rewind_error = spa_load_retry(spa, state, mosconfig); 2959219089Spjd } 2960219089Spjd 2961219089Spjd spa->spa_extreme_rewind = B_FALSE; 2962219089Spjd spa->spa_load_max_txg = UINT64_MAX; 2963219089Spjd 2964219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2965219089Spjd spa_config_set(spa, config); 2966219089Spjd 2967236884Smm if (state == SPA_LOAD_RECOVER) { 2968236884Smm ASSERT3P(loadinfo, ==, NULL); 2969236884Smm return (rewind_error); 2970236884Smm } else { 2971236884Smm /* Store the rewind info as part of the initial load info */ 2972236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2973236884Smm spa->spa_load_info); 2974236884Smm 2975236884Smm /* Restore the initial load info */ 2976236884Smm fnvlist_free(spa->spa_load_info); 2977236884Smm spa->spa_load_info = loadinfo; 2978236884Smm 2979236884Smm return (load_error); 2980236884Smm } 2981219089Spjd} 2982219089Spjd 2983168404Spjd/* 2984168404Spjd * Pool Open/Import 2985168404Spjd * 2986168404Spjd * The import case is identical to an open except that the configuration is sent 2987168404Spjd * down from userland, instead of grabbed from the configuration cache. For the 2988168404Spjd * case of an open, the pool configuration will exist in the 2989185029Spjd * POOL_STATE_UNINITIALIZED state. 2990168404Spjd * 2991168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 2992168404Spjd * the same time open the pool, without having to keep around the spa_t in some 2993168404Spjd * ambiguous state. 2994168404Spjd */ 2995168404Spjdstatic int 2996219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2997219089Spjd nvlist_t **config) 2998168404Spjd{ 2999168404Spjd spa_t *spa; 3000219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 3001168404Spjd int error; 3002168404Spjd int locked = B_FALSE; 3003219089Spjd int firstopen = B_FALSE; 3004168404Spjd 3005168404Spjd *spapp = NULL; 3006168404Spjd 3007168404Spjd /* 3008168404Spjd * As disgusting as this is, we need to support recursive calls to this 3009168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 3010168404Spjd * up calling spa_open() again. The real fix is to figure out how to 3011168404Spjd * avoid dsl_dir_open() calling this in the first place. 3012168404Spjd */ 3013168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 3014168404Spjd mutex_enter(&spa_namespace_lock); 3015168404Spjd locked = B_TRUE; 3016168404Spjd } 3017168404Spjd 3018168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 3019168404Spjd if (locked) 3020168404Spjd mutex_exit(&spa_namespace_lock); 3021249195Smm return (SET_ERROR(ENOENT)); 3022168404Spjd } 3023219089Spjd 3024168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 3025219089Spjd zpool_rewind_policy_t policy; 3026168404Spjd 3027219089Spjd firstopen = B_TRUE; 3028219089Spjd 3029219089Spjd zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 3030219089Spjd &policy); 3031219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 3032219089Spjd state = SPA_LOAD_RECOVER; 3033219089Spjd 3034209962Smm spa_activate(spa, spa_mode_global); 3035168404Spjd 3036219089Spjd if (state != SPA_LOAD_RECOVER) 3037219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3038168404Spjd 3039219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 3040219089Spjd policy.zrp_request); 3041219089Spjd 3042168404Spjd if (error == EBADF) { 3043168404Spjd /* 3044168404Spjd * If vdev_validate() returns failure (indicated by 3045168404Spjd * EBADF), it indicates that one of the vdevs indicates 3046168404Spjd * that the pool has been exported or destroyed. If 3047168404Spjd * this is the case, the config cache is out of sync and 3048168404Spjd * we should remove the pool from the namespace. 3049168404Spjd */ 3050168404Spjd spa_unload(spa); 3051168404Spjd spa_deactivate(spa); 3052185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE); 3053168404Spjd spa_remove(spa); 3054168404Spjd if (locked) 3055168404Spjd mutex_exit(&spa_namespace_lock); 3056249195Smm return (SET_ERROR(ENOENT)); 3057168404Spjd } 3058168404Spjd 3059168404Spjd if (error) { 3060168404Spjd /* 3061168404Spjd * We can't open the pool, but we still have useful 3062168404Spjd * information: the state of each vdev after the 3063168404Spjd * attempted vdev_open(). Return this to the user. 3064168404Spjd */ 3065219089Spjd if (config != NULL && spa->spa_config) { 3066219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 3067219089Spjd KM_SLEEP) == 0); 3068219089Spjd VERIFY(nvlist_add_nvlist(*config, 3069219089Spjd ZPOOL_CONFIG_LOAD_INFO, 3070219089Spjd spa->spa_load_info) == 0); 3071219089Spjd } 3072168404Spjd spa_unload(spa); 3073168404Spjd spa_deactivate(spa); 3074219089Spjd spa->spa_last_open_failed = error; 3075168404Spjd if (locked) 3076168404Spjd mutex_exit(&spa_namespace_lock); 3077168404Spjd *spapp = NULL; 3078168404Spjd return (error); 3079168404Spjd } 3080168404Spjd } 3081168404Spjd 3082168404Spjd spa_open_ref(spa, tag); 3083185029Spjd 3084219089Spjd if (config != NULL) 3085219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3086219089Spjd 3087219089Spjd /* 3088219089Spjd * If we've recovered the pool, pass back any information we 3089219089Spjd * gathered while doing the load. 
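 *
 * A caller that requested a rewind can pull that information back out of
 * the returned config.  Illustrative sketch only -- the pool name is made
 * up and the "policy" nvlist is assumed to have been built beforehand by
 * the caller (e.g. by user-level zpool code):
 *
 *	spa_t *spa;
 *	nvlist_t *config, *loadinfo;
 *
 *	if (spa_open_rewind("tank", &spa, FTAG, policy, &config) == 0) {
 *		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
 *		    &loadinfo) == 0)
 *			;		inspect rewind details here
 *		nvlist_free(config);
 *		spa_close(spa, FTAG);
 *	}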
3090219089Spjd */ 3091219089Spjd if (state == SPA_LOAD_RECOVER) { 3092219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3093219089Spjd spa->spa_load_info) == 0); 3094219089Spjd } 3095219089Spjd 3096219089Spjd if (locked) { 3097219089Spjd spa->spa_last_open_failed = 0; 3098219089Spjd spa->spa_last_ubsync_txg = 0; 3099219089Spjd spa->spa_load_txg = 0; 3100168404Spjd mutex_exit(&spa_namespace_lock); 3101219089Spjd#ifdef __FreeBSD__ 3102219089Spjd#ifdef _KERNEL 3103219089Spjd if (firstopen) 3104249047Savg zvol_create_minors(spa->spa_name); 3105219089Spjd#endif 3106219089Spjd#endif 3107219089Spjd } 3108168404Spjd 3109168404Spjd *spapp = spa; 3110168404Spjd 3111168404Spjd return (0); 3112168404Spjd} 3113168404Spjd 3114168404Spjdint 3115219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3116219089Spjd nvlist_t **config) 3117219089Spjd{ 3118219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 3119219089Spjd} 3120219089Spjd 3121219089Spjdint 3122168404Spjdspa_open(const char *name, spa_t **spapp, void *tag) 3123168404Spjd{ 3124219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 3125168404Spjd} 3126168404Spjd 3127168404Spjd/* 3128168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 3129168404Spjd * preventing it from being exported or destroyed. 3130168404Spjd */ 3131168404Spjdspa_t * 3132168404Spjdspa_inject_addref(char *name) 3133168404Spjd{ 3134168404Spjd spa_t *spa; 3135168404Spjd 3136168404Spjd mutex_enter(&spa_namespace_lock); 3137168404Spjd if ((spa = spa_lookup(name)) == NULL) { 3138168404Spjd mutex_exit(&spa_namespace_lock); 3139168404Spjd return (NULL); 3140168404Spjd } 3141168404Spjd spa->spa_inject_ref++; 3142168404Spjd mutex_exit(&spa_namespace_lock); 3143168404Spjd 3144168404Spjd return (spa); 3145168404Spjd} 3146168404Spjd 3147168404Spjdvoid 3148168404Spjdspa_inject_delref(spa_t *spa) 3149168404Spjd{ 3150168404Spjd mutex_enter(&spa_namespace_lock); 3151168404Spjd spa->spa_inject_ref--; 3152168404Spjd mutex_exit(&spa_namespace_lock); 3153168404Spjd} 3154168404Spjd 3155185029Spjd/* 3156185029Spjd * Add spares device information to the nvlist. 3157185029Spjd */ 3158168404Spjdstatic void 3159168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 3160168404Spjd{ 3161168404Spjd nvlist_t **spares; 3162168404Spjd uint_t i, nspares; 3163168404Spjd nvlist_t *nvroot; 3164168404Spjd uint64_t guid; 3165168404Spjd vdev_stat_t *vs; 3166168404Spjd uint_t vsc; 3167168404Spjd uint64_t pool; 3168168404Spjd 3169209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3170209962Smm 3171185029Spjd if (spa->spa_spares.sav_count == 0) 3172168404Spjd return; 3173168404Spjd 3174168404Spjd VERIFY(nvlist_lookup_nvlist(config, 3175168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3176185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3177168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3178168404Spjd if (nspares != 0) { 3179168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3180168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3181168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3182168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3183168404Spjd 3184168404Spjd /* 3185168404Spjd * Go through and find any spares which have since been 3186168404Spjd * repurposed as an active spare. If this is the case, update 3187168404Spjd * their status appropriately. 
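 *
 * A consumer reading the generated config can detect such an in-use spare
 * from its vdev stats; a hedged sketch of the check, mirroring the loop
 * below:
 *
 *	vdev_stat_t *vs;
 *	uint_t n;
 *
 *	if (nvlist_lookup_uint64_array(spares[i], ZPOOL_CONFIG_VDEV_STATS,
 *	    (uint64_t **)&vs, &n) == 0 && vs->vs_aux == VDEV_AUX_SPARED)
 *		;		this spare is actively spared into some pool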
3188168404Spjd */ 3189168404Spjd for (i = 0; i < nspares; i++) { 3190168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 3191168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3192185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 3193185029Spjd pool != 0ULL) { 3194168404Spjd VERIFY(nvlist_lookup_uint64_array( 3195219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 3196168404Spjd (uint64_t **)&vs, &vsc) == 0); 3197168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 3198168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 3199168404Spjd } 3200168404Spjd } 3201168404Spjd } 3202168404Spjd} 3203168404Spjd 3204185029Spjd/* 3205185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 3206185029Spjd */ 3207185029Spjdstatic void 3208185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 3209185029Spjd{ 3210185029Spjd nvlist_t **l2cache; 3211185029Spjd uint_t i, j, nl2cache; 3212185029Spjd nvlist_t *nvroot; 3213185029Spjd uint64_t guid; 3214185029Spjd vdev_t *vd; 3215185029Spjd vdev_stat_t *vs; 3216185029Spjd uint_t vsc; 3217185029Spjd 3218209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3219209962Smm 3220185029Spjd if (spa->spa_l2cache.sav_count == 0) 3221185029Spjd return; 3222185029Spjd 3223185029Spjd VERIFY(nvlist_lookup_nvlist(config, 3224185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3225185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3226185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3227185029Spjd if (nl2cache != 0) { 3228185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3229185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3230185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3231185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3232185029Spjd 3233185029Spjd /* 3234185029Spjd * Update level 2 cache device stats. 3235185029Spjd */ 3236185029Spjd 3237185029Spjd for (i = 0; i < nl2cache; i++) { 3238185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 3239185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3240185029Spjd 3241185029Spjd vd = NULL; 3242185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3243185029Spjd if (guid == 3244185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3245185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 3246185029Spjd break; 3247185029Spjd } 3248185029Spjd } 3249185029Spjd ASSERT(vd != NULL); 3250185029Spjd 3251185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3252219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3253219089Spjd == 0); 3254185029Spjd vdev_get_stats(vd, vs); 3255185029Spjd } 3256185029Spjd } 3257185029Spjd} 3258185029Spjd 3259236884Smmstatic void 3260236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 3261236884Smm{ 3262236884Smm nvlist_t *features; 3263236884Smm zap_cursor_t zc; 3264236884Smm zap_attribute_t za; 3265236884Smm 3266236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3267236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3268236884Smm 3269253993Smav /* We may be unable to read features if pool is suspended. 
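 * (In that case an empty ZPOOL_CONFIG_FEATURE_STATS nvlist is still added
 * to the config below.)  Consumers can read the result back out; a rough
 * sketch, for illustration only:
 *
 *	nvlist_t *feat;
 *
 *	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
 *	    &feat) == 0)
 *		;	each entry is a feature name with a uint64 refcount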
*/ 3270253993Smav if (spa_suspended(spa)) 3271253993Smav goto out; 3272253993Smav 3273236884Smm if (spa->spa_feat_for_read_obj != 0) { 3274236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3275236884Smm spa->spa_feat_for_read_obj); 3276236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3277236884Smm zap_cursor_advance(&zc)) { 3278236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3279236884Smm za.za_num_integers == 1); 3280236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3281236884Smm za.za_first_integer)); 3282236884Smm } 3283236884Smm zap_cursor_fini(&zc); 3284236884Smm } 3285236884Smm 3286236884Smm if (spa->spa_feat_for_write_obj != 0) { 3287236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3288236884Smm spa->spa_feat_for_write_obj); 3289236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3290236884Smm zap_cursor_advance(&zc)) { 3291236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3292236884Smm za.za_num_integers == 1); 3293236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3294236884Smm za.za_first_integer)); 3295236884Smm } 3296236884Smm zap_cursor_fini(&zc); 3297236884Smm } 3298236884Smm 3299253993Smavout: 3300236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3301236884Smm features) == 0); 3302236884Smm nvlist_free(features); 3303236884Smm} 3304236884Smm 3305168404Spjdint 3306236884Smmspa_get_stats(const char *name, nvlist_t **config, 3307236884Smm char *altroot, size_t buflen) 3308168404Spjd{ 3309168404Spjd int error; 3310168404Spjd spa_t *spa; 3311168404Spjd 3312168404Spjd *config = NULL; 3313219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 3314168404Spjd 3315209962Smm if (spa != NULL) { 3316209962Smm /* 3317209962Smm * This still leaves a window of inconsistency where the spares 3318209962Smm * or l2cache devices could change and the config would be 3319209962Smm * self-inconsistent. 3320209962Smm */ 3321209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3322168404Spjd 3323209962Smm if (*config != NULL) { 3324219089Spjd uint64_t loadtimes[2]; 3325219089Spjd 3326219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3327219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3328219089Spjd VERIFY(nvlist_add_uint64_array(*config, 3329219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3330219089Spjd 3331185029Spjd VERIFY(nvlist_add_uint64(*config, 3332209962Smm ZPOOL_CONFIG_ERRCOUNT, 3333209962Smm spa_get_errlog_size(spa)) == 0); 3334185029Spjd 3335209962Smm if (spa_suspended(spa)) 3336209962Smm VERIFY(nvlist_add_uint64(*config, 3337209962Smm ZPOOL_CONFIG_SUSPENDED, 3338209962Smm spa->spa_failmode) == 0); 3339209962Smm 3340209962Smm spa_add_spares(spa, *config); 3341209962Smm spa_add_l2cache(spa, *config); 3342236884Smm spa_add_feature_stats(spa, *config); 3343209962Smm } 3344168404Spjd } 3345168404Spjd 3346168404Spjd /* 3347168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 3348168404Spjd * and call spa_lookup() directly. 
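 *
 * For reference, a typical call into spa_get_stats() looks roughly like
 * the sketch below (pool name and buffer sizing are illustrative only):
 *
 *	nvlist_t *config;
 *	char altroot[MAXPATHLEN];
 *
 *	error = spa_get_stats("tank", &config, altroot, sizeof (altroot));
 *	if (config != NULL)
 *		nvlist_free(config);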
3349168404Spjd */ 3350168404Spjd if (altroot) { 3351168404Spjd if (spa == NULL) { 3352168404Spjd mutex_enter(&spa_namespace_lock); 3353168404Spjd spa = spa_lookup(name); 3354168404Spjd if (spa) 3355168404Spjd spa_altroot(spa, altroot, buflen); 3356168404Spjd else 3357168404Spjd altroot[0] = '\0'; 3358168404Spjd spa = NULL; 3359168404Spjd mutex_exit(&spa_namespace_lock); 3360168404Spjd } else { 3361168404Spjd spa_altroot(spa, altroot, buflen); 3362168404Spjd } 3363168404Spjd } 3364168404Spjd 3365209962Smm if (spa != NULL) { 3366209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 3367168404Spjd spa_close(spa, FTAG); 3368209962Smm } 3369168404Spjd 3370168404Spjd return (error); 3371168404Spjd} 3372168404Spjd 3373168404Spjd/* 3374185029Spjd * Validate that the auxiliary device array is well formed. We must have an 3375185029Spjd * array of nvlists, each which describes a valid leaf vdev. If this is an 3376185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3377185029Spjd * specified, as long as they are well-formed. 3378168404Spjd */ 3379168404Spjdstatic int 3380185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3381185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 3382185029Spjd vdev_labeltype_t label) 3383168404Spjd{ 3384185029Spjd nvlist_t **dev; 3385185029Spjd uint_t i, ndev; 3386168404Spjd vdev_t *vd; 3387168404Spjd int error; 3388168404Spjd 3389185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3390185029Spjd 3391168404Spjd /* 3392185029Spjd * It's acceptable to have no devs specified. 3393168404Spjd */ 3394185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3395168404Spjd return (0); 3396168404Spjd 3397185029Spjd if (ndev == 0) 3398249195Smm return (SET_ERROR(EINVAL)); 3399168404Spjd 3400168404Spjd /* 3401185029Spjd * Make sure the pool is formatted with a version that supports this 3402185029Spjd * device type. 3403168404Spjd */ 3404185029Spjd if (spa_version(spa) < version) 3405249195Smm return (SET_ERROR(ENOTSUP)); 3406168404Spjd 3407168404Spjd /* 3408185029Spjd * Set the pending device list so we correctly handle device in-use 3409168404Spjd * checking. 3410168404Spjd */ 3411185029Spjd sav->sav_pending = dev; 3412185029Spjd sav->sav_npending = ndev; 3413168404Spjd 3414185029Spjd for (i = 0; i < ndev; i++) { 3415185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3416168404Spjd mode)) != 0) 3417168404Spjd goto out; 3418168404Spjd 3419168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 3420168404Spjd vdev_free(vd); 3421249195Smm error = SET_ERROR(EINVAL); 3422168404Spjd goto out; 3423168404Spjd } 3424168404Spjd 3425185029Spjd /* 3426185029Spjd * The L2ARC currently only supports disk devices in 3427185029Spjd * kernel context. For user-level testing, we allow it. 
3428185029Spjd */ 3429185029Spjd#ifdef _KERNEL 3430185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3431185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3432249195Smm error = SET_ERROR(ENOTBLK); 3433230514Smm vdev_free(vd); 3434185029Spjd goto out; 3435185029Spjd } 3436185029Spjd#endif 3437168404Spjd vd->vdev_top = vd; 3438168404Spjd 3439168404Spjd if ((error = vdev_open(vd)) == 0 && 3440185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 3441185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3442168404Spjd vd->vdev_guid) == 0); 3443168404Spjd } 3444168404Spjd 3445168404Spjd vdev_free(vd); 3446168404Spjd 3447185029Spjd if (error && 3448185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3449168404Spjd goto out; 3450168404Spjd else 3451168404Spjd error = 0; 3452168404Spjd } 3453168404Spjd 3454168404Spjdout: 3455185029Spjd sav->sav_pending = NULL; 3456185029Spjd sav->sav_npending = 0; 3457168404Spjd return (error); 3458168404Spjd} 3459168404Spjd 3460185029Spjdstatic int 3461185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3462185029Spjd{ 3463185029Spjd int error; 3464185029Spjd 3465185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3466185029Spjd 3467185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3468185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3469185029Spjd VDEV_LABEL_SPARE)) != 0) { 3470185029Spjd return (error); 3471185029Spjd } 3472185029Spjd 3473185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3474185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3475185029Spjd VDEV_LABEL_L2CACHE)); 3476185029Spjd} 3477185029Spjd 3478185029Spjdstatic void 3479185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3480185029Spjd const char *config) 3481185029Spjd{ 3482185029Spjd int i; 3483185029Spjd 3484185029Spjd if (sav->sav_config != NULL) { 3485185029Spjd nvlist_t **olddevs; 3486185029Spjd uint_t oldndevs; 3487185029Spjd nvlist_t **newdevs; 3488185029Spjd 3489185029Spjd /* 3490185029Spjd * Generate new dev list by concatentating with the 3491185029Spjd * current dev list. 3492185029Spjd */ 3493185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3494185029Spjd &olddevs, &oldndevs) == 0); 3495185029Spjd 3496185029Spjd newdevs = kmem_alloc(sizeof (void *) * 3497185029Spjd (ndevs + oldndevs), KM_SLEEP); 3498185029Spjd for (i = 0; i < oldndevs; i++) 3499185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3500185029Spjd KM_SLEEP) == 0); 3501185029Spjd for (i = 0; i < ndevs; i++) 3502185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3503185029Spjd KM_SLEEP) == 0); 3504185029Spjd 3505185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 3506185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 3507185029Spjd 3508185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3509185029Spjd config, newdevs, ndevs + oldndevs) == 0); 3510185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 3511185029Spjd nvlist_free(newdevs[i]); 3512185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3513185029Spjd } else { 3514185029Spjd /* 3515185029Spjd * Generate a new dev list. 
3516185029Spjd */ 3517185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3518185029Spjd KM_SLEEP) == 0); 3519185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3520185029Spjd devs, ndevs) == 0); 3521185029Spjd } 3522185029Spjd} 3523185029Spjd 3524168404Spjd/* 3525185029Spjd * Stop and drop level 2 ARC devices 3526185029Spjd */ 3527185029Spjdvoid 3528185029Spjdspa_l2cache_drop(spa_t *spa) 3529185029Spjd{ 3530185029Spjd vdev_t *vd; 3531185029Spjd int i; 3532185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 3533185029Spjd 3534185029Spjd for (i = 0; i < sav->sav_count; i++) { 3535185029Spjd uint64_t pool; 3536185029Spjd 3537185029Spjd vd = sav->sav_vdevs[i]; 3538185029Spjd ASSERT(vd != NULL); 3539185029Spjd 3540209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3541209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 3542185029Spjd l2arc_remove_vdev(vd); 3543185029Spjd } 3544185029Spjd} 3545185029Spjd 3546185029Spjd/* 3547168404Spjd * Pool Creation 3548168404Spjd */ 3549168404Spjdint 3550185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3551248571Smm nvlist_t *zplprops) 3552168404Spjd{ 3553168404Spjd spa_t *spa; 3554185029Spjd char *altroot = NULL; 3555168404Spjd vdev_t *rvd; 3556168404Spjd dsl_pool_t *dp; 3557168404Spjd dmu_tx_t *tx; 3558219089Spjd int error = 0; 3559168404Spjd uint64_t txg = TXG_INITIAL; 3560185029Spjd nvlist_t **spares, **l2cache; 3561185029Spjd uint_t nspares, nl2cache; 3562219089Spjd uint64_t version, obj; 3563236884Smm boolean_t has_features; 3564168404Spjd 3565168404Spjd /* 3566168404Spjd * If this pool already exists, return failure. 3567168404Spjd */ 3568168404Spjd mutex_enter(&spa_namespace_lock); 3569168404Spjd if (spa_lookup(pool) != NULL) { 3570168404Spjd mutex_exit(&spa_namespace_lock); 3571249195Smm return (SET_ERROR(EEXIST)); 3572168404Spjd } 3573168404Spjd 3574168404Spjd /* 3575168404Spjd * Allocate a new spa_t structure. 
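 *
 * The optional 'props' nvlist is consulted for an alternate root below.
 * A hedged sketch of how a caller might build such a list (the property
 * name comes from zpool_prop_to_name(); the mount path is illustrative):
 *
 *	nvlist_t *props;
 *
 *	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), "/altroot") == 0);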
3576168404Spjd */ 3577185029Spjd (void) nvlist_lookup_string(props, 3578185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3579219089Spjd spa = spa_add(pool, NULL, altroot); 3580209962Smm spa_activate(spa, spa_mode_global); 3581168404Spjd 3582185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 3583185029Spjd spa_deactivate(spa); 3584185029Spjd spa_remove(spa); 3585185029Spjd mutex_exit(&spa_namespace_lock); 3586185029Spjd return (error); 3587185029Spjd } 3588185029Spjd 3589236884Smm has_features = B_FALSE; 3590236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3591236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3592236884Smm if (zpool_prop_feature(nvpair_name(elem))) 3593236884Smm has_features = B_TRUE; 3594236884Smm } 3595236884Smm 3596236884Smm if (has_features || nvlist_lookup_uint64(props, 3597236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3598185029Spjd version = SPA_VERSION; 3599236884Smm } 3600236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3601219089Spjd 3602219089Spjd spa->spa_first_txg = txg; 3603219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 3604185029Spjd spa->spa_uberblock.ub_version = version; 3605168404Spjd spa->spa_ubsync = spa->spa_uberblock; 3606168404Spjd 3607168404Spjd /* 3608209962Smm * Create "The Godfather" zio to hold all async IOs 3609209962Smm */ 3610272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3611272598Sdelphij KM_SLEEP); 3612272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 3613272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3614272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3615272598Sdelphij ZIO_FLAG_GODFATHER); 3616272598Sdelphij } 3617209962Smm 3618209962Smm /* 3619168404Spjd * Create the root vdev. 3620168404Spjd */ 3621185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3622168404Spjd 3623168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3624168404Spjd 3625168404Spjd ASSERT(error != 0 || rvd != NULL); 3626168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3627168404Spjd 3628185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 3629249195Smm error = SET_ERROR(EINVAL); 3630168404Spjd 3631168404Spjd if (error == 0 && 3632168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3633185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 3634168404Spjd VDEV_ALLOC_ADD)) == 0) { 3635219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 3636254591Sgibbs vdev_ashift_optimize(rvd->vdev_child[c]); 3637219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 3638219089Spjd vdev_expand(rvd->vdev_child[c], txg); 3639219089Spjd } 3640168404Spjd } 3641168404Spjd 3642185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3643168404Spjd 3644168404Spjd if (error != 0) { 3645168404Spjd spa_unload(spa); 3646168404Spjd spa_deactivate(spa); 3647168404Spjd spa_remove(spa); 3648168404Spjd mutex_exit(&spa_namespace_lock); 3649168404Spjd return (error); 3650168404Spjd } 3651168404Spjd 3652168404Spjd /* 3653168404Spjd * Get the list of spares, if specified. 
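 *
 * Both the spares and the level 2 cache devices handled below are expected
 * to arrive as flat nvlist arrays (ZPOOL_CONFIG_SPARES, ZPOOL_CONFIG_L2CACHE)
 * inside the same 'nvroot' that carries the ZPOOL_CONFIG_CHILDREN array of
 * top-level vdevs, matching what spa_validate_aux() checked earlier.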
3654168404Spjd */ 3655168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3656168404Spjd &spares, &nspares) == 0) { 3657185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3658168404Spjd KM_SLEEP) == 0); 3659185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3660168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3661185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3662168404Spjd spa_load_spares(spa); 3663185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3664185029Spjd spa->spa_spares.sav_sync = B_TRUE; 3665168404Spjd } 3666168404Spjd 3667185029Spjd /* 3668185029Spjd * Get the list of level 2 cache devices, if specified. 3669185029Spjd */ 3670185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3671185029Spjd &l2cache, &nl2cache) == 0) { 3672185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3673185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 3674185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3675185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3676185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3677185029Spjd spa_load_l2cache(spa); 3678185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3679185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3680185029Spjd } 3681185029Spjd 3682236884Smm spa->spa_is_initializing = B_TRUE; 3683185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3684168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 3685236884Smm spa->spa_is_initializing = B_FALSE; 3686168404Spjd 3687219089Spjd /* 3688219089Spjd * Create DDTs (dedup tables). 3689219089Spjd */ 3690219089Spjd ddt_create(spa); 3691219089Spjd 3692219089Spjd spa_update_dspace(spa); 3693219089Spjd 3694168404Spjd tx = dmu_tx_create_assigned(dp, txg); 3695168404Spjd 3696168404Spjd /* 3697168404Spjd * Create the pool config object. 3698168404Spjd */ 3699168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3700185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3701168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3702168404Spjd 3703168404Spjd if (zap_add(spa->spa_meta_objset, 3704168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3705168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3706168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 3707168404Spjd } 3708168404Spjd 3709236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 3710236884Smm spa_feature_create_zap_objects(spa, tx); 3711236884Smm 3712219089Spjd if (zap_add(spa->spa_meta_objset, 3713219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3714219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 3715219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 3716219089Spjd } 3717219089Spjd 3718185029Spjd /* Newly created pools with the right version are always deflated. */ 3719185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3720185029Spjd spa->spa_deflate = TRUE; 3721185029Spjd if (zap_add(spa->spa_meta_objset, 3722185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3723185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3724185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 3725185029Spjd } 3726168404Spjd } 3727168404Spjd 3728168404Spjd /* 3729219089Spjd * Create the deferred-free bpobj. Turn off compression 3730168404Spjd * because sync-to-convergence takes longer if the blocksize 3731168404Spjd * keeps changing. 
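 * (The 1 << 14 passed to bpobj_alloc() below is a 16 KB block size.)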
3732168404Spjd */ 3733219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3734219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj, 3735168404Spjd ZIO_COMPRESS_OFF, tx); 3736168404Spjd if (zap_add(spa->spa_meta_objset, 3737219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3738219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) { 3739219089Spjd cmn_err(CE_PANIC, "failed to add bpobj"); 3740168404Spjd } 3741219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3742219089Spjd spa->spa_meta_objset, obj)); 3743168404Spjd 3744168404Spjd /* 3745168404Spjd * Create the pool's history object. 3746168404Spjd */ 3747185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 3748185029Spjd spa_history_create_obj(spa, tx); 3749168404Spjd 3750185029Spjd /* 3751185029Spjd * Set pool properties. 3752185029Spjd */ 3753185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3754185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3755185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3756219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3757219089Spjd 3758209962Smm if (props != NULL) { 3759209962Smm spa_configfile_set(spa, props, B_FALSE); 3760248571Smm spa_sync_props(props, tx); 3761209962Smm } 3762185029Spjd 3763168404Spjd dmu_tx_commit(tx); 3764168404Spjd 3765168404Spjd spa->spa_sync_on = B_TRUE; 3766168404Spjd txg_sync_start(spa->spa_dsl_pool); 3767168404Spjd 3768168404Spjd /* 3769168404Spjd * We explicitly wait for the first transaction to complete so that our 3770168404Spjd * bean counters are appropriately updated. 3771168404Spjd */ 3772168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 3773168404Spjd 3774185029Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 3775168404Spjd 3776248571Smm spa_history_log_version(spa, "create"); 3777185029Spjd 3778286575Smav /* 3779286575Smav * Don't count references from objsets that are already closed 3780286575Smav * and are making their way through the eviction process. 3781286575Smav */ 3782286575Smav spa_evicting_os_wait(spa); 3783208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 3784208442Smm 3785168404Spjd mutex_exit(&spa_namespace_lock); 3786168404Spjd 3787168404Spjd return (0); 3788168404Spjd} 3789168404Spjd 3790241286Savg#ifdef _KERNEL 3791277300Ssmh#ifdef illumos 3792185029Spjd/* 3793219089Spjd * Get the root pool information from the root disk, then import the root pool 3794219089Spjd * during the system boot up time. 3795185029Spjd */ 3796219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3797219089Spjd 3798219089Spjdstatic nvlist_t * 3799219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3800185029Spjd{ 3801219089Spjd nvlist_t *config; 3802185029Spjd nvlist_t *nvtop, *nvroot; 3803185029Spjd uint64_t pgid; 3804185029Spjd 3805219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3806219089Spjd return (NULL); 3807219089Spjd 3808168404Spjd /* 3809185029Spjd * Add this top-level vdev to the child array. 3810168404Spjd */ 3811219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3812219089Spjd &nvtop) == 0); 3813219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3814219089Spjd &pgid) == 0); 3815219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3816168404Spjd 3817185029Spjd /* 3818185029Spjd * Put this pool's top-level vdevs into a root vdev. 
3819185029Spjd */ 3820185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3821219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3822219089Spjd VDEV_TYPE_ROOT) == 0); 3823185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3824185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3825185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3826185029Spjd &nvtop, 1) == 0); 3827168404Spjd 3828168404Spjd /* 3829185029Spjd * Replace the existing vdev_tree with the new root vdev in 3830185029Spjd * this pool's configuration (remove the old, add the new). 3831168404Spjd */ 3832185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3833185029Spjd nvlist_free(nvroot); 3834219089Spjd return (config); 3835185029Spjd} 3836168404Spjd 3837185029Spjd/* 3838219089Spjd * Walk the vdev tree and see if we can find a device with "better" 3839219089Spjd * configuration. A configuration is "better" if the label on that 3840219089Spjd * device has a more recent txg. 3841185029Spjd */ 3842219089Spjdstatic void 3843219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3844185029Spjd{ 3845219089Spjd for (int c = 0; c < vd->vdev_children; c++) 3846219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3847185029Spjd 3848219089Spjd if (vd->vdev_ops->vdev_op_leaf) { 3849219089Spjd nvlist_t *label; 3850219089Spjd uint64_t label_txg; 3851185029Spjd 3852219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3853219089Spjd &label) != 0) 3854219089Spjd return; 3855185029Spjd 3856219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3857219089Spjd &label_txg) == 0); 3858168404Spjd 3859219089Spjd /* 3860219089Spjd * Do we have a better boot device? 3861219089Spjd */ 3862219089Spjd if (label_txg > *txg) { 3863219089Spjd *txg = label_txg; 3864219089Spjd *avd = vd; 3865185029Spjd } 3866219089Spjd nvlist_free(label); 3867185029Spjd } 3868185029Spjd} 3869185029Spjd 3870185029Spjd/* 3871185029Spjd * Import a root pool. 3872185029Spjd * 3873185029Spjd * For x86. devpath_list will consist of devid and/or physpath name of 3874185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3875185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 3876185029Spjd * 3877185029Spjd * For Sparc, devpath_list consists the physpath name of the booting device 3878185029Spjd * no matter the rootpool is a single device pool or a mirrored pool. 3879185029Spjd * e.g. 3880185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 3881185029Spjd */ 3882185029Spjdint 3883185029Spjdspa_import_rootpool(char *devpath, char *devid) 3884185029Spjd{ 3885219089Spjd spa_t *spa; 3886219089Spjd vdev_t *rvd, *bvd, *avd = NULL; 3887219089Spjd nvlist_t *config, *nvtop; 3888219089Spjd uint64_t guid, txg; 3889185029Spjd char *pname; 3890185029Spjd int error; 3891185029Spjd 3892185029Spjd /* 3893219089Spjd * Read the label from the boot device and generate a configuration. 
3894185029Spjd */ 3895219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3896219089Spjd#if defined(_OBP) && defined(_KERNEL) 3897219089Spjd if (config == NULL) { 3898219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) { 3899219089Spjd /* iscsi boot */ 3900219089Spjd get_iscsi_bootpath_phy(devpath); 3901219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3902219089Spjd } 3903219089Spjd } 3904219089Spjd#endif 3905219089Spjd if (config == NULL) { 3906236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3907219089Spjd devpath); 3908249195Smm return (SET_ERROR(EIO)); 3909219089Spjd } 3910185029Spjd 3911219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3912219089Spjd &pname) == 0); 3913219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3914185029Spjd 3915209962Smm mutex_enter(&spa_namespace_lock); 3916209962Smm if ((spa = spa_lookup(pname)) != NULL) { 3917209962Smm /* 3918209962Smm * Remove the existing root pool from the namespace so that we 3919209962Smm * can replace it with the correct config we just read in. 3920209962Smm */ 3921209962Smm spa_remove(spa); 3922209962Smm } 3923185029Spjd 3924219089Spjd spa = spa_add(pname, config, NULL); 3925209962Smm spa->spa_is_root = B_TRUE; 3926219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3927209962Smm 3928219089Spjd /* 3929219089Spjd * Build up a vdev tree based on the boot device's label config. 3930219089Spjd */ 3931219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3932219089Spjd &nvtop) == 0); 3933219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3934219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3935219089Spjd VDEV_ALLOC_ROOTPOOL); 3936219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3937219089Spjd if (error) { 3938209962Smm mutex_exit(&spa_namespace_lock); 3939219089Spjd nvlist_free(config); 3940219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3941219089Spjd pname); 3942219089Spjd return (error); 3943209962Smm } 3944209962Smm 3945219089Spjd /* 3946219089Spjd * Get the boot vdev. 3947219089Spjd */ 3948219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3949219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3950219089Spjd (u_longlong_t)guid); 3951249195Smm error = SET_ERROR(ENOENT); 3952219089Spjd goto out; 3953219089Spjd } 3954209962Smm 3955219089Spjd /* 3956219089Spjd * Determine if there is a better boot device. 3957219089Spjd */ 3958219089Spjd avd = bvd; 3959219089Spjd spa_alt_rootvdev(rvd, &avd, &txg); 3960219089Spjd if (avd != bvd) { 3961219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3962219089Spjd "try booting from '%s'", avd->vdev_path); 3963249195Smm error = SET_ERROR(EINVAL); 3964219089Spjd goto out; 3965219089Spjd } 3966209962Smm 3967219089Spjd /* 3968219089Spjd * If the boot device is part of a spare vdev then ensure that 3969219089Spjd * we're booting off the active spare. 3970219089Spjd */ 3971219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3972219089Spjd !bvd->vdev_isspare) { 3973219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3974219089Spjd "try booting from '%s'", 3975219089Spjd bvd->vdev_parent-> 3976219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3977249195Smm error = SET_ERROR(EINVAL); 3978219089Spjd goto out; 3979219089Spjd } 3980209962Smm 3981219089Spjd error = 0; 3982219089Spjdout: 3983219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3984219089Spjd vdev_free(rvd); 3985219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3986209962Smm mutex_exit(&spa_namespace_lock); 3987209962Smm 3988219089Spjd nvlist_free(config); 3989219089Spjd return (error); 3990185029Spjd} 3991185029Spjd 3992277300Ssmh#else /* !illumos */ 3993241286Savg 3994243502Savgextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 3995243502Savg uint64_t *count); 3996241286Savg 3997241286Savgstatic nvlist_t * 3998241286Savgspa_generate_rootconf(const char *name) 3999241286Savg{ 4000243502Savg nvlist_t **configs, **tops; 4001241286Savg nvlist_t *config; 4002243502Savg nvlist_t *best_cfg, *nvtop, *nvroot; 4003243502Savg uint64_t *holes; 4004243502Savg uint64_t best_txg; 4005243213Savg uint64_t nchildren; 4006241286Savg uint64_t pgid; 4007243502Savg uint64_t count; 4008243502Savg uint64_t i; 4009243502Savg uint_t nholes; 4010241286Savg 4011243502Savg if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 4012241286Savg return (NULL); 4013241286Savg 4014243502Savg ASSERT3U(count, !=, 0); 4015243502Savg best_txg = 0; 4016243502Savg for (i = 0; i < count; i++) { 4017243502Savg uint64_t txg; 4018243502Savg 4019243502Savg VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 4020243502Savg &txg) == 0); 4021243502Savg if (txg > best_txg) { 4022243502Savg best_txg = txg; 4023243502Savg best_cfg = configs[i]; 4024243502Savg } 4025243502Savg } 4026243502Savg 4027245945Savg nchildren = 1; 4028245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 4029243502Savg holes = NULL; 4030243502Savg nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 4031243502Savg &holes, &nholes); 4032243502Savg 4033244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 4034243502Savg for (i = 0; i < nchildren; i++) { 4035243502Savg if (i >= count) 4036243502Savg break; 4037243502Savg if (configs[i] == NULL) 4038243502Savg continue; 4039243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 4040243502Savg &nvtop) == 0); 4041243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP); 4042243213Savg } 4043243502Savg for (i = 0; holes != NULL && i < nholes; i++) { 4044243502Savg if (i >= nchildren) 4045243502Savg continue; 4046243502Savg if (tops[holes[i]] != NULL) 4047243502Savg continue; 4048243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 4049243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 4050243502Savg VDEV_TYPE_HOLE) == 0); 4051243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 4052243502Savg holes[i]) == 0); 4053243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 4054243502Savg 0) == 0); 4055243502Savg } 4056243502Savg for (i = 0; i < nchildren; i++) { 4057243502Savg if (tops[i] != NULL) 4058243502Savg continue; 4059243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 4060243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 4061243502Savg VDEV_TYPE_MISSING) == 0); 4062243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 4063243502Savg i) == 0); 4064243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 4065243502Savg 0) == 0); 
4066243502Savg } 4067243213Savg 4068243213Savg /* 4069243502Savg * Create pool config based on the best vdev config. 4070241286Savg */ 4071243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 4072241286Savg 4073241286Savg /* 4074241286Savg * Put this pool's top-level vdevs into a root vdev. 4075241286Savg */ 4076243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4077243502Savg &pgid) == 0); 4078241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4079241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 4080241286Savg VDEV_TYPE_ROOT) == 0); 4081241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 4082241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 4083241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 4084243502Savg tops, nchildren) == 0); 4085241286Savg 4086241286Savg /* 4087241286Savg * Replace the existing vdev_tree with the new root vdev in 4088241286Savg * this pool's configuration (remove the old, add the new). 4089241286Savg */ 4090241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 4091243502Savg 4092243502Savg /* 4093243502Savg * Drop vdev config elements that should not be present at pool level. 4094243502Savg */ 4095243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 4096243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 4097243502Savg 4098243502Savg for (i = 0; i < count; i++) 4099243502Savg nvlist_free(configs[i]); 4100243502Savg kmem_free(configs, count * sizeof(void *)); 4101243502Savg for (i = 0; i < nchildren; i++) 4102243502Savg nvlist_free(tops[i]); 4103243502Savg kmem_free(tops, nchildren * sizeof(void *)); 4104241286Savg nvlist_free(nvroot); 4105241286Savg return (config); 4106241286Savg} 4107241286Savg 4108241286Savgint 4109241286Savgspa_import_rootpool(const char *name) 4110241286Savg{ 4111241286Savg spa_t *spa; 4112241286Savg vdev_t *rvd, *bvd, *avd = NULL; 4113241286Savg nvlist_t *config, *nvtop; 4114241286Savg uint64_t txg; 4115241286Savg char *pname; 4116241286Savg int error; 4117241286Savg 4118241286Savg /* 4119241286Savg * Read the label from the boot device and generate a configuration. 4120241286Savg */ 4121241286Savg config = spa_generate_rootconf(name); 4122243213Savg 4123243213Savg mutex_enter(&spa_namespace_lock); 4124243213Savg if (config != NULL) { 4125243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4126243213Savg &pname) == 0 && strcmp(name, pname) == 0); 4127243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 4128243213Savg == 0); 4129243213Savg 4130243213Savg if ((spa = spa_lookup(pname)) != NULL) { 4131243213Savg /* 4132243213Savg * Remove the existing root pool from the namespace so 4133243213Savg * that we can replace it with the correct config 4134243213Savg * we just read in. 4135243213Savg */ 4136243213Savg spa_remove(spa); 4137243213Savg } 4138243213Savg spa = spa_add(pname, config, NULL); 4139243501Savg 4140243501Savg /* 4141243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 4142243501Savg * via spa_version(). 
4143243501Savg */ 4144243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4145243501Savg &spa->spa_ubsync.ub_version) != 0) 4146243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4147243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 4148241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 4149241286Savg name); 4150241286Savg return (EIO); 4151243213Savg } else { 4152243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 4153241286Savg } 4154241286Savg spa->spa_is_root = B_TRUE; 4155241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4156241286Savg 4157241286Savg /* 4158241286Savg * Build up a vdev tree based on the boot device's label config. 4159241286Savg */ 4160241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4161241286Savg &nvtop) == 0); 4162241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4163241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4164241286Savg VDEV_ALLOC_ROOTPOOL); 4165241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4166241286Savg if (error) { 4167241286Savg mutex_exit(&spa_namespace_lock); 4168241286Savg nvlist_free(config); 4169241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4170241286Savg pname); 4171241286Savg return (error); 4172241286Savg } 4173241286Savg 4174241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4175241286Savg vdev_free(rvd); 4176241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4177241286Savg mutex_exit(&spa_namespace_lock); 4178241286Savg 4179243213Savg nvlist_free(config); 4180243213Savg return (0); 4181241286Savg} 4182241286Savg 4183277300Ssmh#endif /* illumos */ 4184277300Ssmh#endif /* _KERNEL */ 4185219089Spjd 4186209962Smm/* 4187209962Smm * Import a non-root pool into the system. 4188209962Smm */ 4189185029Spjdint 4190219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4191185029Spjd{ 4192209962Smm spa_t *spa; 4193209962Smm char *altroot = NULL; 4194219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 4195219089Spjd zpool_rewind_policy_t policy; 4196219089Spjd uint64_t mode = spa_mode_global; 4197219089Spjd uint64_t readonly = B_FALSE; 4198209962Smm int error; 4199209962Smm nvlist_t *nvroot; 4200209962Smm nvlist_t **spares, **l2cache; 4201209962Smm uint_t nspares, nl2cache; 4202209962Smm 4203209962Smm /* 4204209962Smm * If a pool with this name exists, return failure. 4205209962Smm */ 4206209962Smm mutex_enter(&spa_namespace_lock); 4207219089Spjd if (spa_lookup(pool) != NULL) { 4208209962Smm mutex_exit(&spa_namespace_lock); 4209249195Smm return (SET_ERROR(EEXIST)); 4210209962Smm } 4211209962Smm 4212209962Smm /* 4213209962Smm * Create and initialize the spa structure. 4214209962Smm */ 4215209962Smm (void) nvlist_lookup_string(props, 4216209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4217219089Spjd (void) nvlist_lookup_uint64(props, 4218219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4219219089Spjd if (readonly) 4220219089Spjd mode = FREAD; 4221219089Spjd spa = spa_add(pool, config, altroot); 4222219089Spjd spa->spa_import_flags = flags; 4223209962Smm 4224209962Smm /* 4225219089Spjd * Verbatim import - Take a pool and insert it into the namespace 4226219089Spjd * as if it had been loaded at boot. 
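 *
 * For illustration only, a verbatim import amounts to a call of the form
 * below, where 'config' is a complete pool config the caller already
 * trusts (for example, one retained from a previous export):
 *
 *	error = spa_import(pool, config, NULL, ZFS_IMPORT_VERBATIM);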
4227219089Spjd */ 4228219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4229219089Spjd if (props != NULL) 4230219089Spjd spa_configfile_set(spa, props, B_FALSE); 4231219089Spjd 4232219089Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 4233219089Spjd 4234219089Spjd mutex_exit(&spa_namespace_lock); 4235219089Spjd return (0); 4236219089Spjd } 4237219089Spjd 4238219089Spjd spa_activate(spa, mode); 4239219089Spjd 4240219089Spjd /* 4241209962Smm * Don't start async tasks until we know everything is healthy. 4242209962Smm */ 4243209962Smm spa_async_suspend(spa); 4244209962Smm 4245219089Spjd zpool_get_rewind_policy(config, &policy); 4246219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 4247219089Spjd state = SPA_LOAD_RECOVER; 4248219089Spjd 4249209962Smm /* 4250209962Smm * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4251209962Smm * because the user-supplied config is actually the one to trust when 4252209962Smm * doing an import. 4253209962Smm */ 4254219089Spjd if (state != SPA_LOAD_RECOVER) 4255219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4256209962Smm 4257219089Spjd error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4258219089Spjd policy.zrp_request); 4259219089Spjd 4260219089Spjd /* 4261219089Spjd * Propagate anything learned while loading the pool and pass it 4262219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 4263219089Spjd */ 4264219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4265219089Spjd spa->spa_load_info) == 0); 4266219089Spjd 4267209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4268209962Smm /* 4269209962Smm * Toss any existing sparelist, as it doesn't have any validity 4270209962Smm * anymore, and conflicts with spa_has_spare(). 4271209962Smm */ 4272209962Smm if (spa->spa_spares.sav_config) { 4273209962Smm nvlist_free(spa->spa_spares.sav_config); 4274209962Smm spa->spa_spares.sav_config = NULL; 4275209962Smm spa_load_spares(spa); 4276209962Smm } 4277209962Smm if (spa->spa_l2cache.sav_config) { 4278209962Smm nvlist_free(spa->spa_l2cache.sav_config); 4279209962Smm spa->spa_l2cache.sav_config = NULL; 4280209962Smm spa_load_l2cache(spa); 4281209962Smm } 4282209962Smm 4283209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4284209962Smm &nvroot) == 0); 4285209962Smm if (error == 0) 4286209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4287209962Smm VDEV_ALLOC_SPARE); 4288209962Smm if (error == 0) 4289209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4290209962Smm VDEV_ALLOC_L2CACHE); 4291209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4292209962Smm 4293209962Smm if (props != NULL) 4294209962Smm spa_configfile_set(spa, props, B_FALSE); 4295209962Smm 4296209962Smm if (error != 0 || (props && spa_writeable(spa) && 4297209962Smm (error = spa_prop_set(spa, props)))) { 4298209962Smm spa_unload(spa); 4299209962Smm spa_deactivate(spa); 4300209962Smm spa_remove(spa); 4301209962Smm mutex_exit(&spa_namespace_lock); 4302209962Smm return (error); 4303209962Smm } 4304209962Smm 4305209962Smm spa_async_resume(spa); 4306209962Smm 4307209962Smm /* 4308209962Smm * Override any spares and level 2 cache devices as specified by 4309209962Smm * the user, as these may have correct device names/devids, etc. 
4310209962Smm */ 4311209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4312209962Smm &spares, &nspares) == 0) { 4313209962Smm if (spa->spa_spares.sav_config) 4314209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4315209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4316209962Smm else 4317209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4318209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4319209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4320209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4321209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4322209962Smm spa_load_spares(spa); 4323209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4324209962Smm spa->spa_spares.sav_sync = B_TRUE; 4325209962Smm } 4326209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4327209962Smm &l2cache, &nl2cache) == 0) { 4328209962Smm if (spa->spa_l2cache.sav_config) 4329209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4330209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4331209962Smm else 4332209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4333209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4334209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4335209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4336209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4337209962Smm spa_load_l2cache(spa); 4338209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4339209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 4340209962Smm } 4341209962Smm 4342219089Spjd /* 4343219089Spjd * Check for any removed devices. 4344219089Spjd */ 4345219089Spjd if (spa->spa_autoreplace) { 4346219089Spjd spa_aux_check_removed(&spa->spa_spares); 4347219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 4348219089Spjd } 4349219089Spjd 4350209962Smm if (spa_writeable(spa)) { 4351209962Smm /* 4352209962Smm * Update the config cache to include the newly-imported pool. 4353209962Smm */ 4354209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4355209962Smm } 4356209962Smm 4357219089Spjd /* 4358219089Spjd * It's possible that the pool was expanded while it was exported. 4359219089Spjd * We kick off an async task to handle this for us. 4360219089Spjd */ 4361219089Spjd spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4362219089Spjd 4363209962Smm mutex_exit(&spa_namespace_lock); 4364248571Smm spa_history_log_version(spa, "import"); 4365209962Smm 4366219089Spjd#ifdef __FreeBSD__ 4367219089Spjd#ifdef _KERNEL 4368219089Spjd zvol_create_minors(pool); 4369219089Spjd#endif 4370219089Spjd#endif 4371209962Smm return (0); 4372185029Spjd} 4373185029Spjd 4374168404Spjdnvlist_t * 4375168404Spjdspa_tryimport(nvlist_t *tryconfig) 4376168404Spjd{ 4377168404Spjd nvlist_t *config = NULL; 4378168404Spjd char *poolname; 4379168404Spjd spa_t *spa; 4380168404Spjd uint64_t state; 4381208443Smm int error; 4382168404Spjd 4383168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4384168404Spjd return (NULL); 4385168404Spjd 4386168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4387168404Spjd return (NULL); 4388168404Spjd 4389168404Spjd /* 4390168404Spjd * Create and initialize the spa structure. 4391168404Spjd */ 4392168404Spjd mutex_enter(&spa_namespace_lock); 4393219089Spjd spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4394209962Smm spa_activate(spa, FREAD); 4395168404Spjd 4396168404Spjd /* 4397168404Spjd * Pass off the heavy lifting to spa_load(). 
4398168404Spjd * Pass TRUE for mosconfig because the user-supplied config 4399168404Spjd * is actually the one to trust when doing an import. 4400168404Spjd */ 4401219089Spjd error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4402168404Spjd 4403168404Spjd /* 4404168404Spjd * If 'tryconfig' was at least parsable, return the current config. 4405168404Spjd */ 4406168404Spjd if (spa->spa_root_vdev != NULL) { 4407168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4408168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4409168404Spjd poolname) == 0); 4410168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4411168404Spjd state) == 0); 4412168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4413168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 4414236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4415236884Smm spa->spa_load_info) == 0); 4416168404Spjd 4417168404Spjd /* 4418185029Spjd * If the bootfs property exists on this pool then we 4419185029Spjd * copy it out so that external consumers can tell which 4420185029Spjd * pools are bootable. 4421168404Spjd */ 4422208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 4423185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4424185029Spjd 4425185029Spjd /* 4426185029Spjd * We have to play games with the name since the 4427185029Spjd * pool was opened as TRYIMPORT_NAME. 4428185029Spjd */ 4429185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 4430185029Spjd spa->spa_bootfs, tmpname) == 0) { 4431185029Spjd char *cp; 4432185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4433185029Spjd 4434185029Spjd cp = strchr(tmpname, '/'); 4435185029Spjd if (cp == NULL) { 4436185029Spjd (void) strlcpy(dsname, tmpname, 4437185029Spjd MAXPATHLEN); 4438185029Spjd } else { 4439185029Spjd (void) snprintf(dsname, MAXPATHLEN, 4440185029Spjd "%s/%s", poolname, ++cp); 4441185029Spjd } 4442185029Spjd VERIFY(nvlist_add_string(config, 4443185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4444185029Spjd kmem_free(dsname, MAXPATHLEN); 4445185029Spjd } 4446185029Spjd kmem_free(tmpname, MAXPATHLEN); 4447185029Spjd } 4448185029Spjd 4449185029Spjd /* 4450185029Spjd * Add the list of hot spares and level 2 cache devices. 4451185029Spjd */ 4452209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4453168404Spjd spa_add_spares(spa, config); 4454185029Spjd spa_add_l2cache(spa, config); 4455209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 4456168404Spjd } 4457168404Spjd 4458168404Spjd spa_unload(spa); 4459168404Spjd spa_deactivate(spa); 4460168404Spjd spa_remove(spa); 4461168404Spjd mutex_exit(&spa_namespace_lock); 4462168404Spjd 4463168404Spjd return (config); 4464168404Spjd} 4465168404Spjd 4466168404Spjd/* 4467168404Spjd * Pool export/destroy 4468168404Spjd * 4469168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 4470168404Spjd * is no more pending I/O and any references to the pool are gone. Then, we 4471168404Spjd * update the pool state and sync all the labels to disk, removing the 4472207670Smm * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4473207670Smm * we don't sync the labels or remove the configuration cache. 
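 *
 * Both operations funnel through spa_export_common() via the thin wrappers
 * below; typical calls look roughly like this sketch (the pool name is
 * illustrative only):
 *
 *	nvlist_t *oldconfig = NULL;
 *
 *	error = spa_export(poolname, &oldconfig, B_FALSE, B_FALSE);
 *	error = spa_destroy(poolname);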
4474168404Spjd */ 4475168404Spjdstatic int 4476185029Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4477207670Smm boolean_t force, boolean_t hardforce) 4478168404Spjd{ 4479168404Spjd spa_t *spa; 4480168404Spjd 4481168404Spjd if (oldconfig) 4482168404Spjd *oldconfig = NULL; 4483168404Spjd 4484209962Smm if (!(spa_mode_global & FWRITE)) 4485249195Smm return (SET_ERROR(EROFS)); 4486168404Spjd 4487168404Spjd mutex_enter(&spa_namespace_lock); 4488168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 4489168404Spjd mutex_exit(&spa_namespace_lock); 4490249195Smm return (SET_ERROR(ENOENT)); 4491168404Spjd } 4492168404Spjd 4493168404Spjd /* 4494168404Spjd * Put a hold on the pool, drop the namespace lock, stop async tasks, 4495168404Spjd * reacquire the namespace lock, and see if we can export. 4496168404Spjd */ 4497168404Spjd spa_open_ref(spa, FTAG); 4498168404Spjd mutex_exit(&spa_namespace_lock); 4499168404Spjd spa_async_suspend(spa); 4500168404Spjd mutex_enter(&spa_namespace_lock); 4501168404Spjd spa_close(spa, FTAG); 4502168404Spjd 4503168404Spjd /* 4504168404Spjd * The pool will be in core if it's openable, 4505168404Spjd * in which case we can modify its state. 4506168404Spjd */ 4507168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4508168404Spjd /* 4509168404Spjd * Objsets may be open only because they're dirty, so we 4510168404Spjd * have to force it to sync before checking spa_refcnt. 4511168404Spjd */ 4512168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 4513286575Smav spa_evicting_os_wait(spa); 4514168404Spjd 4515168404Spjd /* 4516168404Spjd * A pool cannot be exported or destroyed if there are active 4517168404Spjd * references. If we are resetting a pool, allow references by 4518168404Spjd * fault injection handlers. 4519168404Spjd */ 4520168404Spjd if (!spa_refcount_zero(spa) || 4521168404Spjd (spa->spa_inject_ref != 0 && 4522168404Spjd new_state != POOL_STATE_UNINITIALIZED)) { 4523168404Spjd spa_async_resume(spa); 4524168404Spjd mutex_exit(&spa_namespace_lock); 4525249195Smm return (SET_ERROR(EBUSY)); 4526168404Spjd } 4527168404Spjd 4528185029Spjd /* 4529185029Spjd * A pool cannot be exported if it has an active shared spare. 4530185029Spjd * This is to prevent other pools stealing the active spare 4531185029Spjd * from an exported pool. At user's own will, such pool can 4532185029Spjd * be forcedly exported. 4533185029Spjd */ 4534185029Spjd if (!force && new_state == POOL_STATE_EXPORTED && 4535185029Spjd spa_has_active_shared_spare(spa)) { 4536185029Spjd spa_async_resume(spa); 4537185029Spjd mutex_exit(&spa_namespace_lock); 4538249195Smm return (SET_ERROR(EXDEV)); 4539185029Spjd } 4540168404Spjd 4541168404Spjd /* 4542168404Spjd * We want this to be reflected on every label, 4543168404Spjd * so mark them all dirty. spa_unload() will do the 4544168404Spjd * final sync that pushes these changes out. 
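 *
 * The code below also sets spa_final_txg a few txgs past the last
 * synced txg (TXG_DEFER_SIZE + 1 beyond spa_last_synced_txg()),
 * presumably so that the remaining syncs, including deferred frees,
 * can still be written out before the pool stops syncing.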
4545168404Spjd */ 4546207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4547185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4548168404Spjd spa->spa_state = new_state; 4549219089Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 4550219089Spjd TXG_DEFER_SIZE + 1; 4551168404Spjd vdev_config_dirty(spa->spa_root_vdev); 4552185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4553168404Spjd } 4554168404Spjd } 4555168404Spjd 4556185029Spjd spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4557185029Spjd 4558168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4559168404Spjd spa_unload(spa); 4560168404Spjd spa_deactivate(spa); 4561168404Spjd } 4562168404Spjd 4563168404Spjd if (oldconfig && spa->spa_config) 4564168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4565168404Spjd 4566168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 4567207670Smm if (!hardforce) 4568207670Smm spa_config_sync(spa, B_TRUE, B_TRUE); 4569168404Spjd spa_remove(spa); 4570168404Spjd } 4571168404Spjd mutex_exit(&spa_namespace_lock); 4572168404Spjd 4573168404Spjd return (0); 4574168404Spjd} 4575168404Spjd 4576168404Spjd/* 4577168404Spjd * Destroy a storage pool. 4578168404Spjd */ 4579168404Spjdint 4580168404Spjdspa_destroy(char *pool) 4581168404Spjd{ 4582207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4583207670Smm B_FALSE, B_FALSE)); 4584168404Spjd} 4585168404Spjd 4586168404Spjd/* 4587168404Spjd * Export a storage pool. 4588168404Spjd */ 4589168404Spjdint 4590207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4591207670Smm boolean_t hardforce) 4592168404Spjd{ 4593207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4594207670Smm force, hardforce)); 4595168404Spjd} 4596168404Spjd 4597168404Spjd/* 4598168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 4599168404Spjd * from the namespace in any way. 4600168404Spjd */ 4601168404Spjdint 4602168404Spjdspa_reset(char *pool) 4603168404Spjd{ 4604185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4605207670Smm B_FALSE, B_FALSE)); 4606168404Spjd} 4607168404Spjd 4608168404Spjd/* 4609168404Spjd * ========================================================================== 4610168404Spjd * Device manipulation 4611168404Spjd * ========================================================================== 4612168404Spjd */ 4613168404Spjd 4614168404Spjd/* 4615185029Spjd * Add a device to a storage pool. 
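 *
 * For illustration, 'nvroot' is a vdev tree fragment of roughly this
 * shape (keys shown by their nvlist names):
 *
 *	nvroot {
 *		"children" : [ new top-level vdevs ]	(optional)
 *		"spares"   : [ spare leaf vdevs ]	(optional)
 *		"l2cache"  : [ cache leaf vdevs ]	(optional)
 *	}
 *
 * At least one of the three must be non-empty; otherwise the code below
 * returns EINVAL.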
4616168404Spjd */ 4617168404Spjdint 4618168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4619168404Spjd{ 4620219089Spjd uint64_t txg, id; 4621209962Smm int error; 4622168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4623168404Spjd vdev_t *vd, *tvd; 4624185029Spjd nvlist_t **spares, **l2cache; 4625185029Spjd uint_t nspares, nl2cache; 4626168404Spjd 4627219089Spjd ASSERT(spa_writeable(spa)); 4628219089Spjd 4629168404Spjd txg = spa_vdev_enter(spa); 4630168404Spjd 4631168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4632168404Spjd VDEV_ALLOC_ADD)) != 0) 4633168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 4634168404Spjd 4635185029Spjd spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4636168404Spjd 4637185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4638185029Spjd &nspares) != 0) 4639168404Spjd nspares = 0; 4640168404Spjd 4641185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4642185029Spjd &nl2cache) != 0) 4643185029Spjd nl2cache = 0; 4644185029Spjd 4645185029Spjd if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4646168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4647168404Spjd 4648185029Spjd if (vd->vdev_children != 0 && 4649185029Spjd (error = vdev_create(vd, txg, B_FALSE)) != 0) 4650185029Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4651168404Spjd 4652168404Spjd /* 4653185029Spjd * We must validate the spares and l2cache devices after checking the 4654185029Spjd * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4655168404Spjd */ 4656185029Spjd if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4657168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4658168404Spjd 4659168404Spjd /* 4660168404Spjd * Transfer each new top-level vdev from vd to rvd. 4661168404Spjd */ 4662209962Smm for (int c = 0; c < vd->vdev_children; c++) { 4663219089Spjd 4664219089Spjd /* 4665219089Spjd * Set the vdev id to the first hole, if one exists. 4666219089Spjd */ 4667219089Spjd for (id = 0; id < rvd->vdev_children; id++) { 4668219089Spjd if (rvd->vdev_child[id]->vdev_ishole) { 4669219089Spjd vdev_free(rvd->vdev_child[id]); 4670219089Spjd break; 4671219089Spjd } 4672219089Spjd } 4673168404Spjd tvd = vd->vdev_child[c]; 4674168404Spjd vdev_remove_child(vd, tvd); 4675219089Spjd tvd->vdev_id = id; 4676168404Spjd vdev_add_child(rvd, tvd); 4677168404Spjd vdev_config_dirty(tvd); 4678168404Spjd } 4679168404Spjd 4680168404Spjd if (nspares != 0) { 4681185029Spjd spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4682185029Spjd ZPOOL_CONFIG_SPARES); 4683168404Spjd spa_load_spares(spa); 4684185029Spjd spa->spa_spares.sav_sync = B_TRUE; 4685168404Spjd } 4686168404Spjd 4687185029Spjd if (nl2cache != 0) { 4688185029Spjd spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4689185029Spjd ZPOOL_CONFIG_L2CACHE); 4690185029Spjd spa_load_l2cache(spa); 4691185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 4692185029Spjd } 4693185029Spjd 4694168404Spjd /* 4695168404Spjd * We have to be careful when adding new vdevs to an existing pool. 4696168404Spjd * If other threads start allocating from these vdevs before we 4697168404Spjd * sync the config cache, and we lose power, then upon reboot we may 4698168404Spjd * fail to open the pool because there are DVAs that the config cache 4699168404Spjd * can't translate. 
Therefore, we first add the vdevs without 4700168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4701168404Spjd * and then let spa_config_update() initialize the new metaslabs. 4702168404Spjd * 4703168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 4704168404Spjd * if we lose power at any point in this sequence, the remaining 4705168404Spjd * steps will be completed the next time we load the pool. 4706168404Spjd */ 4707168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 4708168404Spjd 4709168404Spjd mutex_enter(&spa_namespace_lock); 4710168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4711168404Spjd mutex_exit(&spa_namespace_lock); 4712168404Spjd 4713168404Spjd return (0); 4714168404Spjd} 4715168404Spjd 4716168404Spjd/* 4717168404Spjd * Attach a device to a mirror. The arguments are the path to any device 4718168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 4719168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 4720168404Spjd * 4721168404Spjd * If 'replacing' is specified, the new device is intended to replace the 4722168404Spjd * existing device; in this case the two devices are made into their own 4723185029Spjd * mirror using the 'replacing' vdev, which is functionally identical to 4724168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 4725168404Spjd * extra rules: you can't attach to it after it's been created, and upon 4726168404Spjd * completion of resilvering, the first disk (the one being replaced) 4727168404Spjd * is automatically detached. 4728168404Spjd */ 4729168404Spjdint 4730168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4731168404Spjd{ 4732219089Spjd uint64_t txg, dtl_max_txg; 4733168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4734168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4735168404Spjd vdev_ops_t *pvops; 4736185029Spjd char *oldvdpath, *newvdpath; 4737185029Spjd int newvd_isspare; 4738185029Spjd int error; 4739168404Spjd 4740219089Spjd ASSERT(spa_writeable(spa)); 4741219089Spjd 4742168404Spjd txg = spa_vdev_enter(spa); 4743168404Spjd 4744185029Spjd oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4745168404Spjd 4746168404Spjd if (oldvd == NULL) 4747168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4748168404Spjd 4749168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 4750168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4751168404Spjd 4752168404Spjd pvd = oldvd->vdev_parent; 4753168404Spjd 4754168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4755230514Smm VDEV_ALLOC_ATTACH)) != 0) 4756185029Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4757185029Spjd 4758185029Spjd if (newrootvd->vdev_children != 1) 4759168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4760168404Spjd 4761168404Spjd newvd = newrootvd->vdev_child[0]; 4762168404Spjd 4763168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 4764168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4765168404Spjd 4766168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4767168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 4768168404Spjd 4769185029Spjd /* 4770185029Spjd * Spares can't replace logs 4771185029Spjd */ 4772185029Spjd if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4773185029Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4774185029Spjd 4775168404Spjd if (!replacing) { 4776168404Spjd /* 
4777168404Spjd * For attach, the only allowable parent is a mirror or the root 4778168404Spjd * vdev. 4779168404Spjd */ 4780168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 4781168404Spjd pvd->vdev_ops != &vdev_root_ops) 4782168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4783168404Spjd 4784168404Spjd pvops = &vdev_mirror_ops; 4785168404Spjd } else { 4786168404Spjd /* 4787168404Spjd * Active hot spares can only be replaced by inactive hot 4788168404Spjd * spares. 4789168404Spjd */ 4790168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4791219089Spjd oldvd->vdev_isspare && 4792168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 4793168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4794168404Spjd 4795168404Spjd /* 4796168404Spjd * If the source is a hot spare, and the parent isn't already a 4797168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 4798168404Spjd * want to create a replacing vdev. The user is not allowed to 4799168404Spjd * attach to a spared vdev child unless the 'isspare' state is 4800168404Spjd * the same (spare replaces spare, non-spare replaces 4801168404Spjd * non-spare). 4802168404Spjd */ 4803219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && 4804219089Spjd spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4805168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4806219089Spjd } else if (pvd->vdev_ops == &vdev_spare_ops && 4807219089Spjd newvd->vdev_isspare != oldvd->vdev_isspare) { 4808168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4809219089Spjd } 4810219089Spjd 4811219089Spjd if (newvd->vdev_isspare) 4812168404Spjd pvops = &vdev_spare_ops; 4813168404Spjd else 4814168404Spjd pvops = &vdev_replacing_ops; 4815168404Spjd } 4816168404Spjd 4817168404Spjd /* 4818219089Spjd * Make sure the new device is big enough. 4819168404Spjd */ 4820219089Spjd if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4821168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4822168404Spjd 4823168404Spjd /* 4824168404Spjd * The new device cannot have a higher alignment requirement 4825168404Spjd * than the top-level vdev. 4826168404Spjd */ 4827168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4828168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4829168404Spjd 4830168404Spjd /* 4831168404Spjd * If this is an in-place replacement, update oldvd's path and devid 4832168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 4833168404Spjd */ 4834168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4835168404Spjd spa_strfree(oldvd->vdev_path); 4836168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4837168404Spjd KM_SLEEP); 4838168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 4839168404Spjd newvd->vdev_path, "old"); 4840168404Spjd if (oldvd->vdev_devid != NULL) { 4841168404Spjd spa_strfree(oldvd->vdev_devid); 4842168404Spjd oldvd->vdev_devid = NULL; 4843168404Spjd } 4844168404Spjd } 4845168404Spjd 4846219089Spjd /* mark the device being resilvered */ 4847254112Sdelphij newvd->vdev_resilver_txg = txg; 4848219089Spjd 4849168404Spjd /* 4850168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 4851168404Spjd * mirror/replacing/spare vdev above oldvd. 
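 *
 * For example, in the notation used by the detach comment below,
 * attaching B to a plain disk A yields M(A,B); a 'replacing' attach
 * yields R(A,B), which collapses back to just B once B has finished
 * resilvering and A is detached.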
4852168404Spjd */ 4853168404Spjd if (pvd->vdev_ops != pvops) 4854168404Spjd pvd = vdev_add_parent(oldvd, pvops); 4855168404Spjd 4856168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 4857168404Spjd ASSERT(pvd->vdev_ops == pvops); 4858168404Spjd ASSERT(oldvd->vdev_parent == pvd); 4859168404Spjd 4860168404Spjd /* 4861168404Spjd * Extract the new device from its root and add it to pvd. 4862168404Spjd */ 4863168404Spjd vdev_remove_child(newrootvd, newvd); 4864168404Spjd newvd->vdev_id = pvd->vdev_children; 4865219089Spjd newvd->vdev_crtxg = oldvd->vdev_crtxg; 4866168404Spjd vdev_add_child(pvd, newvd); 4867168404Spjd 4868168404Spjd tvd = newvd->vdev_top; 4869168404Spjd ASSERT(pvd->vdev_top == tvd); 4870168404Spjd ASSERT(tvd->vdev_parent == rvd); 4871168404Spjd 4872168404Spjd vdev_config_dirty(tvd); 4873168404Spjd 4874168404Spjd /* 4875219089Spjd * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4876219089Spjd * for any dmu_sync-ed blocks. It will propagate upward when 4877219089Spjd * spa_vdev_exit() calls vdev_dtl_reassess(). 4878168404Spjd */ 4879219089Spjd dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4880168404Spjd 4881219089Spjd vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4882219089Spjd dtl_max_txg - TXG_INITIAL); 4883168404Spjd 4884209962Smm if (newvd->vdev_isspare) { 4885168404Spjd spa_spare_activate(newvd); 4886209962Smm spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4887209962Smm } 4888209962Smm 4889185029Spjd oldvdpath = spa_strdup(oldvd->vdev_path); 4890185029Spjd newvdpath = spa_strdup(newvd->vdev_path); 4891185029Spjd newvd_isspare = newvd->vdev_isspare; 4892168404Spjd 4893168404Spjd /* 4894168404Spjd * Mark newvd's DTL dirty in this txg. 4895168404Spjd */ 4896168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 4897168404Spjd 4898219089Spjd /* 4899258717Savg * Schedule the resilver to restart in the future. We do this to 4900258717Savg * ensure that dmu_sync-ed blocks have been stitched into the 4901258717Savg * respective datasets. 4902219089Spjd */ 4903219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4904168404Spjd 4905219089Spjd /* 4906219089Spjd * Commit the config 4907219089Spjd */ 4908219089Spjd (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4909185029Spjd 4910248571Smm spa_history_log_internal(spa, "vdev attach", NULL, 4911219089Spjd "%s vdev=%s %s vdev=%s", 4912219089Spjd replacing && newvd_isspare ? "spare in" : 4913219089Spjd replacing ? "replace" : "attach", newvdpath, 4914219089Spjd replacing ? "for" : "to", oldvdpath); 4915219089Spjd 4916185029Spjd spa_strfree(oldvdpath); 4917185029Spjd spa_strfree(newvdpath); 4918185029Spjd 4919219089Spjd if (spa->spa_bootfs) 4920219089Spjd spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4921168404Spjd 4922168404Spjd return (0); 4923168404Spjd} 4924168404Spjd 4925168404Spjd/* 4926168404Spjd * Detach a device from a mirror or replacing vdev. 4927251631Sdelphij * 4928168404Spjd * If 'replace_done' is specified, only detach if the parent 4929168404Spjd * is a replacing vdev. 
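 *
 * (As the checks below show, a 'spare' parent is accepted as well when
 * 'replace_done' is set.)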
4930168404Spjd */ 4931168404Spjdint 4932209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4933168404Spjd{ 4934168404Spjd uint64_t txg; 4935209962Smm int error; 4936168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4937168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 4938168404Spjd boolean_t unspare = B_FALSE; 4939247187Smm uint64_t unspare_guid = 0; 4940219089Spjd char *vdpath; 4941168404Spjd 4942219089Spjd ASSERT(spa_writeable(spa)); 4943219089Spjd 4944168404Spjd txg = spa_vdev_enter(spa); 4945168404Spjd 4946185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4947168404Spjd 4948168404Spjd if (vd == NULL) 4949168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4950168404Spjd 4951168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 4952168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4953168404Spjd 4954168404Spjd pvd = vd->vdev_parent; 4955168404Spjd 4956168404Spjd /* 4957209962Smm * If the parent/child relationship is not as expected, don't do it. 4958209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4959209962Smm * vdev that's replacing B with C. The user's intent in replacing 4960209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 4961209962Smm * the replace by detaching C, the expected behavior is to end up 4962209962Smm * M(A,B). But suppose that right after deciding to detach C, 4963209962Smm * the replacement of B completes. We would have M(A,C), and then 4964209962Smm * ask to detach C, which would leave us with just A -- not what 4965209962Smm * the user wanted. To prevent this, we make sure that the 4966209962Smm * parent/child relationship hasn't changed -- in this example, 4967209962Smm * that C's parent is still the replacing vdev R. 4968209962Smm */ 4969209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 4970209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4971209962Smm 4972209962Smm /* 4973219089Spjd * Only 'replacing' or 'spare' vdevs can be replaced. 4974168404Spjd */ 4975219089Spjd if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4976219089Spjd pvd->vdev_ops != &vdev_spare_ops) 4977219089Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4978168404Spjd 4979168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4980185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 4981168404Spjd 4982168404Spjd /* 4983168404Spjd * Only mirror, replacing, and spare vdevs support detach. 4984168404Spjd */ 4985168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 4986168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 4987168404Spjd pvd->vdev_ops != &vdev_spare_ops) 4988168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4989168404Spjd 4990168404Spjd /* 4991209962Smm * If this device has the only valid copy of some data, 4992209962Smm * we cannot safely detach it. 4993168404Spjd */ 4994209962Smm if (vdev_dtl_required(vd)) 4995168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4996168404Spjd 4997209962Smm ASSERT(pvd->vdev_children >= 2); 4998168404Spjd 4999168404Spjd /* 5000185029Spjd * If we are detaching the second disk from a replacing vdev, then 5001185029Spjd * check to see if we changed the original vdev's path to have "/old" 5002185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 
5003168404Spjd */ 5004219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 5005219089Spjd vd->vdev_path != NULL) { 5006219089Spjd size_t len = strlen(vd->vdev_path); 5007219089Spjd 5008219089Spjd for (int c = 0; c < pvd->vdev_children; c++) { 5009219089Spjd cvd = pvd->vdev_child[c]; 5010219089Spjd 5011219089Spjd if (cvd == vd || cvd->vdev_path == NULL) 5012219089Spjd continue; 5013219089Spjd 5014219089Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 5015219089Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 5016219089Spjd spa_strfree(cvd->vdev_path); 5017219089Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 5018219089Spjd break; 5019219089Spjd } 5020185029Spjd } 5021185029Spjd } 5022168404Spjd 5023168404Spjd /* 5024168404Spjd * If we are detaching the original disk from a spare, then it implies 5025168404Spjd * that the spare should become a real disk, and be removed from the 5026168404Spjd * active spare list for the pool. 5027168404Spjd */ 5028168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 5029219089Spjd vd->vdev_id == 0 && 5030219089Spjd pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 5031168404Spjd unspare = B_TRUE; 5032168404Spjd 5033168404Spjd /* 5034168404Spjd * Erase the disk labels so the disk can be used for other things. 5035168404Spjd * This must be done after all other error cases are handled, 5036168404Spjd * but before we disembowel vd (so we can still do I/O to it). 5037168404Spjd * But if we can't do it, don't treat the error as fatal -- 5038168404Spjd * it may be that the unwritability of the disk is the reason 5039168404Spjd * it's being detached! 5040168404Spjd */ 5041168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5042168404Spjd 5043168404Spjd /* 5044168404Spjd * Remove vd from its parent and compact the parent's children. 5045168404Spjd */ 5046168404Spjd vdev_remove_child(pvd, vd); 5047168404Spjd vdev_compact_children(pvd); 5048168404Spjd 5049168404Spjd /* 5050168404Spjd * Remember one of the remaining children so we can get tvd below. 5051168404Spjd */ 5052219089Spjd cvd = pvd->vdev_child[pvd->vdev_children - 1]; 5053168404Spjd 5054168404Spjd /* 5055168404Spjd * If we need to remove the remaining child from the list of hot spares, 5056209962Smm * do it now, marking the vdev as no longer a spare in the process. 5057209962Smm * We must do this before vdev_remove_parent(), because that can 5058209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 5059209962Smm * reason, we must remove the spare now, in the same txg as the detach; 5060209962Smm * otherwise someone could attach a new sibling, change the GUID, and 5061209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 5062168404Spjd */ 5063168404Spjd if (unspare) { 5064168404Spjd ASSERT(cvd->vdev_isspare); 5065168404Spjd spa_spare_remove(cvd); 5066168404Spjd unspare_guid = cvd->vdev_guid; 5067209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 5068219089Spjd cvd->vdev_unspare = B_TRUE; 5069168404Spjd } 5070168404Spjd 5071168404Spjd /* 5072168404Spjd * If the parent mirror/replacing vdev only has one child, 5073168404Spjd * the parent is no longer needed. Remove it from the tree. 
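 *
 * For example, detaching B from the two-way mirror M(A,B) leaves a
 * one-child mirror, which vdev_remove_parent() collapses so that A
 * takes the mirror's place in the tree.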
5074168404Spjd */ 5075219089Spjd if (pvd->vdev_children == 1) { 5076219089Spjd if (pvd->vdev_ops == &vdev_spare_ops) 5077219089Spjd cvd->vdev_unspare = B_FALSE; 5078168404Spjd vdev_remove_parent(cvd); 5079219089Spjd } 5080168404Spjd 5081219089Spjd 5082168404Spjd /* 5083168404Spjd * We don't set tvd until now because the parent we just removed 5084168404Spjd * may have been the previous top-level vdev. 5085168404Spjd */ 5086168404Spjd tvd = cvd->vdev_top; 5087168404Spjd ASSERT(tvd->vdev_parent == rvd); 5088168404Spjd 5089168404Spjd /* 5090168404Spjd * Reevaluate the parent vdev state. 5091168404Spjd */ 5092185029Spjd vdev_propagate_state(cvd); 5093168404Spjd 5094168404Spjd /* 5095219089Spjd * If the 'autoexpand' property is set on the pool then automatically 5096219089Spjd * try to expand the size of the pool. For example if the device we 5097219089Spjd * just detached was smaller than the others, it may be possible to 5098219089Spjd * add metaslabs (i.e. grow the pool). We need to reopen the vdev 5099219089Spjd * first so that we can obtain the updated sizes of the leaf vdevs. 5100168404Spjd */ 5101219089Spjd if (spa->spa_autoexpand) { 5102219089Spjd vdev_reopen(tvd); 5103219089Spjd vdev_expand(tvd, txg); 5104219089Spjd } 5105168404Spjd 5106168404Spjd vdev_config_dirty(tvd); 5107168404Spjd 5108168404Spjd /* 5109168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 5110168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 5111168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 5112168404Spjd * prevent vd from being accessed after it's freed. 5113168404Spjd */ 5114219089Spjd vdpath = spa_strdup(vd->vdev_path); 5115209962Smm for (int t = 0; t < TXG_SIZE; t++) 5116168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5117168404Spjd vd->vdev_detached = B_TRUE; 5118168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 5119168404Spjd 5120185029Spjd spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 5121185029Spjd 5122219089Spjd /* hang on to the spa before we release the lock */ 5123219089Spjd spa_open_ref(spa, FTAG); 5124219089Spjd 5125168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 5126168404Spjd 5127248571Smm spa_history_log_internal(spa, "detach", NULL, 5128219089Spjd "vdev=%s", vdpath); 5129219089Spjd spa_strfree(vdpath); 5130219089Spjd 5131168404Spjd /* 5132168404Spjd * If this was the removal of the original device in a hot spare vdev, 5133168404Spjd * then we want to go through and remove the device from the hot spare 5134168404Spjd * list of every other pool. 
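 *
 * A hot spare may be listed as a spare by several pools at once, which
 * is why the loop below walks every other active pool with spa_next()
 * and calls spa_vdev_remove() there with the spare's guid.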
5135168404Spjd */ 5136168404Spjd if (unspare) { 5137219089Spjd spa_t *altspa = NULL; 5138219089Spjd 5139168404Spjd mutex_enter(&spa_namespace_lock); 5140219089Spjd while ((altspa = spa_next(altspa)) != NULL) { 5141219089Spjd if (altspa->spa_state != POOL_STATE_ACTIVE || 5142219089Spjd altspa == spa) 5143168404Spjd continue; 5144219089Spjd 5145219089Spjd spa_open_ref(altspa, FTAG); 5146185029Spjd mutex_exit(&spa_namespace_lock); 5147219089Spjd (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5148185029Spjd mutex_enter(&spa_namespace_lock); 5149219089Spjd spa_close(altspa, FTAG); 5150168404Spjd } 5151168404Spjd mutex_exit(&spa_namespace_lock); 5152219089Spjd 5153219089Spjd /* search the rest of the vdevs for spares to remove */ 5154219089Spjd spa_vdev_resilver_done(spa); 5155168404Spjd } 5156168404Spjd 5157219089Spjd /* all done with the spa; OK to release */ 5158219089Spjd mutex_enter(&spa_namespace_lock); 5159219089Spjd spa_close(spa, FTAG); 5160219089Spjd mutex_exit(&spa_namespace_lock); 5161219089Spjd 5162168404Spjd return (error); 5163168404Spjd} 5164168404Spjd 5165219089Spjd/* 5166219089Spjd * Split a set of devices from their mirrors, and create a new pool from them. 5167219089Spjd */ 5168219089Spjdint 5169219089Spjdspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5170219089Spjd nvlist_t *props, boolean_t exp) 5171219089Spjd{ 5172219089Spjd int error = 0; 5173219089Spjd uint64_t txg, *glist; 5174219089Spjd spa_t *newspa; 5175219089Spjd uint_t c, children, lastlog; 5176219089Spjd nvlist_t **child, *nvl, *tmp; 5177219089Spjd dmu_tx_t *tx; 5178219089Spjd char *altroot = NULL; 5179219089Spjd vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5180219089Spjd boolean_t activate_slog; 5181219089Spjd 5182219089Spjd ASSERT(spa_writeable(spa)); 5183219089Spjd 5184219089Spjd txg = spa_vdev_enter(spa); 5185219089Spjd 5186219089Spjd /* clear the log and flush everything up to now */ 5187219089Spjd activate_slog = spa_passivate_log(spa); 5188219089Spjd (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5189219089Spjd error = spa_offline_log(spa); 5190219089Spjd txg = spa_vdev_config_enter(spa); 5191219089Spjd 5192219089Spjd if (activate_slog) 5193219089Spjd spa_activate_log(spa); 5194219089Spjd 5195219089Spjd if (error != 0) 5196219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5197219089Spjd 5198219089Spjd /* check new spa name before going any further */ 5199219089Spjd if (spa_lookup(newname) != NULL) 5200219089Spjd return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5201219089Spjd 5202219089Spjd /* 5203219089Spjd * scan through all the children to ensure they're all mirrors 5204219089Spjd */ 5205219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5206219089Spjd nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5207219089Spjd &children) != 0) 5208219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5209219089Spjd 5210219089Spjd /* first, check to ensure we've got the right child count */ 5211219089Spjd rvd = spa->spa_root_vdev; 5212219089Spjd lastlog = 0; 5213219089Spjd for (c = 0; c < rvd->vdev_children; c++) { 5214219089Spjd vdev_t *vd = rvd->vdev_child[c]; 5215219089Spjd 5216219089Spjd /* don't count the holes & logs as children */ 5217219089Spjd if (vd->vdev_islog || vd->vdev_ishole) { 5218219089Spjd if (lastlog == 0) 5219219089Spjd lastlog = c; 5220219089Spjd continue; 5221219089Spjd } 5222219089Spjd 5223219089Spjd lastlog = 0; 5224219089Spjd } 5225219089Spjd if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 5226219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5227219089Spjd 5228219089Spjd /* next, ensure no spare or cache devices are part of the split */ 5229219089Spjd if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5230219089Spjd nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5231219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5232219089Spjd 5233219089Spjd vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5234219089Spjd glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5235219089Spjd 5236219089Spjd /* then, loop over each vdev and validate it */ 5237219089Spjd for (c = 0; c < children; c++) { 5238219089Spjd uint64_t is_hole = 0; 5239219089Spjd 5240219089Spjd (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5241219089Spjd &is_hole); 5242219089Spjd 5243219089Spjd if (is_hole != 0) { 5244219089Spjd if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5245219089Spjd spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5246219089Spjd continue; 5247219089Spjd } else { 5248249195Smm error = SET_ERROR(EINVAL); 5249219089Spjd break; 5250219089Spjd } 5251219089Spjd } 5252219089Spjd 5253219089Spjd /* which disk is going to be split? */ 5254219089Spjd if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5255219089Spjd &glist[c]) != 0) { 5256249195Smm error = SET_ERROR(EINVAL); 5257219089Spjd break; 5258219089Spjd } 5259219089Spjd 5260219089Spjd /* look it up in the spa */ 5261219089Spjd vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5262219089Spjd if (vml[c] == NULL) { 5263249195Smm error = SET_ERROR(ENODEV); 5264219089Spjd break; 5265219089Spjd } 5266219089Spjd 5267219089Spjd /* make sure there's nothing stopping the split */ 5268219089Spjd if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5269219089Spjd vml[c]->vdev_islog || 5270219089Spjd vml[c]->vdev_ishole || 5271219089Spjd vml[c]->vdev_isspare || 5272219089Spjd vml[c]->vdev_isl2cache || 5273219089Spjd !vdev_writeable(vml[c]) || 5274219089Spjd vml[c]->vdev_children != 0 || 5275219089Spjd vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5276219089Spjd c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5277249195Smm error = SET_ERROR(EINVAL); 5278219089Spjd break; 5279219089Spjd } 5280219089Spjd 5281219089Spjd if (vdev_dtl_required(vml[c])) { 5282249195Smm error = SET_ERROR(EBUSY); 5283219089Spjd break; 5284219089Spjd } 5285219089Spjd 5286219089Spjd /* we need certain info from the top level */ 5287219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5288219089Spjd vml[c]->vdev_top->vdev_ms_array) == 0); 5289219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5290219089Spjd vml[c]->vdev_top->vdev_ms_shift) == 0); 5291219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5292219089Spjd vml[c]->vdev_top->vdev_asize) == 0); 5293219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5294219089Spjd vml[c]->vdev_top->vdev_ashift) == 0); 5295219089Spjd } 5296219089Spjd 5297219089Spjd if (error != 0) { 5298219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5299219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5300219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5301219089Spjd } 5302219089Spjd 5303219089Spjd /* stop writers from using the disks */ 5304219089Spjd for (c = 0; c < children; c++) { 5305219089Spjd if (vml[c] != NULL) 5306219089Spjd vml[c]->vdev_offline = B_TRUE; 5307219089Spjd } 5308219089Spjd vdev_reopen(spa->spa_root_vdev); 
5309219089Spjd 5310219089Spjd /* 5311219089Spjd * Temporarily record the splitting vdevs in the spa config. This 5312219089Spjd * will disappear once the config is regenerated. 5313219089Spjd */ 5314219089Spjd VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5315219089Spjd VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5316219089Spjd glist, children) == 0); 5317219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5318219089Spjd 5319219089Spjd mutex_enter(&spa->spa_props_lock); 5320219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5321219089Spjd nvl) == 0); 5322219089Spjd mutex_exit(&spa->spa_props_lock); 5323219089Spjd spa->spa_config_splitting = nvl; 5324219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5325219089Spjd 5326219089Spjd /* configure and create the new pool */ 5327219089Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5328219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5329219089Spjd exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5330219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5331219089Spjd spa_version(spa)) == 0); 5332219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5333219089Spjd spa->spa_config_txg) == 0); 5334219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5335219089Spjd spa_generate_guid(NULL)) == 0); 5336219089Spjd (void) nvlist_lookup_string(props, 5337219089Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5338219089Spjd 5339219089Spjd /* add the new pool to the namespace */ 5340219089Spjd newspa = spa_add(newname, config, altroot); 5341219089Spjd newspa->spa_config_txg = spa->spa_config_txg; 5342219089Spjd spa_set_log_state(newspa, SPA_LOG_CLEAR); 5343219089Spjd 5344219089Spjd /* release the spa config lock, retaining the namespace lock */ 5345219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5346219089Spjd 5347219089Spjd if (zio_injection_enabled) 5348219089Spjd zio_handle_panic_injection(spa, FTAG, 1); 5349219089Spjd 5350219089Spjd spa_activate(newspa, spa_mode_global); 5351219089Spjd spa_async_suspend(newspa); 5352219089Spjd 5353277300Ssmh#ifndef illumos 5354219089Spjd /* mark that we are creating new spa by splitting */ 5355219089Spjd newspa->spa_splitting_newspa = B_TRUE; 5356219089Spjd#endif 5357219089Spjd /* create the new pool from the disks of the original pool */ 5358219089Spjd error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5359277300Ssmh#ifndef illumos 5360219089Spjd newspa->spa_splitting_newspa = B_FALSE; 5361219089Spjd#endif 5362219089Spjd if (error) 5363219089Spjd goto out; 5364219089Spjd 5365219089Spjd /* if that worked, generate a real config for the new pool */ 5366219089Spjd if (newspa->spa_root_vdev != NULL) { 5367219089Spjd VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5368219089Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 5369219089Spjd VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5370219089Spjd ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5371219089Spjd spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5372219089Spjd B_TRUE)); 5373219089Spjd } 5374219089Spjd 5375219089Spjd /* set the props */ 5376219089Spjd if (props != NULL) { 5377219089Spjd spa_configfile_set(newspa, props, B_FALSE); 5378219089Spjd error = spa_prop_set(newspa, props); 5379219089Spjd if (error) 5380219089Spjd goto out; 5381219089Spjd } 5382219089Spjd 5383219089Spjd /* flush everything */ 5384219089Spjd txg = spa_vdev_config_enter(newspa); 
5385219089Spjd vdev_config_dirty(newspa->spa_root_vdev); 5386219089Spjd (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5387219089Spjd 5388219089Spjd if (zio_injection_enabled) 5389219089Spjd zio_handle_panic_injection(spa, FTAG, 2); 5390219089Spjd 5391219089Spjd spa_async_resume(newspa); 5392219089Spjd 5393219089Spjd /* finally, update the original pool's config */ 5394219089Spjd txg = spa_vdev_config_enter(spa); 5395219089Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5396219089Spjd error = dmu_tx_assign(tx, TXG_WAIT); 5397219089Spjd if (error != 0) 5398219089Spjd dmu_tx_abort(tx); 5399219089Spjd for (c = 0; c < children; c++) { 5400219089Spjd if (vml[c] != NULL) { 5401219089Spjd vdev_split(vml[c]); 5402219089Spjd if (error == 0) 5403248571Smm spa_history_log_internal(spa, "detach", tx, 5404248571Smm "vdev=%s", vml[c]->vdev_path); 5405219089Spjd vdev_free(vml[c]); 5406219089Spjd } 5407219089Spjd } 5408219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5409219089Spjd spa->spa_config_splitting = NULL; 5410219089Spjd nvlist_free(nvl); 5411219089Spjd if (error == 0) 5412219089Spjd dmu_tx_commit(tx); 5413219089Spjd (void) spa_vdev_exit(spa, NULL, txg, 0); 5414219089Spjd 5415219089Spjd if (zio_injection_enabled) 5416219089Spjd zio_handle_panic_injection(spa, FTAG, 3); 5417219089Spjd 5418219089Spjd /* split is complete; log a history record */ 5419248571Smm spa_history_log_internal(newspa, "split", NULL, 5420248571Smm "from pool %s", spa_name(spa)); 5421219089Spjd 5422219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5423219089Spjd 5424219089Spjd /* if we're not going to mount the filesystems in userland, export */ 5425219089Spjd if (exp) 5426219089Spjd error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5427219089Spjd B_FALSE, B_FALSE); 5428219089Spjd 5429219089Spjd return (error); 5430219089Spjd 5431219089Spjdout: 5432219089Spjd spa_unload(newspa); 5433219089Spjd spa_deactivate(newspa); 5434219089Spjd spa_remove(newspa); 5435219089Spjd 5436219089Spjd txg = spa_vdev_config_enter(spa); 5437219089Spjd 5438219089Spjd /* re-online all offlined disks */ 5439219089Spjd for (c = 0; c < children; c++) { 5440219089Spjd if (vml[c] != NULL) 5441219089Spjd vml[c]->vdev_offline = B_FALSE; 5442219089Spjd } 5443219089Spjd vdev_reopen(spa->spa_root_vdev); 5444219089Spjd 5445219089Spjd nvlist_free(spa->spa_config_splitting); 5446219089Spjd spa->spa_config_splitting = NULL; 5447219089Spjd (void) spa_vdev_exit(spa, NULL, txg, error); 5448219089Spjd 5449219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5450219089Spjd return (error); 5451219089Spjd} 5452219089Spjd 5453185029Spjdstatic nvlist_t * 5454185029Spjdspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5455185029Spjd{ 5456185029Spjd for (int i = 0; i < count; i++) { 5457185029Spjd uint64_t guid; 5458185029Spjd 5459185029Spjd VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5460185029Spjd &guid) == 0); 5461185029Spjd 5462185029Spjd if (guid == target_guid) 5463185029Spjd return (nvpp[i]); 5464185029Spjd } 5465185029Spjd 5466185029Spjd return (NULL); 5467185029Spjd} 5468185029Spjd 5469185029Spjdstatic void 5470185029Spjdspa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5471185029Spjd nvlist_t *dev_to_remove) 5472185029Spjd{ 5473185029Spjd nvlist_t **newdev = NULL; 5474185029Spjd 5475185029Spjd if (count > 1) 5476185029Spjd newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5477185029Spjd 5478185029Spjd for (int i = 0, j = 0; i < count; i++) { 
5479185029Spjd if (dev[i] == dev_to_remove) 5480185029Spjd continue; 5481185029Spjd VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5482185029Spjd } 5483185029Spjd 5484185029Spjd VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5485185029Spjd VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5486185029Spjd 5487185029Spjd for (int i = 0; i < count - 1; i++) 5488185029Spjd nvlist_free(newdev[i]); 5489185029Spjd 5490185029Spjd if (count > 1) 5491185029Spjd kmem_free(newdev, (count - 1) * sizeof (void *)); 5492185029Spjd} 5493185029Spjd 5494168404Spjd/* 5495219089Spjd * Evacuate the device. 5496219089Spjd */ 5497219089Spjdstatic int 5498219089Spjdspa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5499219089Spjd{ 5500219089Spjd uint64_t txg; 5501219089Spjd int error = 0; 5502219089Spjd 5503219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5504219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5505219089Spjd ASSERT(vd == vd->vdev_top); 5506219089Spjd 5507219089Spjd /* 5508219089Spjd * Evacuate the device. We don't hold the config lock as writer 5509219089Spjd * since we need to do I/O but we do keep the 5510219089Spjd * spa_namespace_lock held. Once this completes the device 5511219089Spjd * should no longer have any blocks allocated on it. 5512219089Spjd */ 5513219089Spjd if (vd->vdev_islog) { 5514219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5515219089Spjd error = spa_offline_log(spa); 5516219089Spjd } else { 5517249195Smm error = SET_ERROR(ENOTSUP); 5518219089Spjd } 5519219089Spjd 5520219089Spjd if (error) 5521219089Spjd return (error); 5522219089Spjd 5523219089Spjd /* 5524219089Spjd * The evacuation succeeded. Remove any remaining MOS metadata 5525219089Spjd * associated with this vdev, and wait for these changes to sync. 5526219089Spjd */ 5527240415Smm ASSERT0(vd->vdev_stat.vs_alloc); 5528219089Spjd txg = spa_vdev_config_enter(spa); 5529219089Spjd vd->vdev_removing = B_TRUE; 5530258717Savg vdev_dirty_leaves(vd, VDD_DTL, txg); 5531219089Spjd vdev_config_dirty(vd); 5532219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5533219089Spjd 5534219089Spjd return (0); 5535219089Spjd} 5536219089Spjd 5537219089Spjd/* 5538219089Spjd * Complete the removal by cleaning up the namespace. 5539219089Spjd */ 5540219089Spjdstatic void 5541219089Spjdspa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5542219089Spjd{ 5543219089Spjd vdev_t *rvd = spa->spa_root_vdev; 5544219089Spjd uint64_t id = vd->vdev_id; 5545219089Spjd boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5546219089Spjd 5547219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5548219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5549219089Spjd ASSERT(vd == vd->vdev_top); 5550219089Spjd 5551219089Spjd /* 5552219089Spjd * Only remove any devices which are empty. 
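 *
 * When the removed vdev is not the last top-level child, the code below
 * plugs the vacated slot with a hole vdev (vdev_alloc_common() with
 * vdev_hole_ops) so the remaining top-level vdev ids keep their
 * positions.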
5553219089Spjd */ 5554219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5555219089Spjd return; 5556219089Spjd 5557219089Spjd (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5558219089Spjd 5559219089Spjd if (list_link_active(&vd->vdev_state_dirty_node)) 5560219089Spjd vdev_state_clean(vd); 5561219089Spjd if (list_link_active(&vd->vdev_config_dirty_node)) 5562219089Spjd vdev_config_clean(vd); 5563219089Spjd 5564219089Spjd vdev_free(vd); 5565219089Spjd 5566219089Spjd if (last_vdev) { 5567219089Spjd vdev_compact_children(rvd); 5568219089Spjd } else { 5569219089Spjd vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5570219089Spjd vdev_add_child(rvd, vd); 5571219089Spjd } 5572219089Spjd vdev_config_dirty(rvd); 5573219089Spjd 5574219089Spjd /* 5575219089Spjd * Reassess the health of our root vdev. 5576219089Spjd */ 5577219089Spjd vdev_reopen(rvd); 5578219089Spjd} 5579219089Spjd 5580219089Spjd/* 5581219089Spjd * Remove a device from the pool - 5582219089Spjd * 5583219089Spjd * Removing a device from the vdev namespace requires several steps 5584219089Spjd * and can take a significant amount of time. As a result we use 5585219089Spjd * the spa_vdev_config_[enter/exit] functions which allow us to 5586219089Spjd * grab and release the spa_config_lock while still holding the namespace 5587219089Spjd * lock. During each step the configuration is synced out. 5588251631Sdelphij * 5589251631Sdelphij * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5590251631Sdelphij * devices. 5591219089Spjd */ 5592168404Spjdint 5593168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5594168404Spjd{ 5595168404Spjd vdev_t *vd; 5596219089Spjd metaslab_group_t *mg; 5597185029Spjd nvlist_t **spares, **l2cache, *nv; 5598219089Spjd uint64_t txg = 0; 5599185029Spjd uint_t nspares, nl2cache; 5600185029Spjd int error = 0; 5601209962Smm boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5602168404Spjd 5603219089Spjd ASSERT(spa_writeable(spa)); 5604219089Spjd 5605209962Smm if (!locked) 5606209962Smm txg = spa_vdev_enter(spa); 5607168404Spjd 5608185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5609168404Spjd 5610185029Spjd if (spa->spa_spares.sav_vdevs != NULL && 5611185029Spjd nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5612185029Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5613185029Spjd (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5614185029Spjd /* 5615185029Spjd * Only remove the hot spare if it's not currently in use 5616185029Spjd * in this pool. 5617185029Spjd */ 5618185029Spjd if (vd == NULL || unspare) { 5619185029Spjd spa_vdev_remove_aux(spa->spa_spares.sav_config, 5620185029Spjd ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5621185029Spjd spa_load_spares(spa); 5622185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5623185029Spjd } else { 5624249195Smm error = SET_ERROR(EBUSY); 5625168404Spjd } 5626185029Spjd } else if (spa->spa_l2cache.sav_vdevs != NULL && 5627185029Spjd nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5628185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5629185029Spjd (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5630185029Spjd /* 5631185029Spjd * Cache devices can always be removed. 
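 *
 * (They hold only cached copies of pool data, so unlike log devices
 * there is nothing to evacuate first.)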
5632185029Spjd */ 5633185029Spjd spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5634185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5635185029Spjd spa_load_l2cache(spa); 5636185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5637219089Spjd } else if (vd != NULL && vd->vdev_islog) { 5638219089Spjd ASSERT(!locked); 5639219089Spjd ASSERT(vd == vd->vdev_top); 5640219089Spjd 5641219089Spjd mg = vd->vdev_mg; 5642219089Spjd 5643219089Spjd /* 5644219089Spjd * Stop allocating from this vdev. 5645219089Spjd */ 5646219089Spjd metaslab_group_passivate(mg); 5647219089Spjd 5648219089Spjd /* 5649219089Spjd * Wait for the youngest allocations and frees to sync, 5650219089Spjd * and then wait for the deferral of those frees to finish. 5651219089Spjd */ 5652219089Spjd spa_vdev_config_exit(spa, NULL, 5653219089Spjd txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5654219089Spjd 5655219089Spjd /* 5656219089Spjd * Attempt to evacuate the vdev. 5657219089Spjd */ 5658219089Spjd error = spa_vdev_remove_evacuate(spa, vd); 5659219089Spjd 5660219089Spjd txg = spa_vdev_config_enter(spa); 5661219089Spjd 5662219089Spjd /* 5663219089Spjd * If we couldn't evacuate the vdev, unwind. 5664219089Spjd */ 5665219089Spjd if (error) { 5666219089Spjd metaslab_group_activate(mg); 5667219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5668219089Spjd } 5669219089Spjd 5670219089Spjd /* 5671219089Spjd * Clean up the vdev namespace. 5672219089Spjd */ 5673219089Spjd spa_vdev_remove_from_namespace(spa, vd); 5674219089Spjd 5675185029Spjd } else if (vd != NULL) { 5676185029Spjd /* 5677185029Spjd * Normal vdevs cannot be removed (yet). 5678185029Spjd */ 5679249195Smm error = SET_ERROR(ENOTSUP); 5680168404Spjd } else { 5681185029Spjd /* 5682185029Spjd * There is no vdev of any kind with the specified guid. 5683185029Spjd */ 5684249195Smm error = SET_ERROR(ENOENT); 5685168404Spjd } 5686168404Spjd 5687209962Smm if (!locked) 5688209962Smm return (spa_vdev_exit(spa, NULL, txg, error)); 5689209962Smm 5690209962Smm return (error); 5691168404Spjd} 5692168404Spjd 5693168404Spjd/* 5694185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 5695251631Sdelphij * currently spared, so we can detach it. 5696168404Spjd */ 5697168404Spjdstatic vdev_t * 5698185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 5699168404Spjd{ 5700168404Spjd vdev_t *newvd, *oldvd; 5701168404Spjd 5702219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5703185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5704168404Spjd if (oldvd != NULL) 5705168404Spjd return (oldvd); 5706168404Spjd } 5707168404Spjd 5708185029Spjd /* 5709219089Spjd * Check for a completed replacement. We always consider the first 5710219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 5711219089Spjd * the newest (see spa_vdev_attach() for how that works). In 5712219089Spjd * the case where the newest vdev is faulted, we will not automatically 5713219089Spjd * remove it after a resilver completes. This is OK as it will require 5714219089Spjd * user intervention to determine which disk the admin wishes to keep. 
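 *
 * For example, given R(A,B) where B is the newly attached disk, A is
 * only returned for detach once B's DTL_MISSING and DTL_OUTAGE maps are
 * empty and A itself is no longer required by the DTL.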
5715185029Spjd */ 5716219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 5717219089Spjd ASSERT(vd->vdev_children > 1); 5718219089Spjd 5719219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 5720168404Spjd oldvd = vd->vdev_child[0]; 5721168404Spjd 5722209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 5723219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5724209962Smm !vdev_dtl_required(oldvd)) 5725168404Spjd return (oldvd); 5726168404Spjd } 5727168404Spjd 5728185029Spjd /* 5729185029Spjd * Check for a completed resilver with the 'unspare' flag set. 5730185029Spjd */ 5731219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 5732219089Spjd vdev_t *first = vd->vdev_child[0]; 5733219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5734185029Spjd 5735219089Spjd if (last->vdev_unspare) { 5736219089Spjd oldvd = first; 5737219089Spjd newvd = last; 5738219089Spjd } else if (first->vdev_unspare) { 5739219089Spjd oldvd = last; 5740219089Spjd newvd = first; 5741219089Spjd } else { 5742219089Spjd oldvd = NULL; 5743219089Spjd } 5744219089Spjd 5745219089Spjd if (oldvd != NULL && 5746209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 5747219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5748219089Spjd !vdev_dtl_required(oldvd)) 5749185029Spjd return (oldvd); 5750219089Spjd 5751219089Spjd /* 5752219089Spjd * If there are more than two spares attached to a disk, 5753219089Spjd * and those spares are not required, then we want to 5754219089Spjd * attempt to free them up now so that they can be used 5755219089Spjd * by other pools. Once we're back down to a single 5756219089Spjd * disk+spare, we stop removing them. 5757219089Spjd */ 5758219089Spjd if (vd->vdev_children > 2) { 5759219089Spjd newvd = vd->vdev_child[1]; 5760219089Spjd 5761219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 5762219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 5763219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 5764219089Spjd !vdev_dtl_required(newvd)) 5765219089Spjd return (newvd); 5766185029Spjd } 5767185029Spjd } 5768185029Spjd 5769168404Spjd return (NULL); 5770168404Spjd} 5771168404Spjd 5772168404Spjdstatic void 5773185029Spjdspa_vdev_resilver_done(spa_t *spa) 5774168404Spjd{ 5775209962Smm vdev_t *vd, *pvd, *ppvd; 5776209962Smm uint64_t guid, sguid, pguid, ppguid; 5777168404Spjd 5778209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5779168404Spjd 5780185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5781209962Smm pvd = vd->vdev_parent; 5782209962Smm ppvd = pvd->vdev_parent; 5783168404Spjd guid = vd->vdev_guid; 5784209962Smm pguid = pvd->vdev_guid; 5785209962Smm ppguid = ppvd->vdev_guid; 5786209962Smm sguid = 0; 5787168404Spjd /* 5788168404Spjd * If we have just finished replacing a hot spared device, then 5789168404Spjd * we need to detach the parent's first child (the original hot 5790168404Spjd * spare) as well. 
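 *
 * For example, if failed disk A was taken over by spare C and later
 * replaced by new disk B, the subtree typically looks like
 * spare(R(A,B), C); once B has resilvered, both A and the spare C are
 * detached, leaving just B.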
5791168404Spjd */ 5792219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5793219089Spjd ppvd->vdev_children == 2) { 5794168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5795209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 5796168404Spjd } 5797254112Sdelphij ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5798254112Sdelphij 5799209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5800209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5801168404Spjd return; 5802209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5803168404Spjd return; 5804209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5805168404Spjd } 5806168404Spjd 5807209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5808168404Spjd} 5809168404Spjd 5810168404Spjd/* 5811219089Spjd * Update the stored path or FRU for this vdev. 5812168404Spjd */ 5813168404Spjdint 5814209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5815209962Smm boolean_t ispath) 5816168404Spjd{ 5817185029Spjd vdev_t *vd; 5818219089Spjd boolean_t sync = B_FALSE; 5819168404Spjd 5820219089Spjd ASSERT(spa_writeable(spa)); 5821168404Spjd 5822219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 5823219089Spjd 5824209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5825219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5826168404Spjd 5827168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 5828219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5829168404Spjd 5830209962Smm if (ispath) { 5831219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 5832219089Spjd spa_strfree(vd->vdev_path); 5833219089Spjd vd->vdev_path = spa_strdup(value); 5834219089Spjd sync = B_TRUE; 5835219089Spjd } 5836209962Smm } else { 5837219089Spjd if (vd->vdev_fru == NULL) { 5838219089Spjd vd->vdev_fru = spa_strdup(value); 5839219089Spjd sync = B_TRUE; 5840219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 5841209962Smm spa_strfree(vd->vdev_fru); 5842219089Spjd vd->vdev_fru = spa_strdup(value); 5843219089Spjd sync = B_TRUE; 5844219089Spjd } 5845209962Smm } 5846168404Spjd 5847219089Spjd return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 5848168404Spjd} 5849168404Spjd 5850209962Smmint 5851209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5852209962Smm{ 5853209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5854209962Smm} 5855209962Smm 5856209962Smmint 5857209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5858209962Smm{ 5859209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5860209962Smm} 5861209962Smm 5862168404Spjd/* 5863168404Spjd * ========================================================================== 5864219089Spjd * SPA Scanning 5865168404Spjd * ========================================================================== 5866168404Spjd */ 5867168404Spjd 5868168404Spjdint 5869219089Spjdspa_scan_stop(spa_t *spa) 5870168404Spjd{ 5871185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5872219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5873249195Smm return (SET_ERROR(EBUSY)); 5874219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 5875219089Spjd} 5876168404Spjd 5877219089Spjdint 5878219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 5879219089Spjd{ 5880219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5881219089Spjd 5882219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5883249195Smm return (SET_ERROR(ENOTSUP)); 5884168404Spjd 5885168404Spjd /* 5886185029Spjd * If a resilver was requested, but there is no DTL on a 5887185029Spjd * writeable leaf device, we have nothing to do. 5888168404Spjd */ 5889219089Spjd if (func == POOL_SCAN_RESILVER && 5890185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5891185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5892168404Spjd return (0); 5893168404Spjd } 5894168404Spjd 5895219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 5896168404Spjd} 5897168404Spjd 5898168404Spjd/* 5899168404Spjd * ========================================================================== 5900168404Spjd * SPA async task processing 5901168404Spjd * ========================================================================== 5902168404Spjd */ 5903168404Spjd 5904168404Spjdstatic void 5905185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 5906168404Spjd{ 5907185029Spjd if (vd->vdev_remove_wanted) { 5908219089Spjd vd->vdev_remove_wanted = B_FALSE; 5909219089Spjd vd->vdev_delayed_close = B_FALSE; 5910185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5911209962Smm 5912209962Smm /* 5913209962Smm * We want to clear the stats, but we don't want to do a full 5914209962Smm * vdev_clear() as that will cause us to throw away 5915209962Smm * degraded/faulted state as well as attempt to reopen the 5916209962Smm * device, all of which is a waste. 
5917209962Smm */ 5918209962Smm vd->vdev_stat.vs_read_errors = 0; 5919209962Smm vd->vdev_stat.vs_write_errors = 0; 5920209962Smm vd->vdev_stat.vs_checksum_errors = 0; 5921209962Smm 5922185029Spjd vdev_state_dirty(vd->vdev_top); 5923185029Spjd } 5924168404Spjd 5925185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5926185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 5927185029Spjd} 5928168404Spjd 5929185029Spjdstatic void 5930185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 5931185029Spjd{ 5932185029Spjd if (vd->vdev_probe_wanted) { 5933219089Spjd vd->vdev_probe_wanted = B_FALSE; 5934185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 5935168404Spjd } 5936168404Spjd 5937185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5938185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 5939168404Spjd} 5940168404Spjd 5941168404Spjdstatic void 5942219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 5943219089Spjd{ 5944219089Spjd sysevent_id_t eid; 5945219089Spjd nvlist_t *attr; 5946219089Spjd char *physpath; 5947219089Spjd 5948219089Spjd if (!spa->spa_autoexpand) 5949219089Spjd return; 5950219089Spjd 5951219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5952219089Spjd vdev_t *cvd = vd->vdev_child[c]; 5953219089Spjd spa_async_autoexpand(spa, cvd); 5954219089Spjd } 5955219089Spjd 5956219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5957219089Spjd return; 5958219089Spjd 5959219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5960219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5961219089Spjd 5962219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5963219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5964219089Spjd 5965219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5966219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5967219089Spjd 5968219089Spjd nvlist_free(attr); 5969219089Spjd kmem_free(physpath, MAXPATHLEN); 5970219089Spjd} 5971219089Spjd 5972219089Spjdstatic void 5973168404Spjdspa_async_thread(void *arg) 5974168404Spjd{ 5975168404Spjd spa_t *spa = arg; 5976168404Spjd int tasks; 5977168404Spjd 5978168404Spjd ASSERT(spa->spa_sync_on); 5979168404Spjd 5980168404Spjd mutex_enter(&spa->spa_async_lock); 5981168404Spjd tasks = spa->spa_async_tasks; 5982253990Smav spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 5983168404Spjd mutex_exit(&spa->spa_async_lock); 5984168404Spjd 5985168404Spjd /* 5986168404Spjd * See if the config needs to be updated. 5987168404Spjd */ 5988168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5989219089Spjd uint64_t old_space, new_space; 5990219089Spjd 5991168404Spjd mutex_enter(&spa_namespace_lock); 5992219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 5993168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5994219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 5995168404Spjd mutex_exit(&spa_namespace_lock); 5996219089Spjd 5997219089Spjd /* 5998219089Spjd * If the pool grew as a result of the config update, 5999219089Spjd * then log an internal history event. 
6000219089Spjd */ 6001219089Spjd if (new_space != old_space) { 6002248571Smm spa_history_log_internal(spa, "vdev online", NULL, 6003219089Spjd "pool '%s' size: %llu(+%llu)", 6004219089Spjd spa_name(spa), new_space, new_space - old_space); 6005219089Spjd } 6006168404Spjd } 6007168404Spjd 6008219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 6009219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6010219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 6011219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 6012219089Spjd } 6013219089Spjd 6014168404Spjd /* 6015185029Spjd * See if any devices need to be probed. 6016168404Spjd */ 6017185029Spjd if (tasks & SPA_ASYNC_PROBE) { 6018219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 6019185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 6020185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 6021185029Spjd } 6022168404Spjd 6023168404Spjd /* 6024185029Spjd * If any devices are done replacing, detach them. 6025168404Spjd */ 6026185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 6027185029Spjd spa_vdev_resilver_done(spa); 6028168404Spjd 6029168404Spjd /* 6030168404Spjd * Kick off a resilver. 6031168404Spjd */ 6032168404Spjd if (tasks & SPA_ASYNC_RESILVER) 6033219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 6034168404Spjd 6035168404Spjd /* 6036168404Spjd * Let the world know that we're done. 6037168404Spjd */ 6038168404Spjd mutex_enter(&spa->spa_async_lock); 6039168404Spjd spa->spa_async_thread = NULL; 6040168404Spjd cv_broadcast(&spa->spa_async_cv); 6041168404Spjd mutex_exit(&spa->spa_async_lock); 6042168404Spjd thread_exit(); 6043168404Spjd} 6044168404Spjd 6045253990Smavstatic void 6046253990Smavspa_async_thread_vd(void *arg) 6047253990Smav{ 6048253990Smav spa_t *spa = arg; 6049253990Smav int tasks; 6050253990Smav 6051253990Smav ASSERT(spa->spa_sync_on); 6052253990Smav 6053253990Smav mutex_enter(&spa->spa_async_lock); 6054253990Smav tasks = spa->spa_async_tasks; 6055253990Smavretry: 6056253990Smav spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 6057253990Smav mutex_exit(&spa->spa_async_lock); 6058253990Smav 6059253990Smav /* 6060253990Smav * See if any devices need to be marked REMOVED. 6061253990Smav */ 6062253990Smav if (tasks & SPA_ASYNC_REMOVE) { 6063253990Smav spa_vdev_state_enter(spa, SCL_NONE); 6064253990Smav spa_async_remove(spa, spa->spa_root_vdev); 6065253990Smav for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 6066253990Smav spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 6067253990Smav for (int i = 0; i < spa->spa_spares.sav_count; i++) 6068253990Smav spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 6069253990Smav (void) spa_vdev_state_exit(spa, NULL, 0); 6070253990Smav } 6071253990Smav 6072253990Smav /* 6073253990Smav * Let the world know that we're done. 
6074253990Smav */ 6075253990Smav mutex_enter(&spa->spa_async_lock); 6076253990Smav tasks = spa->spa_async_tasks; 6077253990Smav if ((tasks & SPA_ASYNC_REMOVE) != 0) 6078253990Smav goto retry; 6079253990Smav spa->spa_async_thread_vd = NULL; 6080253990Smav cv_broadcast(&spa->spa_async_cv); 6081253990Smav mutex_exit(&spa->spa_async_lock); 6082253990Smav thread_exit(); 6083253990Smav} 6084253990Smav 6085168404Spjdvoid 6086168404Spjdspa_async_suspend(spa_t *spa) 6087168404Spjd{ 6088168404Spjd mutex_enter(&spa->spa_async_lock); 6089168404Spjd spa->spa_async_suspended++; 6090253990Smav while (spa->spa_async_thread != NULL && 6091253990Smav spa->spa_async_thread_vd != NULL) 6092168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 6093168404Spjd mutex_exit(&spa->spa_async_lock); 6094168404Spjd} 6095168404Spjd 6096168404Spjdvoid 6097168404Spjdspa_async_resume(spa_t *spa) 6098168404Spjd{ 6099168404Spjd mutex_enter(&spa->spa_async_lock); 6100168404Spjd ASSERT(spa->spa_async_suspended != 0); 6101168404Spjd spa->spa_async_suspended--; 6102168404Spjd mutex_exit(&spa->spa_async_lock); 6103168404Spjd} 6104168404Spjd 6105251636Sdelphijstatic boolean_t 6106251636Sdelphijspa_async_tasks_pending(spa_t *spa) 6107251636Sdelphij{ 6108251636Sdelphij uint_t non_config_tasks; 6109251636Sdelphij uint_t config_task; 6110251636Sdelphij boolean_t config_task_suspended; 6111251636Sdelphij 6112253990Smav non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 6113253990Smav SPA_ASYNC_REMOVE); 6114251636Sdelphij config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 6115251636Sdelphij if (spa->spa_ccw_fail_time == 0) { 6116251636Sdelphij config_task_suspended = B_FALSE; 6117251636Sdelphij } else { 6118251636Sdelphij config_task_suspended = 6119251636Sdelphij (gethrtime() - spa->spa_ccw_fail_time) < 6120251636Sdelphij (zfs_ccw_retry_interval * NANOSEC); 6121251636Sdelphij } 6122251636Sdelphij 6123251636Sdelphij return (non_config_tasks || (config_task && !config_task_suspended)); 6124251636Sdelphij} 6125251636Sdelphij 6126168404Spjdstatic void 6127168404Spjdspa_async_dispatch(spa_t *spa) 6128168404Spjd{ 6129168404Spjd mutex_enter(&spa->spa_async_lock); 6130251636Sdelphij if (spa_async_tasks_pending(spa) && 6131251636Sdelphij !spa->spa_async_suspended && 6132168404Spjd spa->spa_async_thread == NULL && 6133251636Sdelphij rootdir != NULL) 6134168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 6135168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 6136168404Spjd mutex_exit(&spa->spa_async_lock); 6137168404Spjd} 6138168404Spjd 6139253990Smavstatic void 6140253990Smavspa_async_dispatch_vd(spa_t *spa) 6141253990Smav{ 6142253990Smav mutex_enter(&spa->spa_async_lock); 6143253990Smav if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 6144253990Smav !spa->spa_async_suspended && 6145253990Smav spa->spa_async_thread_vd == NULL && 6146253990Smav rootdir != NULL) 6147253990Smav spa->spa_async_thread_vd = thread_create(NULL, 0, 6148253990Smav spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); 6149253990Smav mutex_exit(&spa->spa_async_lock); 6150253990Smav} 6151253990Smav 6152168404Spjdvoid 6153168404Spjdspa_async_request(spa_t *spa, int task) 6154168404Spjd{ 6155219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 6156168404Spjd mutex_enter(&spa->spa_async_lock); 6157168404Spjd spa->spa_async_tasks |= task; 6158168404Spjd mutex_exit(&spa->spa_async_lock); 6159253990Smav spa_async_dispatch_vd(spa); 6160168404Spjd} 6161168404Spjd 6162168404Spjd/* 6163168404Spjd 
* ========================================================================== 6164168404Spjd * SPA syncing routines 6165168404Spjd * ========================================================================== 6166168404Spjd */ 6167168404Spjd 6168219089Spjdstatic int 6169219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6170168404Spjd{ 6171219089Spjd bpobj_t *bpo = arg; 6172219089Spjd bpobj_enqueue(bpo, bp, tx); 6173219089Spjd return (0); 6174219089Spjd} 6175168404Spjd 6176219089Spjdstatic int 6177219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6178219089Spjd{ 6179219089Spjd zio_t *zio = arg; 6180168404Spjd 6181219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 6182240868Spjd BP_GET_PSIZE(bp), zio->io_flags)); 6183219089Spjd return (0); 6184168404Spjd} 6185168404Spjd 6186258632Savg/* 6187258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6188258632Savg * amount of time spent syncing frees. 6189258632Savg */ 6190168404Spjdstatic void 6191258632Savgspa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6192258632Savg{ 6193258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6194258632Savg bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6195258632Savg VERIFY(zio_wait(zio) == 0); 6196258632Savg} 6197258632Savg 6198258632Savg/* 6199258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6200258632Savg * amount of time spent syncing deferred frees. 6201258632Savg */ 6202258632Savgstatic void 6203258632Savgspa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6204258632Savg{ 6205258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6206258632Savg VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6207258632Savg spa_free_sync_cb, zio, tx), ==, 0); 6208258632Savg VERIFY0(zio_wait(zio)); 6209258632Savg} 6210258632Savg 6211258632Savg 6212258632Savgstatic void 6213168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6214168404Spjd{ 6215168404Spjd char *packed = NULL; 6216185029Spjd size_t bufsize; 6217168404Spjd size_t nvsize = 0; 6218168404Spjd dmu_buf_t *db; 6219168404Spjd 6220168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6221168404Spjd 6222185029Spjd /* 6223185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6224260150Sdelphij * information. This avoids the dmu_buf_will_dirty() path and 6225185029Spjd * saves us a pre-read to get data we don't actually care about. 
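	 * The buffer is therefore sized up to the next SPA_CONFIG_BLOCKSIZE
	 * boundary (P2ROUNDUP below) and its tail is zero-filled, so the
	 * single dmu_write() that follows always covers whole blocks.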
6226185029Spjd */ 6227236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6228185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 6229168404Spjd 6230168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6231168404Spjd KM_SLEEP) == 0); 6232185029Spjd bzero(packed + nvsize, bufsize - nvsize); 6233168404Spjd 6234185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6235168404Spjd 6236185029Spjd kmem_free(packed, bufsize); 6237168404Spjd 6238168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6239168404Spjd dmu_buf_will_dirty(db, tx); 6240168404Spjd *(uint64_t *)db->db_data = nvsize; 6241168404Spjd dmu_buf_rele(db, FTAG); 6242168404Spjd} 6243168404Spjd 6244168404Spjdstatic void 6245185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6246185029Spjd const char *config, const char *entry) 6247168404Spjd{ 6248168404Spjd nvlist_t *nvroot; 6249185029Spjd nvlist_t **list; 6250168404Spjd int i; 6251168404Spjd 6252185029Spjd if (!sav->sav_sync) 6253168404Spjd return; 6254168404Spjd 6255168404Spjd /* 6256185029Spjd * Update the MOS nvlist describing the list of available devices. 6257185029Spjd * spa_validate_aux() will have already made sure this nvlist is 6258185029Spjd * valid and the vdevs are labeled appropriately. 6259168404Spjd */ 6260185029Spjd if (sav->sav_object == 0) { 6261185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6262185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6263185029Spjd sizeof (uint64_t), tx); 6264168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 6265185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6266185029Spjd &sav->sav_object, tx) == 0); 6267168404Spjd } 6268168404Spjd 6269168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6270185029Spjd if (sav->sav_count == 0) { 6271185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6272168404Spjd } else { 6273185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6274185029Spjd for (i = 0; i < sav->sav_count; i++) 6275185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6276219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 6277185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6278185029Spjd sav->sav_count) == 0); 6279185029Spjd for (i = 0; i < sav->sav_count; i++) 6280185029Spjd nvlist_free(list[i]); 6281185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 6282168404Spjd } 6283168404Spjd 6284185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6285168404Spjd nvlist_free(nvroot); 6286168404Spjd 6287185029Spjd sav->sav_sync = B_FALSE; 6288168404Spjd} 6289168404Spjd 6290168404Spjdstatic void 6291168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6292168404Spjd{ 6293168404Spjd nvlist_t *config; 6294168404Spjd 6295185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) 6296168404Spjd return; 6297168404Spjd 6298185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6299168404Spjd 6300185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 6301185029Spjd dmu_tx_get_txg(tx), B_FALSE); 6302185029Spjd 6303243505Smm /* 6304243505Smm * If we're upgrading the spa version then make sure that 6305243505Smm * the config object gets updated with the correct version. 
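	 * (The uberblock already carries the new version at this point,
	 * e.g. via spa_sync_version() or spa_upgrade(); this simply keeps
	 * the stored config nvlist in agreement with it.)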
6306243505Smm	 */
6307243505Smm	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
6308243505Smm		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
6309243505Smm		    spa->spa_uberblock.ub_version);
6310243505Smm
6311185029Spjd	spa_config_exit(spa, SCL_STATE, FTAG);
6312185029Spjd
6313168404Spjd	if (spa->spa_config_syncing)
6314168404Spjd		nvlist_free(spa->spa_config_syncing);
6315168404Spjd	spa->spa_config_syncing = config;
6316168404Spjd
6317168404Spjd	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
6318168404Spjd}
6319168404Spjd
6320236884Smmstatic void
6321248571Smmspa_sync_version(void *arg, dmu_tx_t *tx)
6322236884Smm{
6323248571Smm	uint64_t *versionp = arg;
6324248571Smm	uint64_t version = *versionp;
6325248571Smm	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6326236884Smm
6327236884Smm	/*
6328236884Smm	 * Setting the version is special cased when first creating the pool.
6329236884Smm	 */
6330236884Smm	ASSERT(tx->tx_txg != TXG_INITIAL);
6331236884Smm
6332247592Sdelphij	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
6333236884Smm	ASSERT(version >= spa_version(spa));
6334236884Smm
6335236884Smm	spa->spa_uberblock.ub_version = version;
6336236884Smm	vdev_config_dirty(spa->spa_root_vdev);
6337248571Smm	spa_history_log_internal(spa, "set", tx, "version=%lld", version);
6338236884Smm}
6339236884Smm
6340185029Spjd/*
6341185029Spjd * Set zpool properties.
6342185029Spjd */
6343168404Spjdstatic void
6344248571Smmspa_sync_props(void *arg, dmu_tx_t *tx)
6345168404Spjd{
6346248571Smm	nvlist_t *nvp = arg;
6347248571Smm	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6348185029Spjd	objset_t *mos = spa->spa_meta_objset;
6349236884Smm	nvpair_t *elem = NULL;
6350168404Spjd
6351168404Spjd	mutex_enter(&spa->spa_props_lock);
6352168404Spjd
6353185029Spjd	while ((elem = nvlist_next_nvpair(nvp, elem))) {
6354236884Smm		uint64_t intval;
6355236884Smm		char *strval, *fname;
6356236884Smm		zpool_prop_t prop;
6357236884Smm		const char *propname;
6358236884Smm		zprop_type_t proptype;
6359259813Sdelphij		spa_feature_t fid;
6360236884Smm
6361185029Spjd		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
6362236884Smm		case ZPROP_INVAL:
6363236884Smm			/*
6364236884Smm			 * We checked this earlier in spa_prop_validate().
6365236884Smm			 */
6366236884Smm			ASSERT(zpool_prop_feature(nvpair_name(elem)));
6367236884Smm
6368236884Smm			fname = strchr(nvpair_name(elem), '@') + 1;
6369259813Sdelphij			VERIFY0(zfeature_lookup_name(fname, &fid));
6370236884Smm
6371259813Sdelphij			spa_feature_enable(spa, fid, tx);
6372248571Smm			spa_history_log_internal(spa, "set", tx,
6373248571Smm			    "%s=enabled", nvpair_name(elem));
6374236884Smm			break;
6375236884Smm
6376185029Spjd		case ZPOOL_PROP_VERSION:
6377258717Savg			intval = fnvpair_value_uint64(elem);
6378185029Spjd			/*
6379236884Smm			 * The version is synced separately before other
6380236884Smm			 * properties and should be correct by now.
6381185029Spjd			 */
6382236884Smm			ASSERT3U(spa_version(spa), >=, intval);
6383185029Spjd			break;
6384168404Spjd
6385185029Spjd		case ZPOOL_PROP_ALTROOT:
6386185029Spjd			/*
6387185029Spjd			 * 'altroot' is a non-persistent property. It should
6388185029Spjd			 * have been set temporarily at creation or import time.
6389185029Spjd			 */
6390185029Spjd			ASSERT(spa->spa_root != NULL);
6391185029Spjd			break;
6392168404Spjd
6393219089Spjd		case ZPOOL_PROP_READONLY:
6394185029Spjd		case ZPOOL_PROP_CACHEFILE:
6395185029Spjd			/*
6396219089Spjd			 * 'readonly' and 'cachefile' are also non-persistent
6397219089Spjd			 * properties.
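			 * They only affect how the pool is opened or where the
			 * configuration cache file is written, so nothing is
			 * stored for them in the MOS pool properties object.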
6398185029Spjd			 */
6399168404Spjd			break;
6400228103Smm		case ZPOOL_PROP_COMMENT:
6401258717Savg			strval = fnvpair_value_string(elem);
6402228103Smm			if (spa->spa_comment != NULL)
6403228103Smm				spa_strfree(spa->spa_comment);
6404228103Smm			spa->spa_comment = spa_strdup(strval);
6405228103Smm			/*
6406228103Smm			 * We need to dirty the configuration on all the vdevs
6407228103Smm			 * so that their labels get updated. It's unnecessary
6408228103Smm			 * to do this for pool creation since the vdev's
6409228103Smm			 * configuration has already been dirtied.
6410228103Smm			 */
6411228103Smm			if (tx->tx_txg != TXG_INITIAL)
6412228103Smm				vdev_config_dirty(spa->spa_root_vdev);
6413248571Smm			spa_history_log_internal(spa, "set", tx,
6414248571Smm			    "%s=%s", nvpair_name(elem), strval);
6415228103Smm			break;
6416185029Spjd		default:
6417185029Spjd			/*
6418185029Spjd			 * Set pool property values in the poolprops mos object.
6419185029Spjd			 */
6420185029Spjd			if (spa->spa_pool_props_object == 0) {
6421236884Smm				spa->spa_pool_props_object =
6422236884Smm				    zap_create_link(mos, DMU_OT_POOL_PROPS,
6423185029Spjd				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
6424236884Smm				    tx);
6425185029Spjd			}
6426185029Spjd
6427185029Spjd			/* normalize the property name */
6428185029Spjd			propname = zpool_prop_to_name(prop);
6429185029Spjd			proptype = zpool_prop_get_type(prop);
6430185029Spjd
6431185029Spjd			if (nvpair_type(elem) == DATA_TYPE_STRING) {
6432185029Spjd				ASSERT(proptype == PROP_TYPE_STRING);
6433258717Savg				strval = fnvpair_value_string(elem);
6434258717Savg				VERIFY0(zap_update(mos,
6435185029Spjd				    spa->spa_pool_props_object, propname,
6436258717Savg				    1, strlen(strval) + 1, strval, tx));
6437248571Smm				spa_history_log_internal(spa, "set", tx,
6438248571Smm				    "%s=%s", nvpair_name(elem), strval);
6439185029Spjd			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
6440258717Savg				intval = fnvpair_value_uint64(elem);
6441185029Spjd
6442185029Spjd				if (proptype == PROP_TYPE_INDEX) {
6443185029Spjd					const char *unused;
6444258717Savg					VERIFY0(zpool_prop_index_to_string(
6445258717Savg					    prop, intval, &unused));
6446185029Spjd				}
6447258717Savg				VERIFY0(zap_update(mos,
6448185029Spjd				    spa->spa_pool_props_object, propname,
6449258717Savg				    8, 1, &intval, tx));
6450248571Smm				spa_history_log_internal(spa, "set", tx,
6451248571Smm				    "%s=%lld", nvpair_name(elem), intval);
6452185029Spjd			} else {
6453185029Spjd				ASSERT(0); /* not allowed */
6454185029Spjd			}
6455185029Spjd
6456185029Spjd			switch (prop) {
6457185029Spjd			case ZPOOL_PROP_DELEGATION:
6458185029Spjd				spa->spa_delegation = intval;
6459185029Spjd				break;
6460185029Spjd			case ZPOOL_PROP_BOOTFS:
6461185029Spjd				spa->spa_bootfs = intval;
6462185029Spjd				break;
6463185029Spjd			case ZPOOL_PROP_FAILUREMODE:
6464185029Spjd				spa->spa_failmode = intval;
6465185029Spjd				break;
6466219089Spjd			case ZPOOL_PROP_AUTOEXPAND:
6467219089Spjd				spa->spa_autoexpand = intval;
6468219089Spjd				if (tx->tx_txg != TXG_INITIAL)
6469219089Spjd					spa_async_request(spa,
6470219089Spjd					    SPA_ASYNC_AUTOEXPAND);
6471219089Spjd				break;
6472219089Spjd			case ZPOOL_PROP_DEDUPDITTO:
6473219089Spjd				spa->spa_dedup_ditto = intval;
6474219089Spjd				break;
6475185029Spjd			default:
6476185029Spjd				break;
6477185029Spjd			}
6478168404Spjd		}
6479185029Spjd
6480168404Spjd	}
6481185029Spjd
6482185029Spjd	mutex_exit(&spa->spa_props_lock);
6483168404Spjd}
6484168404Spjd
6485168404Spjd/*
6486219089Spjd * Perform one-time upgrade on-disk changes.
 spa_version() does not
6487219089Spjd * reflect the new version this txg, so there must be no changes this
6488219089Spjd * txg to anything that the upgrade code depends on after it executes.
6489219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync
6490219089Spjd * tasks.
6491219089Spjd */
6492219089Spjdstatic void
6493219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6494219089Spjd{
6495219089Spjd	dsl_pool_t *dp = spa->spa_dsl_pool;
6496219089Spjd
6497219089Spjd	ASSERT(spa->spa_sync_pass == 1);
6498219089Spjd
6499248571Smm	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6500248571Smm
6501219089Spjd	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6502219089Spjd	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6503219089Spjd		dsl_pool_create_origin(dp, tx);
6504219089Spjd
6505219089Spjd		/* Keeping the origin open increases spa_minref */
6506219089Spjd		spa->spa_minref += 3;
6507219089Spjd	}
6508219089Spjd
6509219089Spjd	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6510219089Spjd	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6511219089Spjd		dsl_pool_upgrade_clones(dp, tx);
6512219089Spjd	}
6513219089Spjd
6514219089Spjd	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6515219089Spjd	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6516219089Spjd		dsl_pool_upgrade_dir_clones(dp, tx);
6517219089Spjd
6518219089Spjd		/* Keeping the freedir open increases spa_minref */
6519219089Spjd		spa->spa_minref += 3;
6520219089Spjd	}
6521236884Smm
6522236884Smm	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6523236884Smm	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6524236884Smm		spa_feature_create_zap_objects(spa, tx);
6525236884Smm	}
6526268126Sdelphij
6527268126Sdelphij	/*
6528268126Sdelphij	 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
6529268126Sdelphij	 * when the ability to use lz4 compression for metadata was added.
6530268126Sdelphij	 * Old pools that have this feature enabled must be upgraded to have
6531268126Sdelphij	 * this feature active.
6532268126Sdelphij	 */
6533268126Sdelphij	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6534268126Sdelphij		boolean_t lz4_en = spa_feature_is_enabled(spa,
6535268126Sdelphij		    SPA_FEATURE_LZ4_COMPRESS);
6536268126Sdelphij		boolean_t lz4_ac = spa_feature_is_active(spa,
6537268126Sdelphij		    SPA_FEATURE_LZ4_COMPRESS);
6538268126Sdelphij
6539268126Sdelphij		if (lz4_en && !lz4_ac)
6540268126Sdelphij			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
6541268126Sdelphij	}
6542248571Smm	rrw_exit(&dp->dp_config_rwlock, FTAG);
6543219089Spjd}
6544219089Spjd
6545219089Spjd/*
6546168404Spjd * Sync the specified transaction group. New blocks may be dirtied as
6547168404Spjd * part of the process, so we iterate until it converges.
6548168404Spjd */
6549168404Spjdvoid
6550168404Spjdspa_sync(spa_t *spa, uint64_t txg)
6551168404Spjd{
6552168404Spjd	dsl_pool_t *dp = spa->spa_dsl_pool;
6553168404Spjd	objset_t *mos = spa->spa_meta_objset;
6554219089Spjd	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6555168404Spjd	vdev_t *rvd = spa->spa_root_vdev;
6556168404Spjd	vdev_t *vd;
6557168404Spjd	dmu_tx_t *tx;
6558185029Spjd	int error;
6559168404Spjd
6560219089Spjd	VERIFY(spa_writeable(spa));
6561219089Spjd
6562168404Spjd	/*
6563168404Spjd	 * Lock out configuration changes.
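	 * SCL_CONFIG is taken as reader here and held across the entire
	 * sync, so the vdev configuration cannot change underneath us
	 * until it is dropped near the end of this function.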
6564168404Spjd */ 6565185029Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6566168404Spjd 6567168404Spjd spa->spa_syncing_txg = txg; 6568168404Spjd spa->spa_sync_pass = 0; 6569168404Spjd 6570185029Spjd /* 6571185029Spjd * If there are any pending vdev state changes, convert them 6572185029Spjd * into config changes that go out with this transaction group. 6573185029Spjd */ 6574185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6575209962Smm while (list_head(&spa->spa_state_dirty_list) != NULL) { 6576209962Smm /* 6577209962Smm * We need the write lock here because, for aux vdevs, 6578209962Smm * calling vdev_config_dirty() modifies sav_config. 6579209962Smm * This is ugly and will become unnecessary when we 6580209962Smm * eliminate the aux vdev wart by integrating all vdevs 6581209962Smm * into the root vdev tree. 6582209962Smm */ 6583209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6584209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6585209962Smm while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6586209962Smm vdev_state_clean(vd); 6587209962Smm vdev_config_dirty(vd); 6588209962Smm } 6589209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6590209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6591185029Spjd } 6592185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6593185029Spjd 6594168404Spjd tx = dmu_tx_create_assigned(dp, txg); 6595168404Spjd 6596247265Smm spa->spa_sync_starttime = gethrtime(); 6597247265Smm#ifdef illumos 6598247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6599247265Smm spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6600277300Ssmh#else /* !illumos */ 6601247265Smm#ifdef _KERNEL 6602247265Smm callout_reset(&spa->spa_deadman_cycid, 6603247265Smm hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa); 6604247265Smm#endif 6605277300Ssmh#endif /* illumos */ 6606247265Smm 6607168404Spjd /* 6608185029Spjd * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6609168404Spjd * set spa_deflate if we have no raid-z vdevs. 6610168404Spjd */ 6611185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6612185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6613168404Spjd int i; 6614168404Spjd 6615168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 6616168404Spjd vd = rvd->vdev_child[i]; 6617168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6618168404Spjd break; 6619168404Spjd } 6620168404Spjd if (i == rvd->vdev_children) { 6621168404Spjd spa->spa_deflate = TRUE; 6622168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 6623168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6624168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6625168404Spjd } 6626168404Spjd } 6627168404Spjd 6628168404Spjd /* 6629168404Spjd * Iterate to convergence. 
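	 * Each pass syncs the dirty datasets, frees and vdevs of this txg;
	 * the loop ends once a pass completes without re-dirtying the MOS.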
6630168404Spjd */ 6631168404Spjd do { 6632219089Spjd int pass = ++spa->spa_sync_pass; 6633168404Spjd 6634168404Spjd spa_sync_config_object(spa, tx); 6635185029Spjd spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6636185029Spjd ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6637185029Spjd spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6638185029Spjd ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6639168404Spjd spa_errlog_sync(spa, txg); 6640168404Spjd dsl_pool_sync(dp, txg); 6641168404Spjd 6642243503Smm if (pass < zfs_sync_pass_deferred_free) { 6643258632Savg spa_sync_frees(spa, free_bpl, tx); 6644219089Spjd } else { 6645275781Sdelphij /* 6646275781Sdelphij * We can not defer frees in pass 1, because 6647275781Sdelphij * we sync the deferred frees later in pass 1. 6648275781Sdelphij */ 6649275781Sdelphij ASSERT3U(pass, >, 1); 6650219089Spjd bplist_iterate(free_bpl, bpobj_enqueue_cb, 6651258632Savg &spa->spa_deferred_bpobj, tx); 6652168404Spjd } 6653168404Spjd 6654219089Spjd ddt_sync(spa, txg); 6655219089Spjd dsl_scan_sync(dp, tx); 6656168404Spjd 6657219089Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6658219089Spjd vdev_sync(vd, txg); 6659168404Spjd 6660275781Sdelphij if (pass == 1) { 6661219089Spjd spa_sync_upgrades(spa, tx); 6662275781Sdelphij ASSERT3U(txg, >=, 6663275781Sdelphij spa->spa_uberblock.ub_rootbp.blk_birth); 6664275781Sdelphij /* 6665275781Sdelphij * Note: We need to check if the MOS is dirty 6666275781Sdelphij * because we could have marked the MOS dirty 6667275781Sdelphij * without updating the uberblock (e.g. if we 6668275781Sdelphij * have sync tasks but no dirty user data). We 6669275781Sdelphij * need to check the uberblock's rootbp because 6670275781Sdelphij * it is updated if we have synced out dirty 6671275781Sdelphij * data (though in this case the MOS will most 6672275781Sdelphij * likely also be dirty due to second order 6673275781Sdelphij * effects, we don't want to rely on that here). 6674275781Sdelphij */ 6675275781Sdelphij if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 6676275781Sdelphij !dmu_objset_is_dirty(mos, txg)) { 6677275781Sdelphij /* 6678275781Sdelphij * Nothing changed on the first pass, 6679275781Sdelphij * therefore this TXG is a no-op. Avoid 6680275781Sdelphij * syncing deferred frees, so that we 6681275781Sdelphij * can keep this TXG as a no-op. 6682275781Sdelphij */ 6683275781Sdelphij ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 6684275781Sdelphij txg)); 6685275781Sdelphij ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6686275781Sdelphij ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 6687275781Sdelphij break; 6688275781Sdelphij } 6689275781Sdelphij spa_sync_deferred_frees(spa, tx); 6690275781Sdelphij } 6691168404Spjd 6692219089Spjd } while (dmu_objset_is_dirty(mos, txg)); 6693219089Spjd 6694168404Spjd /* 6695168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 6696168404Spjd * to commit the transaction group. 6697168404Spjd * 6698185029Spjd * If there are no dirty vdevs, we sync the uberblock to a few 6699185029Spjd * random top-level vdevs that are known to be visible in the 6700185029Spjd * config cache (see spa_vdev_add() for a complete description). 6701185029Spjd * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6702168404Spjd */ 6703185029Spjd for (;;) { 6704185029Spjd /* 6705185029Spjd * We hold SCL_STATE to prevent vdev open/close/etc. 6706185029Spjd * while we're attempting to write the vdev labels. 
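		 * If writing the labels fails even on the second attempt,
		 * the pool is suspended and we wait for the I/O to resume
		 * before retrying the whole update.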
6707185029Spjd */ 6708185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6709168404Spjd 6710185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) { 6711185029Spjd vdev_t *svd[SPA_DVAS_PER_BP]; 6712185029Spjd int svdcount = 0; 6713185029Spjd int children = rvd->vdev_children; 6714185029Spjd int c0 = spa_get_random(children); 6715185029Spjd 6716219089Spjd for (int c = 0; c < children; c++) { 6717185029Spjd vd = rvd->vdev_child[(c0 + c) % children]; 6718185029Spjd if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6719185029Spjd continue; 6720185029Spjd svd[svdcount++] = vd; 6721185029Spjd if (svdcount == SPA_DVAS_PER_BP) 6722185029Spjd break; 6723185029Spjd } 6724213198Smm error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 6725213198Smm if (error != 0) 6726213198Smm error = vdev_config_sync(svd, svdcount, txg, 6727213198Smm B_TRUE); 6728185029Spjd } else { 6729185029Spjd error = vdev_config_sync(rvd->vdev_child, 6730213198Smm rvd->vdev_children, txg, B_FALSE); 6731213198Smm if (error != 0) 6732213198Smm error = vdev_config_sync(rvd->vdev_child, 6733213198Smm rvd->vdev_children, txg, B_TRUE); 6734168404Spjd } 6735185029Spjd 6736239620Smm if (error == 0) 6737239620Smm spa->spa_last_synced_guid = rvd->vdev_guid; 6738239620Smm 6739185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6740185029Spjd 6741185029Spjd if (error == 0) 6742185029Spjd break; 6743185029Spjd zio_suspend(spa, NULL); 6744185029Spjd zio_resume_wait(spa); 6745168404Spjd } 6746168404Spjd dmu_tx_commit(tx); 6747168404Spjd 6748247265Smm#ifdef illumos 6749247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6750277300Ssmh#else /* !illumos */ 6751247265Smm#ifdef _KERNEL 6752247265Smm callout_drain(&spa->spa_deadman_cycid); 6753247265Smm#endif 6754277300Ssmh#endif /* illumos */ 6755247265Smm 6756168404Spjd /* 6757168404Spjd * Clear the dirty config list. 6758168404Spjd */ 6759185029Spjd while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6760168404Spjd vdev_config_clean(vd); 6761168404Spjd 6762168404Spjd /* 6763168404Spjd * Now that the new config has synced transactionally, 6764168404Spjd * let it become visible to the config cache. 6765168404Spjd */ 6766168404Spjd if (spa->spa_config_syncing != NULL) { 6767168404Spjd spa_config_set(spa, spa->spa_config_syncing); 6768168404Spjd spa->spa_config_txg = txg; 6769168404Spjd spa->spa_config_syncing = NULL; 6770168404Spjd } 6771168404Spjd 6772168404Spjd spa->spa_ubsync = spa->spa_uberblock; 6773168404Spjd 6774219089Spjd dsl_pool_sync_done(dp, txg); 6775168404Spjd 6776168404Spjd /* 6777168404Spjd * Update usable space statistics. 6778168404Spjd */ 6779168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6780168404Spjd vdev_sync_done(vd, txg); 6781168404Spjd 6782219089Spjd spa_update_dspace(spa); 6783219089Spjd 6784168404Spjd /* 6785168404Spjd * It had better be the case that we didn't dirty anything 6786168404Spjd * since vdev_config_sync(). 6787168404Spjd */ 6788168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6789168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6790168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6791168404Spjd 6792219089Spjd spa->spa_sync_pass = 0; 6793219089Spjd 6794185029Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 6795168404Spjd 6796219089Spjd spa_handle_ignored_writes(spa); 6797219089Spjd 6798168404Spjd /* 6799168404Spjd * If any async tasks have been requested, kick them off. 
6800168404Spjd */ 6801168404Spjd spa_async_dispatch(spa); 6802253990Smav spa_async_dispatch_vd(spa); 6803168404Spjd} 6804168404Spjd 6805168404Spjd/* 6806168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 6807168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 6808168404Spjd * sync. 6809168404Spjd */ 6810168404Spjdvoid 6811168404Spjdspa_sync_allpools(void) 6812168404Spjd{ 6813168404Spjd spa_t *spa = NULL; 6814168404Spjd mutex_enter(&spa_namespace_lock); 6815168404Spjd while ((spa = spa_next(spa)) != NULL) { 6816219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 6817219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 6818168404Spjd continue; 6819168404Spjd spa_open_ref(spa, FTAG); 6820168404Spjd mutex_exit(&spa_namespace_lock); 6821168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6822168404Spjd mutex_enter(&spa_namespace_lock); 6823168404Spjd spa_close(spa, FTAG); 6824168404Spjd } 6825168404Spjd mutex_exit(&spa_namespace_lock); 6826168404Spjd} 6827168404Spjd 6828168404Spjd/* 6829168404Spjd * ========================================================================== 6830168404Spjd * Miscellaneous routines 6831168404Spjd * ========================================================================== 6832168404Spjd */ 6833168404Spjd 6834168404Spjd/* 6835168404Spjd * Remove all pools in the system. 6836168404Spjd */ 6837168404Spjdvoid 6838168404Spjdspa_evict_all(void) 6839168404Spjd{ 6840168404Spjd spa_t *spa; 6841168404Spjd 6842168404Spjd /* 6843168404Spjd * Remove all cached state. All pools should be closed now, 6844168404Spjd * so every spa in the AVL tree should be unreferenced. 6845168404Spjd */ 6846168404Spjd mutex_enter(&spa_namespace_lock); 6847168404Spjd while ((spa = spa_next(NULL)) != NULL) { 6848168404Spjd /* 6849168404Spjd * Stop async tasks. The async thread may need to detach 6850168404Spjd * a device that's been replaced, which requires grabbing 6851168404Spjd * spa_namespace_lock, so we must drop it here. 
6852168404Spjd */ 6853168404Spjd spa_open_ref(spa, FTAG); 6854168404Spjd mutex_exit(&spa_namespace_lock); 6855168404Spjd spa_async_suspend(spa); 6856168404Spjd mutex_enter(&spa_namespace_lock); 6857168404Spjd spa_close(spa, FTAG); 6858168404Spjd 6859168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6860168404Spjd spa_unload(spa); 6861168404Spjd spa_deactivate(spa); 6862168404Spjd } 6863168404Spjd spa_remove(spa); 6864168404Spjd } 6865168404Spjd mutex_exit(&spa_namespace_lock); 6866168404Spjd} 6867168404Spjd 6868168404Spjdvdev_t * 6869209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6870168404Spjd{ 6871185029Spjd vdev_t *vd; 6872185029Spjd int i; 6873185029Spjd 6874185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6875185029Spjd return (vd); 6876185029Spjd 6877209962Smm if (aux) { 6878185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6879185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 6880185029Spjd if (vd->vdev_guid == guid) 6881185029Spjd return (vd); 6882185029Spjd } 6883209962Smm 6884209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 6885209962Smm vd = spa->spa_spares.sav_vdevs[i]; 6886209962Smm if (vd->vdev_guid == guid) 6887209962Smm return (vd); 6888209962Smm } 6889185029Spjd } 6890185029Spjd 6891185029Spjd return (NULL); 6892168404Spjd} 6893168404Spjd 6894168404Spjdvoid 6895185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 6896168404Spjd{ 6897219089Spjd ASSERT(spa_writeable(spa)); 6898219089Spjd 6899185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6900168404Spjd 6901168404Spjd /* 6902168404Spjd * This should only be called for a non-faulted pool, and since a 6903168404Spjd * future version would result in an unopenable pool, this shouldn't be 6904168404Spjd * possible. 6905168404Spjd */ 6906247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 6907268075Sdelphij ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 6908168404Spjd 6909185029Spjd spa->spa_uberblock.ub_version = version; 6910168404Spjd vdev_config_dirty(spa->spa_root_vdev); 6911168404Spjd 6912185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 6913168404Spjd 6914168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6915168404Spjd} 6916168404Spjd 6917168404Spjdboolean_t 6918168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 6919168404Spjd{ 6920168404Spjd int i; 6921168404Spjd uint64_t spareguid; 6922185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6923168404Spjd 6924185029Spjd for (i = 0; i < sav->sav_count; i++) 6925185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 6926168404Spjd return (B_TRUE); 6927168404Spjd 6928185029Spjd for (i = 0; i < sav->sav_npending; i++) { 6929185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6930185029Spjd &spareguid) == 0 && spareguid == guid) 6931168404Spjd return (B_TRUE); 6932168404Spjd } 6933168404Spjd 6934168404Spjd return (B_FALSE); 6935168404Spjd} 6936168404Spjd 6937185029Spjd/* 6938185029Spjd * Check if a pool has an active shared spare device. 
6939185029Spjd * Note: reference count of an active spare is 2, as a spare and as a replace 6940185029Spjd */ 6941185029Spjdstatic boolean_t 6942185029Spjdspa_has_active_shared_spare(spa_t *spa) 6943168404Spjd{ 6944185029Spjd int i, refcnt; 6945185029Spjd uint64_t pool; 6946185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6947185029Spjd 6948185029Spjd for (i = 0; i < sav->sav_count; i++) { 6949185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6950185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6951185029Spjd refcnt > 2) 6952185029Spjd return (B_TRUE); 6953185029Spjd } 6954185029Spjd 6955185029Spjd return (B_FALSE); 6956168404Spjd} 6957168404Spjd 6958185029Spjd/* 6959185029Spjd * Post a sysevent corresponding to the given event. The 'name' must be one of 6960185029Spjd * the event definitions in sys/sysevent/eventdefs.h. The payload will be 6961185029Spjd * filled in from the spa and (optionally) the vdev. This doesn't do anything 6962185029Spjd * in the userland libzpool, as we don't want consumers to misinterpret ztest 6963185029Spjd * or zdb as real changes. 6964185029Spjd */ 6965185029Spjdvoid 6966185029Spjdspa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 6967168404Spjd{ 6968185029Spjd#ifdef _KERNEL 6969185029Spjd sysevent_t *ev; 6970185029Spjd sysevent_attr_list_t *attr = NULL; 6971185029Spjd sysevent_value_t value; 6972185029Spjd sysevent_id_t eid; 6973168404Spjd 6974185029Spjd ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 6975185029Spjd SE_SLEEP); 6976168404Spjd 6977185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6978185029Spjd value.value.sv_string = spa_name(spa); 6979185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 6980185029Spjd goto done; 6981168404Spjd 6982185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6983185029Spjd value.value.sv_uint64 = spa_guid(spa); 6984185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 6985185029Spjd goto done; 6986168404Spjd 6987185029Spjd if (vd) { 6988185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6989185029Spjd value.value.sv_uint64 = vd->vdev_guid; 6990185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 6991185029Spjd SE_SLEEP) != 0) 6992185029Spjd goto done; 6993168404Spjd 6994185029Spjd if (vd->vdev_path) { 6995185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6996185029Spjd value.value.sv_string = vd->vdev_path; 6997185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6998185029Spjd &value, SE_SLEEP) != 0) 6999185029Spjd goto done; 7000168404Spjd } 7001168404Spjd } 7002168404Spjd 7003185029Spjd if (sysevent_attach_attributes(ev, attr) != 0) 7004185029Spjd goto done; 7005185029Spjd attr = NULL; 7006168404Spjd 7007185029Spjd (void) log_sysevent(ev, SE_SLEEP, &eid); 7008185029Spjd 7009185029Spjddone: 7010185029Spjd if (attr) 7011185029Spjd sysevent_free_attr(attr); 7012185029Spjd sysevent_free(ev); 7013185029Spjd#endif 7014168404Spjd} 7015