spa.c revision 213198
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23209962Smm * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24168404Spjd * Use is subject to license terms. 25168404Spjd */ 26168404Spjd 27168404Spjd/* 28168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 29168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 30168404Spjd * pool. 31168404Spjd */ 32168404Spjd 33168404Spjd#include <sys/zfs_context.h> 34168404Spjd#include <sys/fm/fs/zfs.h> 35168404Spjd#include <sys/spa_impl.h> 36168404Spjd#include <sys/zio.h> 37168404Spjd#include <sys/zio_checksum.h> 38168404Spjd#include <sys/zio_compress.h> 39168404Spjd#include <sys/dmu.h> 40168404Spjd#include <sys/dmu_tx.h> 41168404Spjd#include <sys/zap.h> 42168404Spjd#include <sys/zil.h> 43168404Spjd#include <sys/vdev_impl.h> 44168404Spjd#include <sys/metaslab.h> 45168404Spjd#include <sys/uberblock_impl.h> 46168404Spjd#include <sys/txg.h> 47168404Spjd#include <sys/avl.h> 48168404Spjd#include <sys/dmu_traverse.h> 49168404Spjd#include <sys/dmu_objset.h> 50168404Spjd#include <sys/unique.h> 51168404Spjd#include <sys/dsl_pool.h> 52168404Spjd#include <sys/dsl_dataset.h> 53168404Spjd#include <sys/dsl_dir.h> 54168404Spjd#include <sys/dsl_prop.h> 55168404Spjd#include <sys/dsl_synctask.h> 56168404Spjd#include <sys/fs/zfs.h> 57185029Spjd#include <sys/arc.h> 58168404Spjd#include <sys/callb.h> 59168962Spjd#include <sys/sunddi.h> 60185029Spjd#include <sys/spa_boot.h> 61168404Spjd 62185029Spjd#include "zfs_prop.h" 63185029Spjd#include "zfs_comutil.h" 64168404Spjd 65204073Spjd/* Check hostid on import? */ 66204073Spjdstatic int check_hostid = 1; 67204073Spjd 68204073SpjdSYSCTL_DECL(_vfs_zfs); 69204073SpjdTUNABLE_INT("vfs.zfs.check_hostid", &check_hostid); 70204073SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0, 71204073Spjd "Check hostid on import?"); 72204073Spjd 73209962Smmenum zti_modes { 74209962Smm zti_mode_fixed, /* value is # of threads (min 1) */ 75209962Smm zti_mode_online_percent, /* value is % of online CPUs */ 76209962Smm zti_mode_tune, /* fill from zio_taskq_tune_* */ 77211931Smm zti_mode_null, /* don't create a taskq */ 78209962Smm zti_nmodes 79185029Spjd}; 80168712Spjd 81211931Smm#define ZTI_FIX(n) { zti_mode_fixed, (n) } 82211931Smm#define ZTI_PCT(n) { zti_mode_online_percent, (n) } 83211931Smm#define ZTI_TUNE { zti_mode_tune, 0 } 84211931Smm#define ZTI_NULL { zti_mode_null, 0 } 85209962Smm 86211931Smm#define ZTI_ONE ZTI_FIX(1) 87209962Smm 88209962Smmtypedef struct zio_taskq_info { 89211931Smm enum zti_modes zti_mode; 90211931Smm uint_t zti_value; 91209962Smm} zio_taskq_info_t; 92209962Smm 93209962Smmstatic const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 94211931Smm "issue", "issue_high", "intr", "intr_high" 95209962Smm}; 96209962Smm 97211931Smm/* 98211931Smm * Define the taskq threads for the following I/O types: 99211931Smm * NULL, READ, WRITE, FREE, CLAIM, and IOCTL 100211931Smm */ 101211931Smmconst zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 102211931Smm /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 103211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 104211931Smm { ZTI_FIX(8), ZTI_NULL, ZTI_TUNE, ZTI_NULL }, 105211931Smm { ZTI_TUNE, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, 106211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 107211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 108211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 109209962Smm}; 110209962Smm 111209962Smmenum zti_modes zio_taskq_tune_mode = zti_mode_online_percent; 112209962Smmuint_t zio_taskq_tune_value = 80; /* #threads = 80% of # online CPUs */ 113209962Smm 114185029Spjdstatic void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); 115185029Spjdstatic boolean_t spa_has_active_shared_spare(spa_t *spa); 116185029Spjd 117168404Spjd/* 118168404Spjd * ========================================================================== 119185029Spjd * SPA properties routines 120185029Spjd * ========================================================================== 121185029Spjd */ 122185029Spjd 123185029Spjd/* 124185029Spjd * Add a (source=src, propname=propval) list to an nvlist. 125185029Spjd */ 126185029Spjdstatic void 127185029Spjdspa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 128185029Spjd uint64_t intval, zprop_source_t src) 129185029Spjd{ 130185029Spjd const char *propname = zpool_prop_to_name(prop); 131185029Spjd nvlist_t *propval; 132185029Spjd 133185029Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 134185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 135185029Spjd 136185029Spjd if (strval != NULL) 137185029Spjd VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 138185029Spjd else 139185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 140185029Spjd 141185029Spjd VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 142185029Spjd nvlist_free(propval); 143185029Spjd} 144185029Spjd 145185029Spjd/* 146185029Spjd * Get property values from the spa configuration. 147185029Spjd */ 148185029Spjdstatic void 149185029Spjdspa_prop_get_config(spa_t *spa, nvlist_t **nvp) 150185029Spjd{ 151209962Smm uint64_t size; 152209962Smm uint64_t used; 153185029Spjd uint64_t cap, version; 154185029Spjd zprop_source_t src = ZPROP_SRC_NONE; 155185029Spjd spa_config_dirent_t *dp; 156185029Spjd 157185029Spjd ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 158185029Spjd 159209962Smm if (spa->spa_root_vdev != NULL) { 160209962Smm size = spa_get_space(spa); 161209962Smm used = spa_get_alloc(spa); 162209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 163209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 164209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); 165209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, 166209962Smm size - used, src); 167185029Spjd 168209962Smm cap = (size == 0) ? 0 : (used * 100 / size); 169209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 170185029Spjd 171209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 172209962Smm spa->spa_root_vdev->vdev_state, src); 173209962Smm 174209962Smm version = spa_version(spa); 175209962Smm if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 176209962Smm src = ZPROP_SRC_DEFAULT; 177209962Smm else 178209962Smm src = ZPROP_SRC_LOCAL; 179209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 180209962Smm } 181209962Smm 182185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 183185029Spjd 184185029Spjd if (spa->spa_root != NULL) 185185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 186185029Spjd 0, ZPROP_SRC_LOCAL); 187185029Spjd 188185029Spjd if ((dp = list_head(&spa->spa_config_list)) != NULL) { 189185029Spjd if (dp->scd_path == NULL) { 190185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 191185029Spjd "none", 0, ZPROP_SRC_LOCAL); 192185029Spjd } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 193185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 194185029Spjd dp->scd_path, 0, ZPROP_SRC_LOCAL); 195185029Spjd } 196185029Spjd } 197185029Spjd} 198185029Spjd 199185029Spjd/* 200185029Spjd * Get zpool property values. 201185029Spjd */ 202185029Spjdint 203185029Spjdspa_prop_get(spa_t *spa, nvlist_t **nvp) 204185029Spjd{ 205185029Spjd zap_cursor_t zc; 206185029Spjd zap_attribute_t za; 207185029Spjd objset_t *mos = spa->spa_meta_objset; 208185029Spjd int err; 209185029Spjd 210185029Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 211185029Spjd 212185029Spjd mutex_enter(&spa->spa_props_lock); 213185029Spjd 214185029Spjd /* 215185029Spjd * Get properties from the spa config. 216185029Spjd */ 217185029Spjd spa_prop_get_config(spa, nvp); 218185029Spjd 219185029Spjd /* If no pool property object, no more prop to get. */ 220185029Spjd if (spa->spa_pool_props_object == 0) { 221185029Spjd mutex_exit(&spa->spa_props_lock); 222185029Spjd return (0); 223185029Spjd } 224185029Spjd 225185029Spjd /* 226185029Spjd * Get properties from the MOS pool property object. 227185029Spjd */ 228185029Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 229185029Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 230185029Spjd zap_cursor_advance(&zc)) { 231185029Spjd uint64_t intval = 0; 232185029Spjd char *strval = NULL; 233185029Spjd zprop_source_t src = ZPROP_SRC_DEFAULT; 234185029Spjd zpool_prop_t prop; 235185029Spjd 236185029Spjd if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 237185029Spjd continue; 238185029Spjd 239185029Spjd switch (za.za_integer_length) { 240185029Spjd case 8: 241185029Spjd /* integer property */ 242185029Spjd if (za.za_first_integer != 243185029Spjd zpool_prop_default_numeric(prop)) 244185029Spjd src = ZPROP_SRC_LOCAL; 245185029Spjd 246185029Spjd if (prop == ZPOOL_PROP_BOOTFS) { 247185029Spjd dsl_pool_t *dp; 248185029Spjd dsl_dataset_t *ds = NULL; 249185029Spjd 250185029Spjd dp = spa_get_dsl(spa); 251185029Spjd rw_enter(&dp->dp_config_rwlock, RW_READER); 252185029Spjd if (err = dsl_dataset_hold_obj(dp, 253185029Spjd za.za_first_integer, FTAG, &ds)) { 254185029Spjd rw_exit(&dp->dp_config_rwlock); 255185029Spjd break; 256185029Spjd } 257185029Spjd 258185029Spjd strval = kmem_alloc( 259185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 260185029Spjd KM_SLEEP); 261185029Spjd dsl_dataset_name(ds, strval); 262185029Spjd dsl_dataset_rele(ds, FTAG); 263185029Spjd rw_exit(&dp->dp_config_rwlock); 264185029Spjd } else { 265185029Spjd strval = NULL; 266185029Spjd intval = za.za_first_integer; 267185029Spjd } 268185029Spjd 269185029Spjd spa_prop_add_list(*nvp, prop, strval, intval, src); 270185029Spjd 271185029Spjd if (strval != NULL) 272185029Spjd kmem_free(strval, 273185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 274185029Spjd 275185029Spjd break; 276185029Spjd 277185029Spjd case 1: 278185029Spjd /* string property */ 279185029Spjd strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 280185029Spjd err = zap_lookup(mos, spa->spa_pool_props_object, 281185029Spjd za.za_name, 1, za.za_num_integers, strval); 282185029Spjd if (err) { 283185029Spjd kmem_free(strval, za.za_num_integers); 284185029Spjd break; 285185029Spjd } 286185029Spjd spa_prop_add_list(*nvp, prop, strval, 0, src); 287185029Spjd kmem_free(strval, za.za_num_integers); 288185029Spjd break; 289185029Spjd 290185029Spjd default: 291185029Spjd break; 292185029Spjd } 293185029Spjd } 294185029Spjd zap_cursor_fini(&zc); 295185029Spjd mutex_exit(&spa->spa_props_lock); 296185029Spjdout: 297185029Spjd if (err && err != ENOENT) { 298185029Spjd nvlist_free(*nvp); 299185029Spjd *nvp = NULL; 300185029Spjd return (err); 301185029Spjd } 302185029Spjd 303185029Spjd return (0); 304185029Spjd} 305185029Spjd 306185029Spjd/* 307185029Spjd * Validate the given pool properties nvlist and modify the list 308185029Spjd * for the property values to be set. 309185029Spjd */ 310185029Spjdstatic int 311185029Spjdspa_prop_validate(spa_t *spa, nvlist_t *props) 312185029Spjd{ 313185029Spjd nvpair_t *elem; 314185029Spjd int error = 0, reset_bootfs = 0; 315185029Spjd uint64_t objnum; 316185029Spjd 317185029Spjd elem = NULL; 318185029Spjd while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 319185029Spjd zpool_prop_t prop; 320185029Spjd char *propname, *strval; 321185029Spjd uint64_t intval; 322185029Spjd objset_t *os; 323185029Spjd char *slash; 324185029Spjd 325185029Spjd propname = nvpair_name(elem); 326185029Spjd 327185029Spjd if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 328185029Spjd return (EINVAL); 329185029Spjd 330185029Spjd switch (prop) { 331185029Spjd case ZPOOL_PROP_VERSION: 332185029Spjd error = nvpair_value_uint64(elem, &intval); 333185029Spjd if (!error && 334185029Spjd (intval < spa_version(spa) || intval > SPA_VERSION)) 335185029Spjd error = EINVAL; 336185029Spjd break; 337185029Spjd 338185029Spjd case ZPOOL_PROP_DELEGATION: 339185029Spjd case ZPOOL_PROP_AUTOREPLACE: 340185029Spjd case ZPOOL_PROP_LISTSNAPS: 341185029Spjd error = nvpair_value_uint64(elem, &intval); 342185029Spjd if (!error && intval > 1) 343185029Spjd error = EINVAL; 344185029Spjd break; 345185029Spjd 346185029Spjd case ZPOOL_PROP_BOOTFS: 347209962Smm /* 348209962Smm * If the pool version is less than SPA_VERSION_BOOTFS, 349209962Smm * or the pool is still being created (version == 0), 350209962Smm * the bootfs property cannot be set. 351209962Smm */ 352185029Spjd if (spa_version(spa) < SPA_VERSION_BOOTFS) { 353185029Spjd error = ENOTSUP; 354185029Spjd break; 355185029Spjd } 356185029Spjd 357185029Spjd /* 358185029Spjd * Make sure the vdev config is bootable 359185029Spjd */ 360185029Spjd if (!vdev_is_bootable(spa->spa_root_vdev)) { 361185029Spjd error = ENOTSUP; 362185029Spjd break; 363185029Spjd } 364185029Spjd 365185029Spjd reset_bootfs = 1; 366185029Spjd 367185029Spjd error = nvpair_value_string(elem, &strval); 368185029Spjd 369185029Spjd if (!error) { 370185029Spjd uint64_t compress; 371185029Spjd 372185029Spjd if (strval == NULL || strval[0] == '\0') { 373185029Spjd objnum = zpool_prop_default_numeric( 374185029Spjd ZPOOL_PROP_BOOTFS); 375185029Spjd break; 376185029Spjd } 377185029Spjd 378185029Spjd if (error = dmu_objset_open(strval, DMU_OST_ZFS, 379185029Spjd DS_MODE_USER | DS_MODE_READONLY, &os)) 380185029Spjd break; 381185029Spjd 382185029Spjd /* We don't support gzip bootable datasets */ 383185029Spjd if ((error = dsl_prop_get_integer(strval, 384185029Spjd zfs_prop_to_name(ZFS_PROP_COMPRESSION), 385185029Spjd &compress, NULL)) == 0 && 386185029Spjd !BOOTFS_COMPRESS_VALID(compress)) { 387185029Spjd error = ENOTSUP; 388185029Spjd } else { 389185029Spjd objnum = dmu_objset_id(os); 390185029Spjd } 391185029Spjd dmu_objset_close(os); 392185029Spjd } 393185029Spjd break; 394185029Spjd 395185029Spjd case ZPOOL_PROP_FAILUREMODE: 396185029Spjd error = nvpair_value_uint64(elem, &intval); 397185029Spjd if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 398185029Spjd intval > ZIO_FAILURE_MODE_PANIC)) 399185029Spjd error = EINVAL; 400185029Spjd 401185029Spjd /* 402185029Spjd * This is a special case which only occurs when 403185029Spjd * the pool has completely failed. This allows 404185029Spjd * the user to change the in-core failmode property 405185029Spjd * without syncing it out to disk (I/Os might 406185029Spjd * currently be blocked). We do this by returning 407185029Spjd * EIO to the caller (spa_prop_set) to trick it 408185029Spjd * into thinking we encountered a property validation 409185029Spjd * error. 410185029Spjd */ 411185029Spjd if (!error && spa_suspended(spa)) { 412185029Spjd spa->spa_failmode = intval; 413185029Spjd error = EIO; 414185029Spjd } 415185029Spjd break; 416185029Spjd 417185029Spjd case ZPOOL_PROP_CACHEFILE: 418185029Spjd if ((error = nvpair_value_string(elem, &strval)) != 0) 419185029Spjd break; 420185029Spjd 421185029Spjd if (strval[0] == '\0') 422185029Spjd break; 423185029Spjd 424185029Spjd if (strcmp(strval, "none") == 0) 425185029Spjd break; 426185029Spjd 427185029Spjd if (strval[0] != '/') { 428185029Spjd error = EINVAL; 429185029Spjd break; 430185029Spjd } 431185029Spjd 432185029Spjd slash = strrchr(strval, '/'); 433185029Spjd ASSERT(slash != NULL); 434185029Spjd 435185029Spjd if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 436185029Spjd strcmp(slash, "/..") == 0) 437185029Spjd error = EINVAL; 438185029Spjd break; 439185029Spjd } 440185029Spjd 441185029Spjd if (error) 442185029Spjd break; 443185029Spjd } 444185029Spjd 445185029Spjd if (!error && reset_bootfs) { 446185029Spjd error = nvlist_remove(props, 447185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 448185029Spjd 449185029Spjd if (!error) { 450185029Spjd error = nvlist_add_uint64(props, 451185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 452185029Spjd } 453185029Spjd } 454185029Spjd 455185029Spjd return (error); 456185029Spjd} 457185029Spjd 458209962Smmvoid 459209962Smmspa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 460209962Smm{ 461209962Smm char *cachefile; 462209962Smm spa_config_dirent_t *dp; 463209962Smm 464209962Smm if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 465209962Smm &cachefile) != 0) 466209962Smm return; 467209962Smm 468209962Smm dp = kmem_alloc(sizeof (spa_config_dirent_t), 469209962Smm KM_SLEEP); 470209962Smm 471209962Smm if (cachefile[0] == '\0') 472209962Smm dp->scd_path = spa_strdup(spa_config_path); 473209962Smm else if (strcmp(cachefile, "none") == 0) 474209962Smm dp->scd_path = NULL; 475209962Smm else 476209962Smm dp->scd_path = spa_strdup(cachefile); 477209962Smm 478209962Smm list_insert_head(&spa->spa_config_list, dp); 479209962Smm if (need_sync) 480209962Smm spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 481209962Smm} 482209962Smm 483185029Spjdint 484185029Spjdspa_prop_set(spa_t *spa, nvlist_t *nvp) 485185029Spjd{ 486185029Spjd int error; 487209962Smm nvpair_t *elem; 488209962Smm boolean_t need_sync = B_FALSE; 489209962Smm zpool_prop_t prop; 490185029Spjd 491185029Spjd if ((error = spa_prop_validate(spa, nvp)) != 0) 492185029Spjd return (error); 493185029Spjd 494209962Smm elem = NULL; 495209962Smm while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 496209962Smm if ((prop = zpool_name_to_prop( 497209962Smm nvpair_name(elem))) == ZPROP_INVAL) 498209962Smm return (EINVAL); 499209962Smm 500209962Smm if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) 501209962Smm continue; 502209962Smm 503209962Smm need_sync = B_TRUE; 504209962Smm break; 505209962Smm } 506209962Smm 507209962Smm if (need_sync) 508209962Smm return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 509209962Smm spa, nvp, 3)); 510209962Smm else 511209962Smm return (0); 512185029Spjd} 513185029Spjd 514185029Spjd/* 515185029Spjd * If the bootfs property value is dsobj, clear it. 516185029Spjd */ 517185029Spjdvoid 518185029Spjdspa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 519185029Spjd{ 520185029Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 521185029Spjd VERIFY(zap_remove(spa->spa_meta_objset, 522185029Spjd spa->spa_pool_props_object, 523185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 524185029Spjd spa->spa_bootfs = 0; 525185029Spjd } 526185029Spjd} 527185029Spjd 528185029Spjd/* 529185029Spjd * ========================================================================== 530168404Spjd * SPA state manipulation (open/create/destroy/import/export) 531168404Spjd * ========================================================================== 532168404Spjd */ 533168404Spjd 534168404Spjdstatic int 535168404Spjdspa_error_entry_compare(const void *a, const void *b) 536168404Spjd{ 537168404Spjd spa_error_entry_t *sa = (spa_error_entry_t *)a; 538168404Spjd spa_error_entry_t *sb = (spa_error_entry_t *)b; 539168404Spjd int ret; 540168404Spjd 541168404Spjd ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 542168404Spjd sizeof (zbookmark_t)); 543168404Spjd 544168404Spjd if (ret < 0) 545168404Spjd return (-1); 546168404Spjd else if (ret > 0) 547168404Spjd return (1); 548168404Spjd else 549168404Spjd return (0); 550168404Spjd} 551168404Spjd 552168404Spjd/* 553168404Spjd * Utility function which retrieves copies of the current logs and 554168404Spjd * re-initializes them in the process. 555168404Spjd */ 556168404Spjdvoid 557168404Spjdspa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 558168404Spjd{ 559168404Spjd ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 560168404Spjd 561168404Spjd bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 562168404Spjd bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 563168404Spjd 564168404Spjd avl_create(&spa->spa_errlist_scrub, 565168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 566168404Spjd offsetof(spa_error_entry_t, se_avl)); 567168404Spjd avl_create(&spa->spa_errlist_last, 568168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 569168404Spjd offsetof(spa_error_entry_t, se_avl)); 570168404Spjd} 571168404Spjd 572168404Spjd/* 573168404Spjd * Activate an uninitialized pool. 574168404Spjd */ 575168404Spjdstatic void 576209962Smmspa_activate(spa_t *spa, int mode) 577168404Spjd{ 578168404Spjd ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 579168404Spjd 580168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 581209962Smm spa->spa_mode = mode; 582168404Spjd 583209962Smm spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops); 584209962Smm spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops); 585168404Spjd 586185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 587185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 588211931Smm const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 589211931Smm enum zti_modes mode = ztip->zti_mode; 590211931Smm uint_t value = ztip->zti_value; 591209962Smm char name[32]; 592209962Smm 593209962Smm (void) snprintf(name, sizeof (name), 594211931Smm "%s_%s", zio_type_name[t], zio_taskq_types[q]); 595209962Smm 596209962Smm if (mode == zti_mode_tune) { 597209962Smm mode = zio_taskq_tune_mode; 598209962Smm value = zio_taskq_tune_value; 599209962Smm if (mode == zti_mode_tune) 600209962Smm mode = zti_mode_online_percent; 601209962Smm } 602209962Smm 603209962Smm switch (mode) { 604209962Smm case zti_mode_fixed: 605209962Smm ASSERT3U(value, >=, 1); 606209962Smm value = MAX(value, 1); 607209962Smm 608209962Smm spa->spa_zio_taskq[t][q] = taskq_create(name, 609209962Smm value, maxclsyspri, 50, INT_MAX, 610209962Smm TASKQ_PREPOPULATE); 611209962Smm break; 612209962Smm 613209962Smm case zti_mode_online_percent: 614209962Smm spa->spa_zio_taskq[t][q] = taskq_create(name, 615209962Smm value, maxclsyspri, 50, INT_MAX, 616209962Smm TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); 617209962Smm break; 618209962Smm 619211931Smm case zti_mode_null: 620211931Smm spa->spa_zio_taskq[t][q] = NULL; 621211931Smm break; 622211931Smm 623209962Smm case zti_mode_tune: 624209962Smm default: 625209962Smm panic("unrecognized mode for " 626209962Smm "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) " 627209962Smm "in spa_activate()", 628209962Smm t, q, mode, value); 629209962Smm break; 630209962Smm } 631185029Spjd } 632168404Spjd } 633168404Spjd 634185029Spjd list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 635185029Spjd offsetof(vdev_t, vdev_config_dirty_node)); 636185029Spjd list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 637185029Spjd offsetof(vdev_t, vdev_state_dirty_node)); 638168404Spjd 639168404Spjd txg_list_create(&spa->spa_vdev_txg_list, 640168404Spjd offsetof(struct vdev, vdev_txg_node)); 641168404Spjd 642168404Spjd avl_create(&spa->spa_errlist_scrub, 643168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 644168404Spjd offsetof(spa_error_entry_t, se_avl)); 645168404Spjd avl_create(&spa->spa_errlist_last, 646168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 647168404Spjd offsetof(spa_error_entry_t, se_avl)); 648168404Spjd} 649168404Spjd 650168404Spjd/* 651168404Spjd * Opposite of spa_activate(). 652168404Spjd */ 653168404Spjdstatic void 654168404Spjdspa_deactivate(spa_t *spa) 655168404Spjd{ 656168404Spjd ASSERT(spa->spa_sync_on == B_FALSE); 657168404Spjd ASSERT(spa->spa_dsl_pool == NULL); 658168404Spjd ASSERT(spa->spa_root_vdev == NULL); 659209962Smm ASSERT(spa->spa_async_zio_root == NULL); 660168404Spjd ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 661168404Spjd 662168404Spjd txg_list_destroy(&spa->spa_vdev_txg_list); 663168404Spjd 664185029Spjd list_destroy(&spa->spa_config_dirty_list); 665185029Spjd list_destroy(&spa->spa_state_dirty_list); 666168404Spjd 667185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 668185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 669211931Smm if (spa->spa_zio_taskq[t][q] != NULL) 670211931Smm taskq_destroy(spa->spa_zio_taskq[t][q]); 671185029Spjd spa->spa_zio_taskq[t][q] = NULL; 672185029Spjd } 673168404Spjd } 674168404Spjd 675168404Spjd metaslab_class_destroy(spa->spa_normal_class); 676168404Spjd spa->spa_normal_class = NULL; 677168404Spjd 678185029Spjd metaslab_class_destroy(spa->spa_log_class); 679185029Spjd spa->spa_log_class = NULL; 680185029Spjd 681168404Spjd /* 682168404Spjd * If this was part of an import or the open otherwise failed, we may 683168404Spjd * still have errors left in the queues. Empty them just in case. 684168404Spjd */ 685168404Spjd spa_errlog_drain(spa); 686168404Spjd 687168404Spjd avl_destroy(&spa->spa_errlist_scrub); 688168404Spjd avl_destroy(&spa->spa_errlist_last); 689168404Spjd 690168404Spjd spa->spa_state = POOL_STATE_UNINITIALIZED; 691168404Spjd} 692168404Spjd 693168404Spjd/* 694168404Spjd * Verify a pool configuration, and construct the vdev tree appropriately. This 695168404Spjd * will create all the necessary vdevs in the appropriate layout, with each vdev 696168404Spjd * in the CLOSED state. This will prep the pool before open/creation/import. 697168404Spjd * All vdev validation is done by the vdev_alloc() routine. 698168404Spjd */ 699168404Spjdstatic int 700168404Spjdspa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 701168404Spjd uint_t id, int atype) 702168404Spjd{ 703168404Spjd nvlist_t **child; 704168404Spjd uint_t c, children; 705168404Spjd int error; 706168404Spjd 707168404Spjd if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 708168404Spjd return (error); 709168404Spjd 710168404Spjd if ((*vdp)->vdev_ops->vdev_op_leaf) 711168404Spjd return (0); 712168404Spjd 713185029Spjd error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 714185029Spjd &child, &children); 715185029Spjd 716185029Spjd if (error == ENOENT) 717185029Spjd return (0); 718185029Spjd 719185029Spjd if (error) { 720168404Spjd vdev_free(*vdp); 721168404Spjd *vdp = NULL; 722168404Spjd return (EINVAL); 723168404Spjd } 724168404Spjd 725168404Spjd for (c = 0; c < children; c++) { 726168404Spjd vdev_t *vd; 727168404Spjd if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 728168404Spjd atype)) != 0) { 729168404Spjd vdev_free(*vdp); 730168404Spjd *vdp = NULL; 731168404Spjd return (error); 732168404Spjd } 733168404Spjd } 734168404Spjd 735168404Spjd ASSERT(*vdp != NULL); 736168404Spjd 737168404Spjd return (0); 738168404Spjd} 739168404Spjd 740168404Spjd/* 741168404Spjd * Opposite of spa_load(). 742168404Spjd */ 743168404Spjdstatic void 744168404Spjdspa_unload(spa_t *spa) 745168404Spjd{ 746168404Spjd int i; 747168404Spjd 748185029Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 749185029Spjd 750168404Spjd /* 751168404Spjd * Stop async tasks. 752168404Spjd */ 753168404Spjd spa_async_suspend(spa); 754168404Spjd 755168404Spjd /* 756168404Spjd * Stop syncing. 757168404Spjd */ 758168404Spjd if (spa->spa_sync_on) { 759168404Spjd txg_sync_stop(spa->spa_dsl_pool); 760168404Spjd spa->spa_sync_on = B_FALSE; 761168404Spjd } 762168404Spjd 763168404Spjd /* 764185029Spjd * Wait for any outstanding async I/O to complete. 765168404Spjd */ 766209962Smm if (spa->spa_async_zio_root != NULL) { 767209962Smm (void) zio_wait(spa->spa_async_zio_root); 768209962Smm spa->spa_async_zio_root = NULL; 769209962Smm } 770168404Spjd 771168404Spjd /* 772168404Spjd * Close the dsl pool. 773168404Spjd */ 774168404Spjd if (spa->spa_dsl_pool) { 775168404Spjd dsl_pool_close(spa->spa_dsl_pool); 776168404Spjd spa->spa_dsl_pool = NULL; 777168404Spjd } 778168404Spjd 779209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 780209962Smm 781168404Spjd /* 782209962Smm * Drop and purge level 2 cache 783209962Smm */ 784209962Smm spa_l2cache_drop(spa); 785209962Smm 786209962Smm /* 787168404Spjd * Close all vdevs. 788168404Spjd */ 789168404Spjd if (spa->spa_root_vdev) 790168404Spjd vdev_free(spa->spa_root_vdev); 791168404Spjd ASSERT(spa->spa_root_vdev == NULL); 792168404Spjd 793185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 794185029Spjd vdev_free(spa->spa_spares.sav_vdevs[i]); 795185029Spjd if (spa->spa_spares.sav_vdevs) { 796185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 797185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 798185029Spjd spa->spa_spares.sav_vdevs = NULL; 799168404Spjd } 800185029Spjd if (spa->spa_spares.sav_config) { 801185029Spjd nvlist_free(spa->spa_spares.sav_config); 802185029Spjd spa->spa_spares.sav_config = NULL; 803168404Spjd } 804185029Spjd spa->spa_spares.sav_count = 0; 805168404Spjd 806185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) 807185029Spjd vdev_free(spa->spa_l2cache.sav_vdevs[i]); 808185029Spjd if (spa->spa_l2cache.sav_vdevs) { 809185029Spjd kmem_free(spa->spa_l2cache.sav_vdevs, 810185029Spjd spa->spa_l2cache.sav_count * sizeof (void *)); 811185029Spjd spa->spa_l2cache.sav_vdevs = NULL; 812185029Spjd } 813185029Spjd if (spa->spa_l2cache.sav_config) { 814185029Spjd nvlist_free(spa->spa_l2cache.sav_config); 815185029Spjd spa->spa_l2cache.sav_config = NULL; 816185029Spjd } 817185029Spjd spa->spa_l2cache.sav_count = 0; 818185029Spjd 819168404Spjd spa->spa_async_suspended = 0; 820209962Smm 821209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 822168404Spjd} 823168404Spjd 824168404Spjd/* 825168404Spjd * Load (or re-load) the current list of vdevs describing the active spares for 826168404Spjd * this pool. When this is called, we have some form of basic information in 827185029Spjd * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 828185029Spjd * then re-generate a more complete list including status information. 829168404Spjd */ 830168404Spjdstatic void 831168404Spjdspa_load_spares(spa_t *spa) 832168404Spjd{ 833168404Spjd nvlist_t **spares; 834168404Spjd uint_t nspares; 835168404Spjd int i; 836168404Spjd vdev_t *vd, *tvd; 837168404Spjd 838185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 839185029Spjd 840168404Spjd /* 841168404Spjd * First, close and free any existing spare vdevs. 842168404Spjd */ 843185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 844185029Spjd vd = spa->spa_spares.sav_vdevs[i]; 845168404Spjd 846168404Spjd /* Undo the call to spa_activate() below */ 847185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 848185029Spjd B_FALSE)) != NULL && tvd->vdev_isspare) 849168404Spjd spa_spare_remove(tvd); 850168404Spjd vdev_close(vd); 851168404Spjd vdev_free(vd); 852168404Spjd } 853168404Spjd 854185029Spjd if (spa->spa_spares.sav_vdevs) 855185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 856185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 857168404Spjd 858185029Spjd if (spa->spa_spares.sav_config == NULL) 859168404Spjd nspares = 0; 860168404Spjd else 861185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 862168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 863168404Spjd 864185029Spjd spa->spa_spares.sav_count = (int)nspares; 865185029Spjd spa->spa_spares.sav_vdevs = NULL; 866168404Spjd 867168404Spjd if (nspares == 0) 868168404Spjd return; 869168404Spjd 870168404Spjd /* 871168404Spjd * Construct the array of vdevs, opening them to get status in the 872168404Spjd * process. For each spare, there is potentially two different vdev_t 873168404Spjd * structures associated with it: one in the list of spares (used only 874168404Spjd * for basic validation purposes) and one in the active vdev 875168404Spjd * configuration (if it's spared in). During this phase we open and 876168404Spjd * validate each vdev on the spare list. If the vdev also exists in the 877168404Spjd * active configuration, then we also mark this vdev as an active spare. 878168404Spjd */ 879185029Spjd spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 880185029Spjd KM_SLEEP); 881185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 882168404Spjd VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 883168404Spjd VDEV_ALLOC_SPARE) == 0); 884168404Spjd ASSERT(vd != NULL); 885168404Spjd 886185029Spjd spa->spa_spares.sav_vdevs[i] = vd; 887168404Spjd 888185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 889185029Spjd B_FALSE)) != NULL) { 890168404Spjd if (!tvd->vdev_isspare) 891168404Spjd spa_spare_add(tvd); 892168404Spjd 893168404Spjd /* 894168404Spjd * We only mark the spare active if we were successfully 895168404Spjd * able to load the vdev. Otherwise, importing a pool 896168404Spjd * with a bad active spare would result in strange 897168404Spjd * behavior, because multiple pool would think the spare 898168404Spjd * is actively in use. 899168404Spjd * 900168404Spjd * There is a vulnerability here to an equally bizarre 901168404Spjd * circumstance, where a dead active spare is later 902168404Spjd * brought back to life (onlined or otherwise). Given 903168404Spjd * the rarity of this scenario, and the extra complexity 904168404Spjd * it adds, we ignore the possibility. 905168404Spjd */ 906168404Spjd if (!vdev_is_dead(tvd)) 907168404Spjd spa_spare_activate(tvd); 908168404Spjd } 909168404Spjd 910185029Spjd vd->vdev_top = vd; 911209962Smm vd->vdev_aux = &spa->spa_spares; 912185029Spjd 913168404Spjd if (vdev_open(vd) != 0) 914168404Spjd continue; 915168404Spjd 916185029Spjd if (vdev_validate_aux(vd) == 0) 917185029Spjd spa_spare_add(vd); 918168404Spjd } 919168404Spjd 920168404Spjd /* 921168404Spjd * Recompute the stashed list of spares, with status information 922168404Spjd * this time. 923168404Spjd */ 924185029Spjd VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 925168404Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 926168404Spjd 927185029Spjd spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 928185029Spjd KM_SLEEP); 929185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 930185029Spjd spares[i] = vdev_config_generate(spa, 931185029Spjd spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); 932185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 933185029Spjd ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 934185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 935168404Spjd nvlist_free(spares[i]); 936185029Spjd kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 937168404Spjd} 938168404Spjd 939185029Spjd/* 940185029Spjd * Load (or re-load) the current list of vdevs describing the active l2cache for 941185029Spjd * this pool. When this is called, we have some form of basic information in 942185029Spjd * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 943185029Spjd * then re-generate a more complete list including status information. 944185029Spjd * Devices which are already active have their details maintained, and are 945185029Spjd * not re-opened. 946185029Spjd */ 947185029Spjdstatic void 948185029Spjdspa_load_l2cache(spa_t *spa) 949185029Spjd{ 950185029Spjd nvlist_t **l2cache; 951185029Spjd uint_t nl2cache; 952185029Spjd int i, j, oldnvdevs; 953185029Spjd uint64_t guid, size; 954185029Spjd vdev_t *vd, **oldvdevs, **newvdevs; 955185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 956185029Spjd 957185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 958185029Spjd 959185029Spjd if (sav->sav_config != NULL) { 960185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 961185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 962185029Spjd newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 963185029Spjd } else { 964185029Spjd nl2cache = 0; 965185029Spjd } 966185029Spjd 967185029Spjd oldvdevs = sav->sav_vdevs; 968185029Spjd oldnvdevs = sav->sav_count; 969185029Spjd sav->sav_vdevs = NULL; 970185029Spjd sav->sav_count = 0; 971185029Spjd 972185029Spjd /* 973185029Spjd * Process new nvlist of vdevs. 974185029Spjd */ 975185029Spjd for (i = 0; i < nl2cache; i++) { 976185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 977185029Spjd &guid) == 0); 978185029Spjd 979185029Spjd newvdevs[i] = NULL; 980185029Spjd for (j = 0; j < oldnvdevs; j++) { 981185029Spjd vd = oldvdevs[j]; 982185029Spjd if (vd != NULL && guid == vd->vdev_guid) { 983185029Spjd /* 984185029Spjd * Retain previous vdev for add/remove ops. 985185029Spjd */ 986185029Spjd newvdevs[i] = vd; 987185029Spjd oldvdevs[j] = NULL; 988185029Spjd break; 989185029Spjd } 990185029Spjd } 991185029Spjd 992185029Spjd if (newvdevs[i] == NULL) { 993185029Spjd /* 994185029Spjd * Create new vdev 995185029Spjd */ 996185029Spjd VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 997185029Spjd VDEV_ALLOC_L2CACHE) == 0); 998185029Spjd ASSERT(vd != NULL); 999185029Spjd newvdevs[i] = vd; 1000185029Spjd 1001185029Spjd /* 1002185029Spjd * Commit this vdev as an l2cache device, 1003185029Spjd * even if it fails to open. 1004185029Spjd */ 1005185029Spjd spa_l2cache_add(vd); 1006185029Spjd 1007185029Spjd vd->vdev_top = vd; 1008185029Spjd vd->vdev_aux = sav; 1009185029Spjd 1010185029Spjd spa_l2cache_activate(vd); 1011185029Spjd 1012185029Spjd if (vdev_open(vd) != 0) 1013185029Spjd continue; 1014185029Spjd 1015185029Spjd (void) vdev_validate_aux(vd); 1016185029Spjd 1017185029Spjd if (!vdev_is_dead(vd)) { 1018185029Spjd size = vdev_get_rsize(vd); 1019185029Spjd l2arc_add_vdev(spa, vd, 1020185029Spjd VDEV_LABEL_START_SIZE, 1021185029Spjd size - VDEV_LABEL_START_SIZE); 1022185029Spjd } 1023185029Spjd } 1024185029Spjd } 1025185029Spjd 1026185029Spjd /* 1027185029Spjd * Purge vdevs that were dropped 1028185029Spjd */ 1029185029Spjd for (i = 0; i < oldnvdevs; i++) { 1030185029Spjd uint64_t pool; 1031185029Spjd 1032185029Spjd vd = oldvdevs[i]; 1033185029Spjd if (vd != NULL) { 1034209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1035209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1036185029Spjd l2arc_remove_vdev(vd); 1037185029Spjd (void) vdev_close(vd); 1038185029Spjd spa_l2cache_remove(vd); 1039185029Spjd } 1040185029Spjd } 1041185029Spjd 1042185029Spjd if (oldvdevs) 1043185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1044185029Spjd 1045185029Spjd if (sav->sav_config == NULL) 1046185029Spjd goto out; 1047185029Spjd 1048185029Spjd sav->sav_vdevs = newvdevs; 1049185029Spjd sav->sav_count = (int)nl2cache; 1050185029Spjd 1051185029Spjd /* 1052185029Spjd * Recompute the stashed list of l2cache devices, with status 1053185029Spjd * information this time. 1054185029Spjd */ 1055185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1056185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1057185029Spjd 1058185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1059185029Spjd for (i = 0; i < sav->sav_count; i++) 1060185029Spjd l2cache[i] = vdev_config_generate(spa, 1061185029Spjd sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); 1062185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1063185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1064185029Spjdout: 1065185029Spjd for (i = 0; i < sav->sav_count; i++) 1066185029Spjd nvlist_free(l2cache[i]); 1067185029Spjd if (sav->sav_count) 1068185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1069185029Spjd} 1070185029Spjd 1071168404Spjdstatic int 1072168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1073168404Spjd{ 1074168404Spjd dmu_buf_t *db; 1075168404Spjd char *packed = NULL; 1076168404Spjd size_t nvsize = 0; 1077168404Spjd int error; 1078168404Spjd *value = NULL; 1079168404Spjd 1080168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1081168404Spjd nvsize = *(uint64_t *)db->db_data; 1082168404Spjd dmu_buf_rele(db, FTAG); 1083168404Spjd 1084168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1085209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1086209962Smm DMU_READ_PREFETCH); 1087168404Spjd if (error == 0) 1088168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1089168404Spjd kmem_free(packed, nvsize); 1090168404Spjd 1091168404Spjd return (error); 1092168404Spjd} 1093168404Spjd 1094168404Spjd/* 1095185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1096185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 1097185029Spjd */ 1098185029Spjdstatic void 1099185029Spjdspa_check_removed(vdev_t *vd) 1100185029Spjd{ 1101185029Spjd int c; 1102185029Spjd 1103185029Spjd for (c = 0; c < vd->vdev_children; c++) 1104185029Spjd spa_check_removed(vd->vdev_child[c]); 1105185029Spjd 1106185029Spjd if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1107185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1108185029Spjd spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1109185029Spjd } 1110185029Spjd} 1111185029Spjd 1112185029Spjd/* 1113213197Smm * Load the slog device state from the config object since it's possible 1114213197Smm * that the label does not contain the most up-to-date information. 1115213197Smm */ 1116213197Smmvoid 1117213197Smmspa_load_log_state(spa_t *spa) 1118213197Smm{ 1119213197Smm nvlist_t *nv, *nvroot, **child; 1120213197Smm uint64_t is_log; 1121213197Smm uint_t children, c; 1122213197Smm vdev_t *rvd = spa->spa_root_vdev; 1123213197Smm 1124213197Smm VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0); 1125213197Smm VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1126213197Smm VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1127213197Smm &child, &children) == 0); 1128213197Smm 1129213197Smm for (c = 0; c < children; c++) { 1130213197Smm vdev_t *tvd = rvd->vdev_child[c]; 1131213197Smm 1132213197Smm if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, 1133213197Smm &is_log) == 0 && is_log) 1134213197Smm vdev_load_log_state(tvd, child[c]); 1135213197Smm } 1136213197Smm nvlist_free(nv); 1137213197Smm} 1138213197Smm 1139213197Smm/* 1140185029Spjd * Check for missing log devices 1141185029Spjd */ 1142185029Spjdint 1143185029Spjdspa_check_logs(spa_t *spa) 1144185029Spjd{ 1145185029Spjd switch (spa->spa_log_state) { 1146185029Spjd case SPA_LOG_MISSING: 1147185029Spjd /* need to recheck in case slog has been restored */ 1148185029Spjd case SPA_LOG_UNKNOWN: 1149185029Spjd if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1150185029Spjd DS_FIND_CHILDREN)) { 1151185029Spjd spa->spa_log_state = SPA_LOG_MISSING; 1152185029Spjd return (1); 1153185029Spjd } 1154185029Spjd break; 1155185029Spjd } 1156185029Spjd return (0); 1157185029Spjd} 1158185029Spjd 1159185029Spjd/* 1160168404Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 1161168404Spjd * source of configuration information. 1162168404Spjd */ 1163168404Spjdstatic int 1164168404Spjdspa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 1165168404Spjd{ 1166168404Spjd int error = 0; 1167168404Spjd nvlist_t *nvroot = NULL; 1168168404Spjd vdev_t *rvd; 1169168404Spjd uberblock_t *ub = &spa->spa_uberblock; 1170168404Spjd uint64_t config_cache_txg = spa->spa_config_txg; 1171168404Spjd uint64_t pool_guid; 1172168404Spjd uint64_t version; 1173185029Spjd uint64_t autoreplace = 0; 1174209962Smm int orig_mode = spa->spa_mode; 1175185029Spjd char *ereport = FM_EREPORT_ZFS_POOL; 1176168404Spjd 1177209962Smm /* 1178209962Smm * If this is an untrusted config, access the pool in read-only mode. 1179209962Smm * This prevents things like resilvering recently removed devices. 1180209962Smm */ 1181209962Smm if (!mosconfig) 1182209962Smm spa->spa_mode = FREAD; 1183209962Smm 1184185029Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1185185029Spjd 1186168404Spjd spa->spa_load_state = state; 1187168404Spjd 1188168404Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 1189168404Spjd nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 1190168404Spjd error = EINVAL; 1191168404Spjd goto out; 1192168404Spjd } 1193168404Spjd 1194168404Spjd /* 1195168404Spjd * Versioning wasn't explicitly added to the label until later, so if 1196168404Spjd * it's not present treat it as the initial version. 1197168404Spjd */ 1198168404Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 1199185029Spjd version = SPA_VERSION_INITIAL; 1200168404Spjd 1201168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1202168404Spjd &spa->spa_config_txg); 1203168404Spjd 1204168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1205168404Spjd spa_guid_exists(pool_guid, 0)) { 1206168404Spjd error = EEXIST; 1207168404Spjd goto out; 1208168404Spjd } 1209168404Spjd 1210168404Spjd spa->spa_load_guid = pool_guid; 1211168404Spjd 1212168404Spjd /* 1213209962Smm * Create "The Godfather" zio to hold all async IOs 1214209962Smm */ 1215209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1216209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1217209962Smm 1218209962Smm /* 1219168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 1220168404Spjd * value that will be returned by spa_version() since parsing the 1221168404Spjd * configuration requires knowing the version number. 1222168404Spjd */ 1223185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1224168404Spjd spa->spa_ubsync.ub_version = version; 1225168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 1226185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1227168404Spjd 1228168404Spjd if (error != 0) 1229168404Spjd goto out; 1230168404Spjd 1231168404Spjd ASSERT(spa->spa_root_vdev == rvd); 1232168404Spjd ASSERT(spa_guid(spa) == pool_guid); 1233168404Spjd 1234168404Spjd /* 1235168404Spjd * Try to open all vdevs, loading each label in the process. 1236168404Spjd */ 1237185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1238168926Spjd error = vdev_open(rvd); 1239185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1240168926Spjd if (error != 0) 1241168404Spjd goto out; 1242168404Spjd 1243168404Spjd /* 1244209962Smm * We need to validate the vdev labels against the configuration that 1245209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 1246209962Smm * mosconfig is true then we're validating the vdev labels based on 1247209962Smm * that config. Otherwise, we're validating against the cached config 1248209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 1249209962Smm * later we will recursively call spa_load() and validate against 1250209962Smm * the vdev config. 1251168404Spjd */ 1252185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1253168404Spjd error = vdev_validate(rvd); 1254185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1255168926Spjd if (error != 0) 1256168404Spjd goto out; 1257168404Spjd 1258168404Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1259168404Spjd error = ENXIO; 1260168404Spjd goto out; 1261168404Spjd } 1262168404Spjd 1263168404Spjd /* 1264168404Spjd * Find the best uberblock. 1265168404Spjd */ 1266185029Spjd vdev_uberblock_load(NULL, rvd, ub); 1267168404Spjd 1268168404Spjd /* 1269168404Spjd * If we weren't able to find a single valid uberblock, return failure. 1270168404Spjd */ 1271168404Spjd if (ub->ub_txg == 0) { 1272168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1273168404Spjd VDEV_AUX_CORRUPT_DATA); 1274168404Spjd error = ENXIO; 1275168404Spjd goto out; 1276168404Spjd } 1277168404Spjd 1278168404Spjd /* 1279168404Spjd * If the pool is newer than the code, we can't open it. 1280168404Spjd */ 1281185029Spjd if (ub->ub_version > SPA_VERSION) { 1282168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1283168404Spjd VDEV_AUX_VERSION_NEWER); 1284168404Spjd error = ENOTSUP; 1285168404Spjd goto out; 1286168404Spjd } 1287168404Spjd 1288168404Spjd /* 1289168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 1290168404Spjd * incomplete configuration. 1291168404Spjd */ 1292168404Spjd if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 1293168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1294168404Spjd VDEV_AUX_BAD_GUID_SUM); 1295168404Spjd error = ENXIO; 1296168404Spjd goto out; 1297168404Spjd } 1298168404Spjd 1299168404Spjd /* 1300168404Spjd * Initialize internal SPA structures. 1301168404Spjd */ 1302168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 1303168404Spjd spa->spa_ubsync = spa->spa_uberblock; 1304168404Spjd spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 1305168404Spjd error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1306168404Spjd if (error) { 1307168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1308168404Spjd VDEV_AUX_CORRUPT_DATA); 1309168404Spjd goto out; 1310168404Spjd } 1311168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1312168404Spjd 1313168404Spjd if (zap_lookup(spa->spa_meta_objset, 1314168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1315168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 1316168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1317168404Spjd VDEV_AUX_CORRUPT_DATA); 1318168404Spjd error = EIO; 1319168404Spjd goto out; 1320168404Spjd } 1321168404Spjd 1322168404Spjd if (!mosconfig) { 1323168404Spjd nvlist_t *newconfig; 1324168498Spjd uint64_t hostid; 1325168404Spjd 1326168404Spjd if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 1327168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1328168404Spjd VDEV_AUX_CORRUPT_DATA); 1329168404Spjd error = EIO; 1330168404Spjd goto out; 1331168404Spjd } 1332168404Spjd 1333185029Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig, 1334185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 1335168498Spjd char *hostname; 1336168498Spjd unsigned long myhostid = 0; 1337168498Spjd 1338168498Spjd VERIFY(nvlist_lookup_string(newconfig, 1339168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1340168498Spjd 1341168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1342204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 1343185029Spjd (unsigned long)hostid != myhostid) { 1344168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 1345168498Spjd "loaded as it was last accessed by " 1346185029Spjd "another system (host: %s hostid: 0x%lx). " 1347168498Spjd "See: http://www.sun.com/msg/ZFS-8000-EY", 1348185029Spjd spa_name(spa), hostname, 1349168498Spjd (unsigned long)hostid); 1350168498Spjd error = EBADF; 1351168498Spjd goto out; 1352168498Spjd } 1353168498Spjd } 1354168498Spjd 1355168404Spjd spa_config_set(spa, newconfig); 1356168404Spjd spa_unload(spa); 1357168404Spjd spa_deactivate(spa); 1358209962Smm spa_activate(spa, orig_mode); 1359168404Spjd 1360168404Spjd return (spa_load(spa, newconfig, state, B_TRUE)); 1361168404Spjd } 1362168404Spjd 1363168404Spjd if (zap_lookup(spa->spa_meta_objset, 1364168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1365168404Spjd sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 1366168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1367168404Spjd VDEV_AUX_CORRUPT_DATA); 1368168404Spjd error = EIO; 1369168404Spjd goto out; 1370168404Spjd } 1371168404Spjd 1372168404Spjd /* 1373168404Spjd * Load the bit that tells us to use the new accounting function 1374168404Spjd * (raid-z deflation). If we have an older pool, this will not 1375168404Spjd * be present. 1376168404Spjd */ 1377168404Spjd error = zap_lookup(spa->spa_meta_objset, 1378168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1379168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate); 1380168404Spjd if (error != 0 && error != ENOENT) { 1381168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1382168404Spjd VDEV_AUX_CORRUPT_DATA); 1383168404Spjd error = EIO; 1384168404Spjd goto out; 1385168404Spjd } 1386168404Spjd 1387168404Spjd /* 1388168404Spjd * Load the persistent error log. If we have an older pool, this will 1389168404Spjd * not be present. 1390168404Spjd */ 1391168404Spjd error = zap_lookup(spa->spa_meta_objset, 1392168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 1393168404Spjd sizeof (uint64_t), 1, &spa->spa_errlog_last); 1394168404Spjd if (error != 0 && error != ENOENT) { 1395168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1396168404Spjd VDEV_AUX_CORRUPT_DATA); 1397168404Spjd error = EIO; 1398168404Spjd goto out; 1399168404Spjd } 1400168404Spjd 1401168404Spjd error = zap_lookup(spa->spa_meta_objset, 1402168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1403168404Spjd sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1404168404Spjd if (error != 0 && error != ENOENT) { 1405168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1406168404Spjd VDEV_AUX_CORRUPT_DATA); 1407168404Spjd error = EIO; 1408168404Spjd goto out; 1409168404Spjd } 1410168404Spjd 1411168404Spjd /* 1412168404Spjd * Load the history object. If we have an older pool, this 1413168404Spjd * will not be present. 1414168404Spjd */ 1415168404Spjd error = zap_lookup(spa->spa_meta_objset, 1416168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1417168404Spjd sizeof (uint64_t), 1, &spa->spa_history); 1418168404Spjd if (error != 0 && error != ENOENT) { 1419168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1420168404Spjd VDEV_AUX_CORRUPT_DATA); 1421168404Spjd error = EIO; 1422168404Spjd goto out; 1423168404Spjd } 1424168404Spjd 1425168404Spjd /* 1426168404Spjd * Load any hot spares for this pool. 1427168404Spjd */ 1428168404Spjd error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1429185029Spjd DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); 1430168404Spjd if (error != 0 && error != ENOENT) { 1431168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1432168404Spjd VDEV_AUX_CORRUPT_DATA); 1433168404Spjd error = EIO; 1434168404Spjd goto out; 1435168404Spjd } 1436168404Spjd if (error == 0) { 1437185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1438185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 1439185029Spjd &spa->spa_spares.sav_config) != 0) { 1440168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1441168404Spjd VDEV_AUX_CORRUPT_DATA); 1442168404Spjd error = EIO; 1443168404Spjd goto out; 1444168404Spjd } 1445168404Spjd 1446185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1447168404Spjd spa_load_spares(spa); 1448185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1449168404Spjd } 1450168404Spjd 1451185029Spjd /* 1452185029Spjd * Load any level 2 ARC devices for this pool. 1453185029Spjd */ 1454168404Spjd error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1455185029Spjd DMU_POOL_L2CACHE, sizeof (uint64_t), 1, 1456185029Spjd &spa->spa_l2cache.sav_object); 1457185029Spjd if (error != 0 && error != ENOENT) { 1458185029Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1459185029Spjd VDEV_AUX_CORRUPT_DATA); 1460185029Spjd error = EIO; 1461185029Spjd goto out; 1462185029Spjd } 1463185029Spjd if (error == 0) { 1464185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 1465185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 1466185029Spjd &spa->spa_l2cache.sav_config) != 0) { 1467185029Spjd vdev_set_state(rvd, B_TRUE, 1468185029Spjd VDEV_STATE_CANT_OPEN, 1469185029Spjd VDEV_AUX_CORRUPT_DATA); 1470185029Spjd error = EIO; 1471185029Spjd goto out; 1472185029Spjd } 1473185029Spjd 1474185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1475185029Spjd spa_load_l2cache(spa); 1476185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1477185029Spjd } 1478185029Spjd 1479213197Smm spa_load_log_state(spa); 1480213197Smm 1481185029Spjd if (spa_check_logs(spa)) { 1482185029Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1483185029Spjd VDEV_AUX_BAD_LOG); 1484185029Spjd error = ENXIO; 1485185029Spjd ereport = FM_EREPORT_ZFS_LOG_REPLAY; 1486185029Spjd goto out; 1487185029Spjd } 1488185029Spjd 1489185029Spjd 1490185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1491185029Spjd 1492185029Spjd error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1493168404Spjd DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 1494168404Spjd 1495168404Spjd if (error && error != ENOENT) { 1496168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1497168404Spjd VDEV_AUX_CORRUPT_DATA); 1498168404Spjd error = EIO; 1499168404Spjd goto out; 1500168404Spjd } 1501168404Spjd 1502168404Spjd if (error == 0) { 1503168404Spjd (void) zap_lookup(spa->spa_meta_objset, 1504168404Spjd spa->spa_pool_props_object, 1505185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 1506168404Spjd sizeof (uint64_t), 1, &spa->spa_bootfs); 1507185029Spjd (void) zap_lookup(spa->spa_meta_objset, 1508185029Spjd spa->spa_pool_props_object, 1509185029Spjd zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1510185029Spjd sizeof (uint64_t), 1, &autoreplace); 1511185029Spjd (void) zap_lookup(spa->spa_meta_objset, 1512185029Spjd spa->spa_pool_props_object, 1513185029Spjd zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1514185029Spjd sizeof (uint64_t), 1, &spa->spa_delegation); 1515185029Spjd (void) zap_lookup(spa->spa_meta_objset, 1516185029Spjd spa->spa_pool_props_object, 1517185029Spjd zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 1518185029Spjd sizeof (uint64_t), 1, &spa->spa_failmode); 1519168404Spjd } 1520168404Spjd 1521168404Spjd /* 1522185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 1523185029Spjd * the ZFS DE that it should not issue any faults for unopenable 1524185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 1525185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 1526185029Spjd * over. 1527185029Spjd */ 1528185029Spjd if (autoreplace && state != SPA_LOAD_TRYIMPORT) 1529185029Spjd spa_check_removed(spa->spa_root_vdev); 1530185029Spjd 1531185029Spjd /* 1532168404Spjd * Load the vdev state for all toplevel vdevs. 1533168404Spjd */ 1534168404Spjd vdev_load(rvd); 1535168404Spjd 1536168404Spjd /* 1537168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 1538168404Spjd */ 1539185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1540168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 1541185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1542168404Spjd 1543168404Spjd /* 1544168404Spjd * Check the state of the root vdev. If it can't be opened, it 1545168404Spjd * indicates one or more toplevel vdevs are faulted. 1546168404Spjd */ 1547168404Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1548168404Spjd error = ENXIO; 1549168404Spjd goto out; 1550168404Spjd } 1551168404Spjd 1552209962Smm if (spa_writeable(spa)) { 1553168404Spjd dmu_tx_t *tx; 1554168404Spjd int need_update = B_FALSE; 1555168404Spjd 1556209962Smm ASSERT(state != SPA_LOAD_TRYIMPORT); 1557209962Smm 1558168404Spjd /* 1559168404Spjd * Claim log blocks that haven't been committed yet. 1560168404Spjd * This must all happen in a single txg. 1561168404Spjd */ 1562168404Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), 1563168404Spjd spa_first_txg(spa)); 1564185029Spjd (void) dmu_objset_find(spa_name(spa), 1565168404Spjd zil_claim, tx, DS_FIND_CHILDREN); 1566168404Spjd dmu_tx_commit(tx); 1567168404Spjd 1568213197Smm spa->spa_log_state = SPA_LOG_GOOD; 1569168404Spjd spa->spa_sync_on = B_TRUE; 1570168404Spjd txg_sync_start(spa->spa_dsl_pool); 1571168404Spjd 1572168404Spjd /* 1573168404Spjd * Wait for all claims to sync. 1574168404Spjd */ 1575168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1576168404Spjd 1577168404Spjd /* 1578168404Spjd * If the config cache is stale, or we have uninitialized 1579168404Spjd * metaslabs (see spa_vdev_add()), then update the config. 1580209962Smm * 1581209962Smm * If spa_load_verbatim is true, trust the current 1582209962Smm * in-core spa_config and update the disk labels. 1583168404Spjd */ 1584168404Spjd if (config_cache_txg != spa->spa_config_txg || 1585209962Smm state == SPA_LOAD_IMPORT || spa->spa_load_verbatim) 1586168404Spjd need_update = B_TRUE; 1587168404Spjd 1588209962Smm for (int c = 0; c < rvd->vdev_children; c++) 1589168404Spjd if (rvd->vdev_child[c]->vdev_ms_array == 0) 1590168404Spjd need_update = B_TRUE; 1591168404Spjd 1592168404Spjd /* 1593168404Spjd * Update the config cache asychronously in case we're the 1594168404Spjd * root pool, in which case the config cache isn't writable yet. 1595168404Spjd */ 1596168404Spjd if (need_update) 1597168404Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 1598208683Spjd 1599208683Spjd /* 1600208683Spjd * Check all DTLs to see if anything needs resilvering. 1601208683Spjd */ 1602208683Spjd if (vdev_resilver_needed(rvd, NULL, NULL)) 1603208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 1604168404Spjd } 1605168404Spjd 1606168404Spjd error = 0; 1607168404Spjdout: 1608185029Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 1609168404Spjd if (error && error != EBADF) 1610185029Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1611168404Spjd spa->spa_load_state = SPA_LOAD_NONE; 1612168404Spjd spa->spa_ena = 0; 1613168404Spjd 1614168404Spjd return (error); 1615168404Spjd} 1616168404Spjd 1617168404Spjd/* 1618168404Spjd * Pool Open/Import 1619168404Spjd * 1620168404Spjd * The import case is identical to an open except that the configuration is sent 1621168404Spjd * down from userland, instead of grabbed from the configuration cache. For the 1622168404Spjd * case of an open, the pool configuration will exist in the 1623185029Spjd * POOL_STATE_UNINITIALIZED state. 1624168404Spjd * 1625168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 1626168404Spjd * the same time open the pool, without having to keep around the spa_t in some 1627168404Spjd * ambiguous state. 1628168404Spjd */ 1629168404Spjdstatic int 1630168404Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 1631168404Spjd{ 1632168404Spjd spa_t *spa; 1633168404Spjd int error; 1634168404Spjd int locked = B_FALSE; 1635168404Spjd 1636168404Spjd *spapp = NULL; 1637168404Spjd 1638168404Spjd /* 1639168404Spjd * As disgusting as this is, we need to support recursive calls to this 1640168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 1641168404Spjd * up calling spa_open() again. The real fix is to figure out how to 1642168404Spjd * avoid dsl_dir_open() calling this in the first place. 1643168404Spjd */ 1644168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 1645168404Spjd mutex_enter(&spa_namespace_lock); 1646168404Spjd locked = B_TRUE; 1647168404Spjd } 1648168404Spjd 1649168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 1650168404Spjd if (locked) 1651168404Spjd mutex_exit(&spa_namespace_lock); 1652168404Spjd return (ENOENT); 1653168404Spjd } 1654168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 1655168404Spjd 1656209962Smm spa_activate(spa, spa_mode_global); 1657168404Spjd 1658168404Spjd error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 1659168404Spjd 1660168404Spjd if (error == EBADF) { 1661168404Spjd /* 1662168404Spjd * If vdev_validate() returns failure (indicated by 1663168404Spjd * EBADF), it indicates that one of the vdevs indicates 1664168404Spjd * that the pool has been exported or destroyed. If 1665168404Spjd * this is the case, the config cache is out of sync and 1666168404Spjd * we should remove the pool from the namespace. 1667168404Spjd */ 1668168404Spjd spa_unload(spa); 1669168404Spjd spa_deactivate(spa); 1670185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE); 1671168404Spjd spa_remove(spa); 1672168404Spjd if (locked) 1673168404Spjd mutex_exit(&spa_namespace_lock); 1674168404Spjd return (ENOENT); 1675168404Spjd } 1676168404Spjd 1677168404Spjd if (error) { 1678168404Spjd /* 1679168404Spjd * We can't open the pool, but we still have useful 1680168404Spjd * information: the state of each vdev after the 1681168404Spjd * attempted vdev_open(). Return this to the user. 1682168404Spjd */ 1683185029Spjd if (config != NULL && spa->spa_root_vdev != NULL) 1684168404Spjd *config = spa_config_generate(spa, NULL, -1ULL, 1685168404Spjd B_TRUE); 1686168404Spjd spa_unload(spa); 1687168404Spjd spa_deactivate(spa); 1688168404Spjd spa->spa_last_open_failed = B_TRUE; 1689168404Spjd if (locked) 1690168404Spjd mutex_exit(&spa_namespace_lock); 1691168404Spjd *spapp = NULL; 1692168404Spjd return (error); 1693168404Spjd } else { 1694168404Spjd spa->spa_last_open_failed = B_FALSE; 1695168404Spjd } 1696168404Spjd } 1697168404Spjd 1698168404Spjd spa_open_ref(spa, tag); 1699185029Spjd 1700168404Spjd if (locked) 1701168404Spjd mutex_exit(&spa_namespace_lock); 1702168404Spjd 1703168404Spjd *spapp = spa; 1704168404Spjd 1705185029Spjd if (config != NULL) 1706168404Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1707168404Spjd 1708168404Spjd return (0); 1709168404Spjd} 1710168404Spjd 1711168404Spjdint 1712168404Spjdspa_open(const char *name, spa_t **spapp, void *tag) 1713168404Spjd{ 1714168404Spjd return (spa_open_common(name, spapp, tag, NULL)); 1715168404Spjd} 1716168404Spjd 1717168404Spjd/* 1718168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 1719168404Spjd * preventing it from being exported or destroyed. 1720168404Spjd */ 1721168404Spjdspa_t * 1722168404Spjdspa_inject_addref(char *name) 1723168404Spjd{ 1724168404Spjd spa_t *spa; 1725168404Spjd 1726168404Spjd mutex_enter(&spa_namespace_lock); 1727168404Spjd if ((spa = spa_lookup(name)) == NULL) { 1728168404Spjd mutex_exit(&spa_namespace_lock); 1729168404Spjd return (NULL); 1730168404Spjd } 1731168404Spjd spa->spa_inject_ref++; 1732168404Spjd mutex_exit(&spa_namespace_lock); 1733168404Spjd 1734168404Spjd return (spa); 1735168404Spjd} 1736168404Spjd 1737168404Spjdvoid 1738168404Spjdspa_inject_delref(spa_t *spa) 1739168404Spjd{ 1740168404Spjd mutex_enter(&spa_namespace_lock); 1741168404Spjd spa->spa_inject_ref--; 1742168404Spjd mutex_exit(&spa_namespace_lock); 1743168404Spjd} 1744168404Spjd 1745185029Spjd/* 1746185029Spjd * Add spares device information to the nvlist. 1747185029Spjd */ 1748168404Spjdstatic void 1749168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 1750168404Spjd{ 1751168404Spjd nvlist_t **spares; 1752168404Spjd uint_t i, nspares; 1753168404Spjd nvlist_t *nvroot; 1754168404Spjd uint64_t guid; 1755168404Spjd vdev_stat_t *vs; 1756168404Spjd uint_t vsc; 1757168404Spjd uint64_t pool; 1758168404Spjd 1759209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 1760209962Smm 1761185029Spjd if (spa->spa_spares.sav_count == 0) 1762168404Spjd return; 1763168404Spjd 1764168404Spjd VERIFY(nvlist_lookup_nvlist(config, 1765168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1766185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1767168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1768168404Spjd if (nspares != 0) { 1769168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 1770168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1771168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 1772168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1773168404Spjd 1774168404Spjd /* 1775168404Spjd * Go through and find any spares which have since been 1776168404Spjd * repurposed as an active spare. If this is the case, update 1777168404Spjd * their status appropriately. 1778168404Spjd */ 1779168404Spjd for (i = 0; i < nspares; i++) { 1780168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 1781168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 1782185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 1783185029Spjd pool != 0ULL) { 1784168404Spjd VERIFY(nvlist_lookup_uint64_array( 1785168404Spjd spares[i], ZPOOL_CONFIG_STATS, 1786168404Spjd (uint64_t **)&vs, &vsc) == 0); 1787168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 1788168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 1789168404Spjd } 1790168404Spjd } 1791168404Spjd } 1792168404Spjd} 1793168404Spjd 1794185029Spjd/* 1795185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 1796185029Spjd */ 1797185029Spjdstatic void 1798185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 1799185029Spjd{ 1800185029Spjd nvlist_t **l2cache; 1801185029Spjd uint_t i, j, nl2cache; 1802185029Spjd nvlist_t *nvroot; 1803185029Spjd uint64_t guid; 1804185029Spjd vdev_t *vd; 1805185029Spjd vdev_stat_t *vs; 1806185029Spjd uint_t vsc; 1807185029Spjd 1808209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 1809209962Smm 1810185029Spjd if (spa->spa_l2cache.sav_count == 0) 1811185029Spjd return; 1812185029Spjd 1813185029Spjd VERIFY(nvlist_lookup_nvlist(config, 1814185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1815185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 1816185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1817185029Spjd if (nl2cache != 0) { 1818185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 1819185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 1820185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 1821185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1822185029Spjd 1823185029Spjd /* 1824185029Spjd * Update level 2 cache device stats. 1825185029Spjd */ 1826185029Spjd 1827185029Spjd for (i = 0; i < nl2cache; i++) { 1828185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 1829185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 1830185029Spjd 1831185029Spjd vd = NULL; 1832185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 1833185029Spjd if (guid == 1834185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 1835185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 1836185029Spjd break; 1837185029Spjd } 1838185029Spjd } 1839185029Spjd ASSERT(vd != NULL); 1840185029Spjd 1841185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 1842185029Spjd ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 1843185029Spjd vdev_get_stats(vd, vs); 1844185029Spjd } 1845185029Spjd } 1846185029Spjd} 1847185029Spjd 1848168404Spjdint 1849168404Spjdspa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1850168404Spjd{ 1851168404Spjd int error; 1852168404Spjd spa_t *spa; 1853168404Spjd 1854168404Spjd *config = NULL; 1855168404Spjd error = spa_open_common(name, &spa, FTAG, config); 1856168404Spjd 1857209962Smm if (spa != NULL) { 1858209962Smm /* 1859209962Smm * This still leaves a window of inconsistency where the spares 1860209962Smm * or l2cache devices could change and the config would be 1861209962Smm * self-inconsistent. 1862209962Smm */ 1863209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 1864168404Spjd 1865209962Smm if (*config != NULL) { 1866185029Spjd VERIFY(nvlist_add_uint64(*config, 1867209962Smm ZPOOL_CONFIG_ERRCOUNT, 1868209962Smm spa_get_errlog_size(spa)) == 0); 1869185029Spjd 1870209962Smm if (spa_suspended(spa)) 1871209962Smm VERIFY(nvlist_add_uint64(*config, 1872209962Smm ZPOOL_CONFIG_SUSPENDED, 1873209962Smm spa->spa_failmode) == 0); 1874209962Smm 1875209962Smm spa_add_spares(spa, *config); 1876209962Smm spa_add_l2cache(spa, *config); 1877209962Smm } 1878168404Spjd } 1879168404Spjd 1880168404Spjd /* 1881168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 1882168404Spjd * and call spa_lookup() directly. 1883168404Spjd */ 1884168404Spjd if (altroot) { 1885168404Spjd if (spa == NULL) { 1886168404Spjd mutex_enter(&spa_namespace_lock); 1887168404Spjd spa = spa_lookup(name); 1888168404Spjd if (spa) 1889168404Spjd spa_altroot(spa, altroot, buflen); 1890168404Spjd else 1891168404Spjd altroot[0] = '\0'; 1892168404Spjd spa = NULL; 1893168404Spjd mutex_exit(&spa_namespace_lock); 1894168404Spjd } else { 1895168404Spjd spa_altroot(spa, altroot, buflen); 1896168404Spjd } 1897168404Spjd } 1898168404Spjd 1899209962Smm if (spa != NULL) { 1900209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 1901168404Spjd spa_close(spa, FTAG); 1902209962Smm } 1903168404Spjd 1904168404Spjd return (error); 1905168404Spjd} 1906168404Spjd 1907168404Spjd/* 1908185029Spjd * Validate that the auxiliary device array is well formed. We must have an 1909185029Spjd * array of nvlists, each which describes a valid leaf vdev. If this is an 1910185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 1911185029Spjd * specified, as long as they are well-formed. 1912168404Spjd */ 1913168404Spjdstatic int 1914185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 1915185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 1916185029Spjd vdev_labeltype_t label) 1917168404Spjd{ 1918185029Spjd nvlist_t **dev; 1919185029Spjd uint_t i, ndev; 1920168404Spjd vdev_t *vd; 1921168404Spjd int error; 1922168404Spjd 1923185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1924185029Spjd 1925168404Spjd /* 1926185029Spjd * It's acceptable to have no devs specified. 1927168404Spjd */ 1928185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 1929168404Spjd return (0); 1930168404Spjd 1931185029Spjd if (ndev == 0) 1932168404Spjd return (EINVAL); 1933168404Spjd 1934168404Spjd /* 1935185029Spjd * Make sure the pool is formatted with a version that supports this 1936185029Spjd * device type. 1937168404Spjd */ 1938185029Spjd if (spa_version(spa) < version) 1939168404Spjd return (ENOTSUP); 1940168404Spjd 1941168404Spjd /* 1942185029Spjd * Set the pending device list so we correctly handle device in-use 1943168404Spjd * checking. 1944168404Spjd */ 1945185029Spjd sav->sav_pending = dev; 1946185029Spjd sav->sav_npending = ndev; 1947168404Spjd 1948185029Spjd for (i = 0; i < ndev; i++) { 1949185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 1950168404Spjd mode)) != 0) 1951168404Spjd goto out; 1952168404Spjd 1953168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 1954168404Spjd vdev_free(vd); 1955168404Spjd error = EINVAL; 1956168404Spjd goto out; 1957168404Spjd } 1958168404Spjd 1959185029Spjd /* 1960185029Spjd * The L2ARC currently only supports disk devices in 1961185029Spjd * kernel context. For user-level testing, we allow it. 1962185029Spjd */ 1963185029Spjd#ifdef _KERNEL 1964185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 1965185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 1966185029Spjd error = ENOTBLK; 1967185029Spjd goto out; 1968185029Spjd } 1969185029Spjd#endif 1970168404Spjd vd->vdev_top = vd; 1971168404Spjd 1972168404Spjd if ((error = vdev_open(vd)) == 0 && 1973185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 1974185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 1975168404Spjd vd->vdev_guid) == 0); 1976168404Spjd } 1977168404Spjd 1978168404Spjd vdev_free(vd); 1979168404Spjd 1980185029Spjd if (error && 1981185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 1982168404Spjd goto out; 1983168404Spjd else 1984168404Spjd error = 0; 1985168404Spjd } 1986168404Spjd 1987168404Spjdout: 1988185029Spjd sav->sav_pending = NULL; 1989185029Spjd sav->sav_npending = 0; 1990168404Spjd return (error); 1991168404Spjd} 1992168404Spjd 1993185029Spjdstatic int 1994185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1995185029Spjd{ 1996185029Spjd int error; 1997185029Spjd 1998185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1999185029Spjd 2000185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2001185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 2002185029Spjd VDEV_LABEL_SPARE)) != 0) { 2003185029Spjd return (error); 2004185029Spjd } 2005185029Spjd 2006185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2007185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 2008185029Spjd VDEV_LABEL_L2CACHE)); 2009185029Spjd} 2010185029Spjd 2011185029Spjdstatic void 2012185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 2013185029Spjd const char *config) 2014185029Spjd{ 2015185029Spjd int i; 2016185029Spjd 2017185029Spjd if (sav->sav_config != NULL) { 2018185029Spjd nvlist_t **olddevs; 2019185029Spjd uint_t oldndevs; 2020185029Spjd nvlist_t **newdevs; 2021185029Spjd 2022185029Spjd /* 2023185029Spjd * Generate new dev list by concatentating with the 2024185029Spjd * current dev list. 2025185029Spjd */ 2026185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 2027185029Spjd &olddevs, &oldndevs) == 0); 2028185029Spjd 2029185029Spjd newdevs = kmem_alloc(sizeof (void *) * 2030185029Spjd (ndevs + oldndevs), KM_SLEEP); 2031185029Spjd for (i = 0; i < oldndevs; i++) 2032185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 2033185029Spjd KM_SLEEP) == 0); 2034185029Spjd for (i = 0; i < ndevs; i++) 2035185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 2036185029Spjd KM_SLEEP) == 0); 2037185029Spjd 2038185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 2039185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 2040185029Spjd 2041185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 2042185029Spjd config, newdevs, ndevs + oldndevs) == 0); 2043185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 2044185029Spjd nvlist_free(newdevs[i]); 2045185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 2046185029Spjd } else { 2047185029Spjd /* 2048185029Spjd * Generate a new dev list. 2049185029Spjd */ 2050185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2051185029Spjd KM_SLEEP) == 0); 2052185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2053185029Spjd devs, ndevs) == 0); 2054185029Spjd } 2055185029Spjd} 2056185029Spjd 2057168404Spjd/* 2058185029Spjd * Stop and drop level 2 ARC devices 2059185029Spjd */ 2060185029Spjdvoid 2061185029Spjdspa_l2cache_drop(spa_t *spa) 2062185029Spjd{ 2063185029Spjd vdev_t *vd; 2064185029Spjd int i; 2065185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 2066185029Spjd 2067185029Spjd for (i = 0; i < sav->sav_count; i++) { 2068185029Spjd uint64_t pool; 2069185029Spjd 2070185029Spjd vd = sav->sav_vdevs[i]; 2071185029Spjd ASSERT(vd != NULL); 2072185029Spjd 2073209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2074209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 2075185029Spjd l2arc_remove_vdev(vd); 2076185029Spjd if (vd->vdev_isl2cache) 2077185029Spjd spa_l2cache_remove(vd); 2078185029Spjd vdev_clear_stats(vd); 2079185029Spjd (void) vdev_close(vd); 2080185029Spjd } 2081185029Spjd} 2082185029Spjd 2083185029Spjd/* 2084168404Spjd * Pool Creation 2085168404Spjd */ 2086168404Spjdint 2087185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2088185029Spjd const char *history_str, nvlist_t *zplprops) 2089168404Spjd{ 2090168404Spjd spa_t *spa; 2091185029Spjd char *altroot = NULL; 2092168404Spjd vdev_t *rvd; 2093168404Spjd dsl_pool_t *dp; 2094168404Spjd dmu_tx_t *tx; 2095168404Spjd int c, error = 0; 2096168404Spjd uint64_t txg = TXG_INITIAL; 2097185029Spjd nvlist_t **spares, **l2cache; 2098185029Spjd uint_t nspares, nl2cache; 2099185029Spjd uint64_t version; 2100168404Spjd 2101168404Spjd /* 2102168404Spjd * If this pool already exists, return failure. 2103168404Spjd */ 2104168404Spjd mutex_enter(&spa_namespace_lock); 2105168404Spjd if (spa_lookup(pool) != NULL) { 2106168404Spjd mutex_exit(&spa_namespace_lock); 2107168404Spjd return (EEXIST); 2108168404Spjd } 2109168404Spjd 2110168404Spjd /* 2111168404Spjd * Allocate a new spa_t structure. 2112168404Spjd */ 2113185029Spjd (void) nvlist_lookup_string(props, 2114185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2115168404Spjd spa = spa_add(pool, altroot); 2116209962Smm spa_activate(spa, spa_mode_global); 2117168404Spjd 2118168404Spjd spa->spa_uberblock.ub_txg = txg - 1; 2119185029Spjd 2120185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 2121185029Spjd spa_deactivate(spa); 2122185029Spjd spa_remove(spa); 2123185029Spjd mutex_exit(&spa_namespace_lock); 2124185029Spjd return (error); 2125185029Spjd } 2126185029Spjd 2127185029Spjd if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2128185029Spjd &version) != 0) 2129185029Spjd version = SPA_VERSION; 2130185029Spjd ASSERT(version <= SPA_VERSION); 2131185029Spjd spa->spa_uberblock.ub_version = version; 2132168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2133168404Spjd 2134168404Spjd /* 2135209962Smm * Create "The Godfather" zio to hold all async IOs 2136209962Smm */ 2137209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2138209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2139209962Smm 2140209962Smm /* 2141168404Spjd * Create the root vdev. 2142168404Spjd */ 2143185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2144168404Spjd 2145168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2146168404Spjd 2147168404Spjd ASSERT(error != 0 || rvd != NULL); 2148168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2149168404Spjd 2150185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 2151168404Spjd error = EINVAL; 2152168404Spjd 2153168404Spjd if (error == 0 && 2154168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2155185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 2156168404Spjd VDEV_ALLOC_ADD)) == 0) { 2157168404Spjd for (c = 0; c < rvd->vdev_children; c++) 2158168404Spjd vdev_init(rvd->vdev_child[c], txg); 2159168404Spjd vdev_config_dirty(rvd); 2160168404Spjd } 2161168404Spjd 2162185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2163168404Spjd 2164168404Spjd if (error != 0) { 2165168404Spjd spa_unload(spa); 2166168404Spjd spa_deactivate(spa); 2167168404Spjd spa_remove(spa); 2168168404Spjd mutex_exit(&spa_namespace_lock); 2169168404Spjd return (error); 2170168404Spjd } 2171168404Spjd 2172168404Spjd /* 2173168404Spjd * Get the list of spares, if specified. 2174168404Spjd */ 2175168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2176168404Spjd &spares, &nspares) == 0) { 2177185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2178168404Spjd KM_SLEEP) == 0); 2179185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2180168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2181185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2182168404Spjd spa_load_spares(spa); 2183185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2184185029Spjd spa->spa_spares.sav_sync = B_TRUE; 2185168404Spjd } 2186168404Spjd 2187185029Spjd /* 2188185029Spjd * Get the list of level 2 cache devices, if specified. 2189185029Spjd */ 2190185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2191185029Spjd &l2cache, &nl2cache) == 0) { 2192185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2193185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 2194185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2195185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2196185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2197185029Spjd spa_load_l2cache(spa); 2198185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2199185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2200185029Spjd } 2201185029Spjd 2202185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2203168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 2204168404Spjd 2205168404Spjd tx = dmu_tx_create_assigned(dp, txg); 2206168404Spjd 2207168404Spjd /* 2208168404Spjd * Create the pool config object. 2209168404Spjd */ 2210168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2211185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2212168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2213168404Spjd 2214168404Spjd if (zap_add(spa->spa_meta_objset, 2215168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2216168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2217168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 2218168404Spjd } 2219168404Spjd 2220185029Spjd /* Newly created pools with the right version are always deflated. */ 2221185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2222185029Spjd spa->spa_deflate = TRUE; 2223185029Spjd if (zap_add(spa->spa_meta_objset, 2224185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2225185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2226185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 2227185029Spjd } 2228168404Spjd } 2229168404Spjd 2230168404Spjd /* 2231168404Spjd * Create the deferred-free bplist object. Turn off compression 2232168404Spjd * because sync-to-convergence takes longer if the blocksize 2233168404Spjd * keeps changing. 2234168404Spjd */ 2235168404Spjd spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 2236168404Spjd 1 << 14, tx); 2237168404Spjd dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 2238168404Spjd ZIO_COMPRESS_OFF, tx); 2239168404Spjd 2240168404Spjd if (zap_add(spa->spa_meta_objset, 2241168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2242168404Spjd sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 2243168404Spjd cmn_err(CE_PANIC, "failed to add bplist"); 2244168404Spjd } 2245168404Spjd 2246168404Spjd /* 2247168404Spjd * Create the pool's history object. 2248168404Spjd */ 2249185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 2250185029Spjd spa_history_create_obj(spa, tx); 2251168404Spjd 2252185029Spjd /* 2253185029Spjd * Set pool properties. 2254185029Spjd */ 2255185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2256185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2257185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2258209962Smm if (props != NULL) { 2259209962Smm spa_configfile_set(spa, props, B_FALSE); 2260185029Spjd spa_sync_props(spa, props, CRED(), tx); 2261209962Smm } 2262185029Spjd 2263168404Spjd dmu_tx_commit(tx); 2264168404Spjd 2265168404Spjd spa->spa_sync_on = B_TRUE; 2266168404Spjd txg_sync_start(spa->spa_dsl_pool); 2267168404Spjd 2268168404Spjd /* 2269168404Spjd * We explicitly wait for the first transaction to complete so that our 2270168404Spjd * bean counters are appropriately updated. 2271168404Spjd */ 2272168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 2273168404Spjd 2274185029Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 2275168404Spjd 2276185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2277185029Spjd (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2278185029Spjd 2279208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 2280208442Smm 2281168404Spjd mutex_exit(&spa_namespace_lock); 2282168404Spjd 2283168404Spjd return (0); 2284168404Spjd} 2285168404Spjd 2286209962Smm#ifdef sun 2287185029Spjd#ifdef _KERNEL 2288185029Spjd/* 2289185029Spjd * Build a "root" vdev for a top level vdev read in from a rootpool 2290185029Spjd * device label. 2291185029Spjd */ 2292185029Spjdstatic void 2293185029Spjdspa_build_rootpool_config(nvlist_t *config) 2294185029Spjd{ 2295185029Spjd nvlist_t *nvtop, *nvroot; 2296185029Spjd uint64_t pgid; 2297185029Spjd 2298168404Spjd /* 2299185029Spjd * Add this top-level vdev to the child array. 2300168404Spjd */ 2301185029Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) 2302185029Spjd == 0); 2303185029Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) 2304185029Spjd == 0); 2305168404Spjd 2306185029Spjd /* 2307185029Spjd * Put this pool's top-level vdevs into a root vdev. 2308185029Spjd */ 2309185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2310185029Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) 2311185029Spjd == 0); 2312185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2313185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2314185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2315185029Spjd &nvtop, 1) == 0); 2316168404Spjd 2317168404Spjd /* 2318185029Spjd * Replace the existing vdev_tree with the new root vdev in 2319185029Spjd * this pool's configuration (remove the old, add the new). 2320168404Spjd */ 2321185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2322185029Spjd nvlist_free(nvroot); 2323185029Spjd} 2324168404Spjd 2325185029Spjd/* 2326185029Spjd * Get the root pool information from the root disk, then import the root pool 2327185029Spjd * during the system boot up time. 2328185029Spjd */ 2329185029Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2330185029Spjd 2331185029Spjdint 2332185029Spjdspa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, 2333185029Spjd uint64_t *besttxg) 2334185029Spjd{ 2335185029Spjd nvlist_t *config; 2336185029Spjd uint64_t txg; 2337185029Spjd int error; 2338185029Spjd 2339185029Spjd if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) 2340185029Spjd return (error); 2341185029Spjd 2342185029Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2343185029Spjd 2344185029Spjd if (bestconf != NULL) 2345185029Spjd *bestconf = config; 2346185029Spjd else 2347185029Spjd nvlist_free(config); 2348185029Spjd *besttxg = txg; 2349168404Spjd return (0); 2350168404Spjd} 2351168404Spjd 2352185029Spjdboolean_t 2353185029Spjdspa_rootdev_validate(nvlist_t *nv) 2354185029Spjd{ 2355185029Spjd uint64_t ival; 2356185029Spjd 2357185029Spjd if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || 2358185029Spjd nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || 2359185029Spjd nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) 2360185029Spjd return (B_FALSE); 2361185029Spjd 2362185029Spjd return (B_TRUE); 2363185029Spjd} 2364185029Spjd 2365185029Spjd 2366168404Spjd/* 2367185029Spjd * Given the boot device's physical path or devid, check if the device 2368185029Spjd * is in a valid state. If so, return the configuration from the vdev 2369185029Spjd * label. 2370185029Spjd */ 2371185029Spjdint 2372185029Spjdspa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) 2373185029Spjd{ 2374185029Spjd nvlist_t *conf = NULL; 2375185029Spjd uint64_t txg = 0; 2376185029Spjd nvlist_t *nvtop, **child; 2377185029Spjd char *type; 2378185029Spjd char *bootpath = NULL; 2379185029Spjd uint_t children, c; 2380185029Spjd char *tmp; 2381185029Spjd int error; 2382185029Spjd 2383185029Spjd if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) 2384185029Spjd *tmp = '\0'; 2385185029Spjd if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { 2386185029Spjd cmn_err(CE_NOTE, "error reading device label"); 2387185029Spjd return (error); 2388185029Spjd } 2389185029Spjd if (txg == 0) { 2390185029Spjd cmn_err(CE_NOTE, "this device is detached"); 2391185029Spjd nvlist_free(conf); 2392185029Spjd return (EINVAL); 2393185029Spjd } 2394185029Spjd 2395185029Spjd VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, 2396185029Spjd &nvtop) == 0); 2397185029Spjd VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); 2398185029Spjd 2399185029Spjd if (strcmp(type, VDEV_TYPE_DISK) == 0) { 2400185029Spjd if (spa_rootdev_validate(nvtop)) { 2401185029Spjd goto out; 2402185029Spjd } else { 2403185029Spjd nvlist_free(conf); 2404185029Spjd return (EINVAL); 2405185029Spjd } 2406185029Spjd } 2407185029Spjd 2408185029Spjd ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); 2409185029Spjd 2410185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, 2411185029Spjd &child, &children) == 0); 2412185029Spjd 2413185029Spjd /* 2414185029Spjd * Go thru vdevs in the mirror to see if the given device 2415185029Spjd * has the most recent txg. Only the device with the most 2416185029Spjd * recent txg has valid information and should be booted. 2417185029Spjd */ 2418185029Spjd for (c = 0; c < children; c++) { 2419185029Spjd char *cdevid, *cpath; 2420185029Spjd uint64_t tmptxg; 2421185029Spjd 2422209962Smm cpath = NULL; 2423209962Smm cdevid = NULL; 2424185029Spjd if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, 2425209962Smm &cpath) != 0 && nvlist_lookup_string(child[c], 2426209962Smm ZPOOL_CONFIG_DEVID, &cdevid) != 0) 2427185029Spjd return (EINVAL); 2428185029Spjd if ((spa_check_rootconf(cpath, cdevid, NULL, 2429185029Spjd &tmptxg) == 0) && (tmptxg > txg)) { 2430185029Spjd txg = tmptxg; 2431185029Spjd VERIFY(nvlist_lookup_string(child[c], 2432185029Spjd ZPOOL_CONFIG_PATH, &bootpath) == 0); 2433185029Spjd } 2434185029Spjd } 2435185029Spjd 2436185029Spjd /* Does the best device match the one we've booted from? */ 2437185029Spjd if (bootpath) { 2438185029Spjd cmn_err(CE_NOTE, "try booting from '%s'", bootpath); 2439185029Spjd return (EINVAL); 2440185029Spjd } 2441185029Spjdout: 2442185029Spjd *bestconf = conf; 2443185029Spjd return (0); 2444185029Spjd} 2445185029Spjd 2446185029Spjd/* 2447185029Spjd * Import a root pool. 2448185029Spjd * 2449185029Spjd * For x86. devpath_list will consist of devid and/or physpath name of 2450185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2451185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 2452185029Spjd * 2453185029Spjd * For Sparc, devpath_list consists the physpath name of the booting device 2454185029Spjd * no matter the rootpool is a single device pool or a mirrored pool. 2455185029Spjd * e.g. 2456185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 2457185029Spjd */ 2458185029Spjdint 2459185029Spjdspa_import_rootpool(char *devpath, char *devid) 2460185029Spjd{ 2461185029Spjd nvlist_t *conf = NULL; 2462185029Spjd char *pname; 2463185029Spjd int error; 2464209962Smm spa_t *spa; 2465185029Spjd 2466185029Spjd /* 2467185029Spjd * Get the vdev pathname and configuation from the most 2468185029Spjd * recently updated vdev (highest txg). 2469185029Spjd */ 2470185029Spjd if (error = spa_get_rootconf(devpath, devid, &conf)) 2471185029Spjd goto msg_out; 2472185029Spjd 2473185029Spjd /* 2474185029Spjd * Add type "root" vdev to the config. 2475185029Spjd */ 2476185029Spjd spa_build_rootpool_config(conf); 2477185029Spjd 2478185029Spjd VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); 2479185029Spjd 2480209962Smm mutex_enter(&spa_namespace_lock); 2481209962Smm if ((spa = spa_lookup(pname)) != NULL) { 2482209962Smm /* 2483209962Smm * Remove the existing root pool from the namespace so that we 2484209962Smm * can replace it with the correct config we just read in. 2485209962Smm */ 2486209962Smm spa_remove(spa); 2487209962Smm } 2488185029Spjd 2489209962Smm spa = spa_add(pname, NULL); 2490209962Smm spa->spa_is_root = B_TRUE; 2491209962Smm spa->spa_load_verbatim = B_TRUE; 2492209962Smm 2493209962Smm VERIFY(nvlist_dup(conf, &spa->spa_config, 0) == 0); 2494209962Smm mutex_exit(&spa_namespace_lock); 2495209962Smm 2496185029Spjd nvlist_free(conf); 2497209962Smm return (0); 2498185029Spjd 2499185029Spjdmsg_out: 2500185029Spjd cmn_err(CE_NOTE, "\n" 2501185029Spjd " *************************************************** \n" 2502185029Spjd " * This device is not bootable! * \n" 2503185029Spjd " * It is either offlined or detached or faulted. * \n" 2504185029Spjd " * Please try to boot from a different device. * \n" 2505185029Spjd " *************************************************** "); 2506185029Spjd 2507185029Spjd return (error); 2508185029Spjd} 2509185029Spjd#endif 2510209962Smm#endif /* sun */ 2511185029Spjd 2512185029Spjd/* 2513209962Smm * Take a pool and insert it into the namespace as if it had been loaded at 2514209962Smm * boot. 2515185029Spjd */ 2516185029Spjdint 2517209962Smmspa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 2518185029Spjd{ 2519209962Smm spa_t *spa; 2520209962Smm char *altroot = NULL; 2521209962Smm 2522209962Smm mutex_enter(&spa_namespace_lock); 2523209962Smm if (spa_lookup(pool) != NULL) { 2524209962Smm mutex_exit(&spa_namespace_lock); 2525209962Smm return (EEXIST); 2526209962Smm } 2527209962Smm 2528209962Smm (void) nvlist_lookup_string(props, 2529209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2530209962Smm spa = spa_add(pool, altroot); 2531209962Smm 2532209962Smm spa->spa_load_verbatim = B_TRUE; 2533209962Smm 2534209962Smm VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 2535209962Smm 2536209962Smm if (props != NULL) 2537209962Smm spa_configfile_set(spa, props, B_FALSE); 2538209962Smm 2539209962Smm spa_config_sync(spa, B_FALSE, B_TRUE); 2540209962Smm 2541209962Smm mutex_exit(&spa_namespace_lock); 2542209962Smm 2543209962Smm return (0); 2544185029Spjd} 2545185029Spjd 2546209962Smm/* 2547209962Smm * Import a non-root pool into the system. 2548209962Smm */ 2549185029Spjdint 2550209962Smmspa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2551185029Spjd{ 2552209962Smm spa_t *spa; 2553209962Smm char *altroot = NULL; 2554209962Smm int error; 2555209962Smm nvlist_t *nvroot; 2556209962Smm nvlist_t **spares, **l2cache; 2557209962Smm uint_t nspares, nl2cache; 2558209962Smm 2559209962Smm /* 2560209962Smm * If a pool with this name exists, return failure. 2561209962Smm */ 2562209962Smm mutex_enter(&spa_namespace_lock); 2563209962Smm if ((spa = spa_lookup(pool)) != NULL) { 2564209962Smm mutex_exit(&spa_namespace_lock); 2565209962Smm return (EEXIST); 2566209962Smm } 2567209962Smm 2568209962Smm /* 2569209962Smm * Create and initialize the spa structure. 2570209962Smm */ 2571209962Smm (void) nvlist_lookup_string(props, 2572209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2573209962Smm spa = spa_add(pool, altroot); 2574209962Smm spa_activate(spa, spa_mode_global); 2575209962Smm 2576209962Smm /* 2577209962Smm * Don't start async tasks until we know everything is healthy. 2578209962Smm */ 2579209962Smm spa_async_suspend(spa); 2580209962Smm 2581209962Smm /* 2582209962Smm * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 2583209962Smm * because the user-supplied config is actually the one to trust when 2584209962Smm * doing an import. 2585209962Smm */ 2586209962Smm error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 2587209962Smm 2588209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2589209962Smm /* 2590209962Smm * Toss any existing sparelist, as it doesn't have any validity 2591209962Smm * anymore, and conflicts with spa_has_spare(). 2592209962Smm */ 2593209962Smm if (spa->spa_spares.sav_config) { 2594209962Smm nvlist_free(spa->spa_spares.sav_config); 2595209962Smm spa->spa_spares.sav_config = NULL; 2596209962Smm spa_load_spares(spa); 2597209962Smm } 2598209962Smm if (spa->spa_l2cache.sav_config) { 2599209962Smm nvlist_free(spa->spa_l2cache.sav_config); 2600209962Smm spa->spa_l2cache.sav_config = NULL; 2601209962Smm spa_load_l2cache(spa); 2602209962Smm } 2603209962Smm 2604209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2605209962Smm &nvroot) == 0); 2606209962Smm if (error == 0) 2607209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 2608209962Smm VDEV_ALLOC_SPARE); 2609209962Smm if (error == 0) 2610209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 2611209962Smm VDEV_ALLOC_L2CACHE); 2612209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 2613209962Smm 2614209962Smm if (props != NULL) 2615209962Smm spa_configfile_set(spa, props, B_FALSE); 2616209962Smm 2617209962Smm if (error != 0 || (props && spa_writeable(spa) && 2618209962Smm (error = spa_prop_set(spa, props)))) { 2619209962Smm spa_unload(spa); 2620209962Smm spa_deactivate(spa); 2621209962Smm spa_remove(spa); 2622209962Smm mutex_exit(&spa_namespace_lock); 2623209962Smm return (error); 2624209962Smm } 2625209962Smm 2626209962Smm spa_async_resume(spa); 2627209962Smm 2628209962Smm /* 2629209962Smm * Override any spares and level 2 cache devices as specified by 2630209962Smm * the user, as these may have correct device names/devids, etc. 2631209962Smm */ 2632209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2633209962Smm &spares, &nspares) == 0) { 2634209962Smm if (spa->spa_spares.sav_config) 2635209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2636209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2637209962Smm else 2638209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2639209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 2640209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2641209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2642209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2643209962Smm spa_load_spares(spa); 2644209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 2645209962Smm spa->spa_spares.sav_sync = B_TRUE; 2646209962Smm } 2647209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2648209962Smm &l2cache, &nl2cache) == 0) { 2649209962Smm if (spa->spa_l2cache.sav_config) 2650209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2651209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2652209962Smm else 2653209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2654209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 2655209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2656209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2657209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2658209962Smm spa_load_l2cache(spa); 2659209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 2660209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 2661209962Smm } 2662209962Smm 2663209962Smm if (spa_writeable(spa)) { 2664209962Smm /* 2665209962Smm * Update the config cache to include the newly-imported pool. 2666209962Smm */ 2667209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2668209962Smm } 2669209962Smm 2670209962Smm mutex_exit(&spa_namespace_lock); 2671209962Smm 2672209962Smm return (0); 2673185029Spjd} 2674185029Spjd 2675185029Spjd/* 2676168404Spjd * This (illegal) pool name is used when temporarily importing a spa_t in order 2677168404Spjd * to get the vdev stats associated with the imported devices. 2678168404Spjd */ 2679168404Spjd#define TRYIMPORT_NAME "$import" 2680168404Spjd 2681168404Spjdnvlist_t * 2682168404Spjdspa_tryimport(nvlist_t *tryconfig) 2683168404Spjd{ 2684168404Spjd nvlist_t *config = NULL; 2685168404Spjd char *poolname; 2686168404Spjd spa_t *spa; 2687168404Spjd uint64_t state; 2688208443Smm int error; 2689168404Spjd 2690168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2691168404Spjd return (NULL); 2692168404Spjd 2693168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2694168404Spjd return (NULL); 2695168404Spjd 2696168404Spjd /* 2697168404Spjd * Create and initialize the spa structure. 2698168404Spjd */ 2699168404Spjd mutex_enter(&spa_namespace_lock); 2700168404Spjd spa = spa_add(TRYIMPORT_NAME, NULL); 2701209962Smm spa_activate(spa, FREAD); 2702168404Spjd 2703168404Spjd /* 2704168404Spjd * Pass off the heavy lifting to spa_load(). 2705168404Spjd * Pass TRUE for mosconfig because the user-supplied config 2706168404Spjd * is actually the one to trust when doing an import. 2707168404Spjd */ 2708208443Smm error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 2709168404Spjd 2710168404Spjd /* 2711168404Spjd * If 'tryconfig' was at least parsable, return the current config. 2712168404Spjd */ 2713168404Spjd if (spa->spa_root_vdev != NULL) { 2714168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2715168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2716168404Spjd poolname) == 0); 2717168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2718168404Spjd state) == 0); 2719168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2720168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 2721168404Spjd 2722168404Spjd /* 2723185029Spjd * If the bootfs property exists on this pool then we 2724185029Spjd * copy it out so that external consumers can tell which 2725185029Spjd * pools are bootable. 2726168404Spjd */ 2727208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 2728185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2729185029Spjd 2730185029Spjd /* 2731185029Spjd * We have to play games with the name since the 2732185029Spjd * pool was opened as TRYIMPORT_NAME. 2733185029Spjd */ 2734185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 2735185029Spjd spa->spa_bootfs, tmpname) == 0) { 2736185029Spjd char *cp; 2737185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2738185029Spjd 2739185029Spjd cp = strchr(tmpname, '/'); 2740185029Spjd if (cp == NULL) { 2741185029Spjd (void) strlcpy(dsname, tmpname, 2742185029Spjd MAXPATHLEN); 2743185029Spjd } else { 2744185029Spjd (void) snprintf(dsname, MAXPATHLEN, 2745185029Spjd "%s/%s", poolname, ++cp); 2746185029Spjd } 2747185029Spjd VERIFY(nvlist_add_string(config, 2748185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2749185029Spjd kmem_free(dsname, MAXPATHLEN); 2750185029Spjd } 2751185029Spjd kmem_free(tmpname, MAXPATHLEN); 2752185029Spjd } 2753185029Spjd 2754185029Spjd /* 2755185029Spjd * Add the list of hot spares and level 2 cache devices. 2756185029Spjd */ 2757209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2758168404Spjd spa_add_spares(spa, config); 2759185029Spjd spa_add_l2cache(spa, config); 2760209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 2761168404Spjd } 2762168404Spjd 2763168404Spjd spa_unload(spa); 2764168404Spjd spa_deactivate(spa); 2765168404Spjd spa_remove(spa); 2766168404Spjd mutex_exit(&spa_namespace_lock); 2767168404Spjd 2768168404Spjd return (config); 2769168404Spjd} 2770168404Spjd 2771168404Spjd/* 2772168404Spjd * Pool export/destroy 2773168404Spjd * 2774168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 2775168404Spjd * is no more pending I/O and any references to the pool are gone. Then, we 2776168404Spjd * update the pool state and sync all the labels to disk, removing the 2777207670Smm * configuration from the cache afterwards. If the 'hardforce' flag is set, then 2778207670Smm * we don't sync the labels or remove the configuration cache. 2779168404Spjd */ 2780168404Spjdstatic int 2781185029Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 2782207670Smm boolean_t force, boolean_t hardforce) 2783168404Spjd{ 2784168404Spjd spa_t *spa; 2785168404Spjd 2786168404Spjd if (oldconfig) 2787168404Spjd *oldconfig = NULL; 2788168404Spjd 2789209962Smm if (!(spa_mode_global & FWRITE)) 2790168404Spjd return (EROFS); 2791168404Spjd 2792168404Spjd mutex_enter(&spa_namespace_lock); 2793168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 2794168404Spjd mutex_exit(&spa_namespace_lock); 2795168404Spjd return (ENOENT); 2796168404Spjd } 2797168404Spjd 2798168404Spjd /* 2799168404Spjd * Put a hold on the pool, drop the namespace lock, stop async tasks, 2800168404Spjd * reacquire the namespace lock, and see if we can export. 2801168404Spjd */ 2802168404Spjd spa_open_ref(spa, FTAG); 2803168404Spjd mutex_exit(&spa_namespace_lock); 2804168404Spjd spa_async_suspend(spa); 2805168404Spjd mutex_enter(&spa_namespace_lock); 2806168404Spjd spa_close(spa, FTAG); 2807168404Spjd 2808168404Spjd /* 2809168404Spjd * The pool will be in core if it's openable, 2810168404Spjd * in which case we can modify its state. 2811168404Spjd */ 2812168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2813168404Spjd /* 2814168404Spjd * Objsets may be open only because they're dirty, so we 2815168404Spjd * have to force it to sync before checking spa_refcnt. 2816168404Spjd */ 2817168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 2818168404Spjd 2819168404Spjd /* 2820168404Spjd * A pool cannot be exported or destroyed if there are active 2821168404Spjd * references. If we are resetting a pool, allow references by 2822168404Spjd * fault injection handlers. 2823168404Spjd */ 2824168404Spjd if (!spa_refcount_zero(spa) || 2825168404Spjd (spa->spa_inject_ref != 0 && 2826168404Spjd new_state != POOL_STATE_UNINITIALIZED)) { 2827168404Spjd spa_async_resume(spa); 2828168404Spjd mutex_exit(&spa_namespace_lock); 2829168404Spjd return (EBUSY); 2830168404Spjd } 2831168404Spjd 2832185029Spjd /* 2833185029Spjd * A pool cannot be exported if it has an active shared spare. 2834185029Spjd * This is to prevent other pools stealing the active spare 2835185029Spjd * from an exported pool. At user's own will, such pool can 2836185029Spjd * be forcedly exported. 2837185029Spjd */ 2838185029Spjd if (!force && new_state == POOL_STATE_EXPORTED && 2839185029Spjd spa_has_active_shared_spare(spa)) { 2840185029Spjd spa_async_resume(spa); 2841185029Spjd mutex_exit(&spa_namespace_lock); 2842185029Spjd return (EXDEV); 2843185029Spjd } 2844168404Spjd 2845168404Spjd /* 2846168404Spjd * We want this to be reflected on every label, 2847168404Spjd * so mark them all dirty. spa_unload() will do the 2848168404Spjd * final sync that pushes these changes out. 2849168404Spjd */ 2850207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 2851185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2852168404Spjd spa->spa_state = new_state; 2853168404Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 2854168404Spjd vdev_config_dirty(spa->spa_root_vdev); 2855185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2856168404Spjd } 2857168404Spjd } 2858168404Spjd 2859185029Spjd spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 2860185029Spjd 2861168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2862168404Spjd spa_unload(spa); 2863168404Spjd spa_deactivate(spa); 2864168404Spjd } 2865168404Spjd 2866168404Spjd if (oldconfig && spa->spa_config) 2867168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 2868168404Spjd 2869168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 2870207670Smm if (!hardforce) 2871207670Smm spa_config_sync(spa, B_TRUE, B_TRUE); 2872168404Spjd spa_remove(spa); 2873168404Spjd } 2874168404Spjd mutex_exit(&spa_namespace_lock); 2875168404Spjd 2876168404Spjd return (0); 2877168404Spjd} 2878168404Spjd 2879168404Spjd/* 2880168404Spjd * Destroy a storage pool. 2881168404Spjd */ 2882168404Spjdint 2883168404Spjdspa_destroy(char *pool) 2884168404Spjd{ 2885207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 2886207670Smm B_FALSE, B_FALSE)); 2887168404Spjd} 2888168404Spjd 2889168404Spjd/* 2890168404Spjd * Export a storage pool. 2891168404Spjd */ 2892168404Spjdint 2893207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 2894207670Smm boolean_t hardforce) 2895168404Spjd{ 2896207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 2897207670Smm force, hardforce)); 2898168404Spjd} 2899168404Spjd 2900168404Spjd/* 2901168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 2902168404Spjd * from the namespace in any way. 2903168404Spjd */ 2904168404Spjdint 2905168404Spjdspa_reset(char *pool) 2906168404Spjd{ 2907185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 2908207670Smm B_FALSE, B_FALSE)); 2909168404Spjd} 2910168404Spjd 2911168404Spjd/* 2912168404Spjd * ========================================================================== 2913168404Spjd * Device manipulation 2914168404Spjd * ========================================================================== 2915168404Spjd */ 2916168404Spjd 2917168404Spjd/* 2918185029Spjd * Add a device to a storage pool. 2919168404Spjd */ 2920168404Spjdint 2921168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2922168404Spjd{ 2923168404Spjd uint64_t txg; 2924209962Smm int error; 2925168404Spjd vdev_t *rvd = spa->spa_root_vdev; 2926168404Spjd vdev_t *vd, *tvd; 2927185029Spjd nvlist_t **spares, **l2cache; 2928185029Spjd uint_t nspares, nl2cache; 2929168404Spjd 2930168404Spjd txg = spa_vdev_enter(spa); 2931168404Spjd 2932168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2933168404Spjd VDEV_ALLOC_ADD)) != 0) 2934168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 2935168404Spjd 2936185029Spjd spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 2937168404Spjd 2938185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2939185029Spjd &nspares) != 0) 2940168404Spjd nspares = 0; 2941168404Spjd 2942185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2943185029Spjd &nl2cache) != 0) 2944185029Spjd nl2cache = 0; 2945185029Spjd 2946185029Spjd if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 2947168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2948168404Spjd 2949185029Spjd if (vd->vdev_children != 0 && 2950185029Spjd (error = vdev_create(vd, txg, B_FALSE)) != 0) 2951185029Spjd return (spa_vdev_exit(spa, vd, txg, error)); 2952168404Spjd 2953168404Spjd /* 2954185029Spjd * We must validate the spares and l2cache devices after checking the 2955185029Spjd * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2956168404Spjd */ 2957185029Spjd if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 2958168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 2959168404Spjd 2960168404Spjd /* 2961168404Spjd * Transfer each new top-level vdev from vd to rvd. 2962168404Spjd */ 2963209962Smm for (int c = 0; c < vd->vdev_children; c++) { 2964168404Spjd tvd = vd->vdev_child[c]; 2965168404Spjd vdev_remove_child(vd, tvd); 2966168404Spjd tvd->vdev_id = rvd->vdev_children; 2967168404Spjd vdev_add_child(rvd, tvd); 2968168404Spjd vdev_config_dirty(tvd); 2969168404Spjd } 2970168404Spjd 2971168404Spjd if (nspares != 0) { 2972185029Spjd spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 2973185029Spjd ZPOOL_CONFIG_SPARES); 2974168404Spjd spa_load_spares(spa); 2975185029Spjd spa->spa_spares.sav_sync = B_TRUE; 2976168404Spjd } 2977168404Spjd 2978185029Spjd if (nl2cache != 0) { 2979185029Spjd spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 2980185029Spjd ZPOOL_CONFIG_L2CACHE); 2981185029Spjd spa_load_l2cache(spa); 2982185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2983185029Spjd } 2984185029Spjd 2985168404Spjd /* 2986168404Spjd * We have to be careful when adding new vdevs to an existing pool. 2987168404Spjd * If other threads start allocating from these vdevs before we 2988168404Spjd * sync the config cache, and we lose power, then upon reboot we may 2989168404Spjd * fail to open the pool because there are DVAs that the config cache 2990168404Spjd * can't translate. Therefore, we first add the vdevs without 2991168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2992168404Spjd * and then let spa_config_update() initialize the new metaslabs. 2993168404Spjd * 2994168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 2995168404Spjd * if we lose power at any point in this sequence, the remaining 2996168404Spjd * steps will be completed the next time we load the pool. 2997168404Spjd */ 2998168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 2999168404Spjd 3000168404Spjd mutex_enter(&spa_namespace_lock); 3001168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3002168404Spjd mutex_exit(&spa_namespace_lock); 3003168404Spjd 3004168404Spjd return (0); 3005168404Spjd} 3006168404Spjd 3007168404Spjd/* 3008168404Spjd * Attach a device to a mirror. The arguments are the path to any device 3009168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 3010168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 3011168404Spjd * 3012168404Spjd * If 'replacing' is specified, the new device is intended to replace the 3013168404Spjd * existing device; in this case the two devices are made into their own 3014185029Spjd * mirror using the 'replacing' vdev, which is functionally identical to 3015168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 3016168404Spjd * extra rules: you can't attach to it after it's been created, and upon 3017168404Spjd * completion of resilvering, the first disk (the one being replaced) 3018168404Spjd * is automatically detached. 3019168404Spjd */ 3020168404Spjdint 3021168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3022168404Spjd{ 3023168404Spjd uint64_t txg, open_txg; 3024168404Spjd vdev_t *rvd = spa->spa_root_vdev; 3025168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3026168404Spjd vdev_ops_t *pvops; 3027185029Spjd dmu_tx_t *tx; 3028185029Spjd char *oldvdpath, *newvdpath; 3029185029Spjd int newvd_isspare; 3030185029Spjd int error; 3031168404Spjd 3032168404Spjd txg = spa_vdev_enter(spa); 3033168404Spjd 3034185029Spjd oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3035168404Spjd 3036168404Spjd if (oldvd == NULL) 3037168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3038168404Spjd 3039168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 3040168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3041168404Spjd 3042168404Spjd pvd = oldvd->vdev_parent; 3043168404Spjd 3044168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3045185029Spjd VDEV_ALLOC_ADD)) != 0) 3046185029Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3047185029Spjd 3048185029Spjd if (newrootvd->vdev_children != 1) 3049168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3050168404Spjd 3051168404Spjd newvd = newrootvd->vdev_child[0]; 3052168404Spjd 3053168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 3054168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3055168404Spjd 3056168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3057168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 3058168404Spjd 3059185029Spjd /* 3060185029Spjd * Spares can't replace logs 3061185029Spjd */ 3062185029Spjd if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3063185029Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3064185029Spjd 3065168404Spjd if (!replacing) { 3066168404Spjd /* 3067168404Spjd * For attach, the only allowable parent is a mirror or the root 3068168404Spjd * vdev. 3069168404Spjd */ 3070168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 3071168404Spjd pvd->vdev_ops != &vdev_root_ops) 3072168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3073168404Spjd 3074168404Spjd pvops = &vdev_mirror_ops; 3075168404Spjd } else { 3076168404Spjd /* 3077168404Spjd * Active hot spares can only be replaced by inactive hot 3078168404Spjd * spares. 3079168404Spjd */ 3080168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 3081168404Spjd pvd->vdev_child[1] == oldvd && 3082168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 3083168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3084168404Spjd 3085168404Spjd /* 3086168404Spjd * If the source is a hot spare, and the parent isn't already a 3087168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 3088168404Spjd * want to create a replacing vdev. The user is not allowed to 3089168404Spjd * attach to a spared vdev child unless the 'isspare' state is 3090168404Spjd * the same (spare replaces spare, non-spare replaces 3091168404Spjd * non-spare). 3092168404Spjd */ 3093168404Spjd if (pvd->vdev_ops == &vdev_replacing_ops) 3094168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3095168404Spjd else if (pvd->vdev_ops == &vdev_spare_ops && 3096168404Spjd newvd->vdev_isspare != oldvd->vdev_isspare) 3097168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3098168404Spjd else if (pvd->vdev_ops != &vdev_spare_ops && 3099168404Spjd newvd->vdev_isspare) 3100168404Spjd pvops = &vdev_spare_ops; 3101168404Spjd else 3102168404Spjd pvops = &vdev_replacing_ops; 3103168404Spjd } 3104168404Spjd 3105168404Spjd /* 3106168404Spjd * Compare the new device size with the replaceable/attachable 3107168404Spjd * device size. 3108168404Spjd */ 3109168404Spjd if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 3110168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3111168404Spjd 3112168404Spjd /* 3113168404Spjd * The new device cannot have a higher alignment requirement 3114168404Spjd * than the top-level vdev. 3115168404Spjd */ 3116168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3117168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3118168404Spjd 3119168404Spjd /* 3120168404Spjd * If this is an in-place replacement, update oldvd's path and devid 3121168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 3122168404Spjd */ 3123168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3124168404Spjd spa_strfree(oldvd->vdev_path); 3125168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3126168404Spjd KM_SLEEP); 3127168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 3128168404Spjd newvd->vdev_path, "old"); 3129168404Spjd if (oldvd->vdev_devid != NULL) { 3130168404Spjd spa_strfree(oldvd->vdev_devid); 3131168404Spjd oldvd->vdev_devid = NULL; 3132168404Spjd } 3133168404Spjd } 3134168404Spjd 3135168404Spjd /* 3136168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 3137168404Spjd * mirror/replacing/spare vdev above oldvd. 3138168404Spjd */ 3139168404Spjd if (pvd->vdev_ops != pvops) 3140168404Spjd pvd = vdev_add_parent(oldvd, pvops); 3141168404Spjd 3142168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 3143168404Spjd ASSERT(pvd->vdev_ops == pvops); 3144168404Spjd ASSERT(oldvd->vdev_parent == pvd); 3145168404Spjd 3146168404Spjd /* 3147168404Spjd * Extract the new device from its root and add it to pvd. 3148168404Spjd */ 3149168404Spjd vdev_remove_child(newrootvd, newvd); 3150168404Spjd newvd->vdev_id = pvd->vdev_children; 3151168404Spjd vdev_add_child(pvd, newvd); 3152168404Spjd 3153168404Spjd /* 3154168404Spjd * If newvd is smaller than oldvd, but larger than its rsize, 3155168404Spjd * the addition of newvd may have decreased our parent's asize. 3156168404Spjd */ 3157168404Spjd pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 3158168404Spjd 3159168404Spjd tvd = newvd->vdev_top; 3160168404Spjd ASSERT(pvd->vdev_top == tvd); 3161168404Spjd ASSERT(tvd->vdev_parent == rvd); 3162168404Spjd 3163168404Spjd vdev_config_dirty(tvd); 3164168404Spjd 3165168404Spjd /* 3166168404Spjd * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3167168404Spjd * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3168168404Spjd */ 3169168404Spjd open_txg = txg + TXG_CONCURRENT_STATES - 1; 3170168404Spjd 3171209962Smm vdev_dtl_dirty(newvd, DTL_MISSING, 3172209962Smm TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3173168404Spjd 3174209962Smm if (newvd->vdev_isspare) { 3175168404Spjd spa_spare_activate(newvd); 3176209962Smm spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3177209962Smm } 3178209962Smm 3179185029Spjd oldvdpath = spa_strdup(oldvd->vdev_path); 3180185029Spjd newvdpath = spa_strdup(newvd->vdev_path); 3181185029Spjd newvd_isspare = newvd->vdev_isspare; 3182168404Spjd 3183168404Spjd /* 3184168404Spjd * Mark newvd's DTL dirty in this txg. 3185168404Spjd */ 3186168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 3187168404Spjd 3188168404Spjd (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3189168404Spjd 3190185029Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 3191185029Spjd if (dmu_tx_assign(tx, TXG_WAIT) == 0) { 3192185029Spjd spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, 3193185029Spjd CRED(), "%s vdev=%s %s vdev=%s", 3194185029Spjd replacing && newvd_isspare ? "spare in" : 3195185029Spjd replacing ? "replace" : "attach", newvdpath, 3196185029Spjd replacing ? "for" : "to", oldvdpath); 3197185029Spjd dmu_tx_commit(tx); 3198185029Spjd } else { 3199185029Spjd dmu_tx_abort(tx); 3200185029Spjd } 3201185029Spjd 3202185029Spjd spa_strfree(oldvdpath); 3203185029Spjd spa_strfree(newvdpath); 3204185029Spjd 3205168404Spjd /* 3206168404Spjd * Kick off a resilver to update newvd. 3207168404Spjd */ 3208185029Spjd VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3209168404Spjd 3210168404Spjd return (0); 3211168404Spjd} 3212168404Spjd 3213168404Spjd/* 3214168404Spjd * Detach a device from a mirror or replacing vdev. 3215168404Spjd * If 'replace_done' is specified, only detach if the parent 3216168404Spjd * is a replacing vdev. 3217168404Spjd */ 3218168404Spjdint 3219209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3220168404Spjd{ 3221168404Spjd uint64_t txg; 3222209962Smm int error; 3223168404Spjd vdev_t *rvd = spa->spa_root_vdev; 3224168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 3225168404Spjd boolean_t unspare = B_FALSE; 3226168404Spjd uint64_t unspare_guid; 3227185029Spjd size_t len; 3228168404Spjd 3229168404Spjd txg = spa_vdev_enter(spa); 3230168404Spjd 3231185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3232168404Spjd 3233168404Spjd if (vd == NULL) 3234168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3235168404Spjd 3236168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 3237168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3238168404Spjd 3239168404Spjd pvd = vd->vdev_parent; 3240168404Spjd 3241168404Spjd /* 3242209962Smm * If the parent/child relationship is not as expected, don't do it. 3243209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3244209962Smm * vdev that's replacing B with C. The user's intent in replacing 3245209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 3246209962Smm * the replace by detaching C, the expected behavior is to end up 3247209962Smm * M(A,B). But suppose that right after deciding to detach C, 3248209962Smm * the replacement of B completes. We would have M(A,C), and then 3249209962Smm * ask to detach C, which would leave us with just A -- not what 3250209962Smm * the user wanted. To prevent this, we make sure that the 3251209962Smm * parent/child relationship hasn't changed -- in this example, 3252209962Smm * that C's parent is still the replacing vdev R. 3253209962Smm */ 3254209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 3255209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3256209962Smm 3257209962Smm /* 3258168404Spjd * If replace_done is specified, only remove this device if it's 3259168404Spjd * the first child of a replacing vdev. For the 'spare' vdev, either 3260168404Spjd * disk can be removed. 3261168404Spjd */ 3262168404Spjd if (replace_done) { 3263168404Spjd if (pvd->vdev_ops == &vdev_replacing_ops) { 3264168404Spjd if (vd->vdev_id != 0) 3265168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3266168404Spjd } else if (pvd->vdev_ops != &vdev_spare_ops) { 3267168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3268168404Spjd } 3269168404Spjd } 3270168404Spjd 3271168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3272185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 3273168404Spjd 3274168404Spjd /* 3275168404Spjd * Only mirror, replacing, and spare vdevs support detach. 3276168404Spjd */ 3277168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 3278168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 3279168404Spjd pvd->vdev_ops != &vdev_spare_ops) 3280168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3281168404Spjd 3282168404Spjd /* 3283209962Smm * If this device has the only valid copy of some data, 3284209962Smm * we cannot safely detach it. 3285168404Spjd */ 3286209962Smm if (vdev_dtl_required(vd)) 3287168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3288168404Spjd 3289209962Smm ASSERT(pvd->vdev_children >= 2); 3290168404Spjd 3291168404Spjd /* 3292185029Spjd * If we are detaching the second disk from a replacing vdev, then 3293185029Spjd * check to see if we changed the original vdev's path to have "/old" 3294185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 3295168404Spjd */ 3296185029Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3297185029Spjd pvd->vdev_child[0]->vdev_path != NULL && 3298185029Spjd pvd->vdev_child[1]->vdev_path != NULL) { 3299185029Spjd ASSERT(pvd->vdev_child[1] == vd); 3300185029Spjd cvd = pvd->vdev_child[0]; 3301185029Spjd len = strlen(vd->vdev_path); 3302185029Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3303185029Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 3304185029Spjd spa_strfree(cvd->vdev_path); 3305185029Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 3306185029Spjd } 3307185029Spjd } 3308168404Spjd 3309168404Spjd /* 3310168404Spjd * If we are detaching the original disk from a spare, then it implies 3311168404Spjd * that the spare should become a real disk, and be removed from the 3312168404Spjd * active spare list for the pool. 3313168404Spjd */ 3314168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 3315209962Smm vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3316168404Spjd unspare = B_TRUE; 3317168404Spjd 3318168404Spjd /* 3319168404Spjd * Erase the disk labels so the disk can be used for other things. 3320168404Spjd * This must be done after all other error cases are handled, 3321168404Spjd * but before we disembowel vd (so we can still do I/O to it). 3322168404Spjd * But if we can't do it, don't treat the error as fatal -- 3323168404Spjd * it may be that the unwritability of the disk is the reason 3324168404Spjd * it's being detached! 3325168404Spjd */ 3326168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3327168404Spjd 3328168404Spjd /* 3329168404Spjd * Remove vd from its parent and compact the parent's children. 3330168404Spjd */ 3331168404Spjd vdev_remove_child(pvd, vd); 3332168404Spjd vdev_compact_children(pvd); 3333168404Spjd 3334168404Spjd /* 3335168404Spjd * Remember one of the remaining children so we can get tvd below. 3336168404Spjd */ 3337168404Spjd cvd = pvd->vdev_child[0]; 3338168404Spjd 3339168404Spjd /* 3340168404Spjd * If we need to remove the remaining child from the list of hot spares, 3341209962Smm * do it now, marking the vdev as no longer a spare in the process. 3342209962Smm * We must do this before vdev_remove_parent(), because that can 3343209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 3344209962Smm * reason, we must remove the spare now, in the same txg as the detach; 3345209962Smm * otherwise someone could attach a new sibling, change the GUID, and 3346209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3347168404Spjd */ 3348168404Spjd if (unspare) { 3349168404Spjd ASSERT(cvd->vdev_isspare); 3350168404Spjd spa_spare_remove(cvd); 3351168404Spjd unspare_guid = cvd->vdev_guid; 3352209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3353168404Spjd } 3354168404Spjd 3355168404Spjd /* 3356168404Spjd * If the parent mirror/replacing vdev only has one child, 3357168404Spjd * the parent is no longer needed. Remove it from the tree. 3358168404Spjd */ 3359168404Spjd if (pvd->vdev_children == 1) 3360168404Spjd vdev_remove_parent(cvd); 3361168404Spjd 3362168404Spjd /* 3363168404Spjd * We don't set tvd until now because the parent we just removed 3364168404Spjd * may have been the previous top-level vdev. 3365168404Spjd */ 3366168404Spjd tvd = cvd->vdev_top; 3367168404Spjd ASSERT(tvd->vdev_parent == rvd); 3368168404Spjd 3369168404Spjd /* 3370168404Spjd * Reevaluate the parent vdev state. 3371168404Spjd */ 3372185029Spjd vdev_propagate_state(cvd); 3373168404Spjd 3374168404Spjd /* 3375168404Spjd * If the device we just detached was smaller than the others, it may be 3376168404Spjd * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 3377168404Spjd * can't fail because the existing metaslabs are already in core, so 3378168404Spjd * there's nothing to read from disk. 3379168404Spjd */ 3380168404Spjd VERIFY(vdev_metaslab_init(tvd, txg) == 0); 3381168404Spjd 3382168404Spjd vdev_config_dirty(tvd); 3383168404Spjd 3384168404Spjd /* 3385168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3386168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 3387168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 3388168404Spjd * prevent vd from being accessed after it's freed. 3389168404Spjd */ 3390209962Smm for (int t = 0; t < TXG_SIZE; t++) 3391168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3392168404Spjd vd->vdev_detached = B_TRUE; 3393168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 3394168404Spjd 3395185029Spjd spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3396185029Spjd 3397168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 3398168404Spjd 3399168404Spjd /* 3400168404Spjd * If this was the removal of the original device in a hot spare vdev, 3401168404Spjd * then we want to go through and remove the device from the hot spare 3402168404Spjd * list of every other pool. 3403168404Spjd */ 3404168404Spjd if (unspare) { 3405209962Smm spa_t *myspa = spa; 3406168404Spjd spa = NULL; 3407168404Spjd mutex_enter(&spa_namespace_lock); 3408168404Spjd while ((spa = spa_next(spa)) != NULL) { 3409168404Spjd if (spa->spa_state != POOL_STATE_ACTIVE) 3410168404Spjd continue; 3411209962Smm if (spa == myspa) 3412209962Smm continue; 3413185029Spjd spa_open_ref(spa, FTAG); 3414185029Spjd mutex_exit(&spa_namespace_lock); 3415168404Spjd (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3416185029Spjd mutex_enter(&spa_namespace_lock); 3417185029Spjd spa_close(spa, FTAG); 3418168404Spjd } 3419168404Spjd mutex_exit(&spa_namespace_lock); 3420168404Spjd } 3421168404Spjd 3422168404Spjd return (error); 3423168404Spjd} 3424168404Spjd 3425185029Spjdstatic nvlist_t * 3426185029Spjdspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 3427185029Spjd{ 3428185029Spjd for (int i = 0; i < count; i++) { 3429185029Spjd uint64_t guid; 3430185029Spjd 3431185029Spjd VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 3432185029Spjd &guid) == 0); 3433185029Spjd 3434185029Spjd if (guid == target_guid) 3435185029Spjd return (nvpp[i]); 3436185029Spjd } 3437185029Spjd 3438185029Spjd return (NULL); 3439185029Spjd} 3440185029Spjd 3441185029Spjdstatic void 3442185029Spjdspa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 3443185029Spjd nvlist_t *dev_to_remove) 3444185029Spjd{ 3445185029Spjd nvlist_t **newdev = NULL; 3446185029Spjd 3447185029Spjd if (count > 1) 3448185029Spjd newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 3449185029Spjd 3450185029Spjd for (int i = 0, j = 0; i < count; i++) { 3451185029Spjd if (dev[i] == dev_to_remove) 3452185029Spjd continue; 3453185029Spjd VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 3454185029Spjd } 3455185029Spjd 3456185029Spjd VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 3457185029Spjd VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 3458185029Spjd 3459185029Spjd for (int i = 0; i < count - 1; i++) 3460185029Spjd nvlist_free(newdev[i]); 3461185029Spjd 3462185029Spjd if (count > 1) 3463185029Spjd kmem_free(newdev, (count - 1) * sizeof (void *)); 3464185029Spjd} 3465185029Spjd 3466168404Spjd/* 3467168404Spjd * Remove a device from the pool. Currently, this supports removing only hot 3468185029Spjd * spares and level 2 ARC devices. 3469168404Spjd */ 3470168404Spjdint 3471168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3472168404Spjd{ 3473168404Spjd vdev_t *vd; 3474185029Spjd nvlist_t **spares, **l2cache, *nv; 3475185029Spjd uint_t nspares, nl2cache; 3476209962Smm uint64_t txg = 0; 3477185029Spjd int error = 0; 3478209962Smm boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 3479168404Spjd 3480209962Smm if (!locked) 3481209962Smm txg = spa_vdev_enter(spa); 3482168404Spjd 3483185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3484168404Spjd 3485185029Spjd if (spa->spa_spares.sav_vdevs != NULL && 3486185029Spjd nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3487185029Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 3488185029Spjd (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 3489185029Spjd /* 3490185029Spjd * Only remove the hot spare if it's not currently in use 3491185029Spjd * in this pool. 3492185029Spjd */ 3493185029Spjd if (vd == NULL || unspare) { 3494185029Spjd spa_vdev_remove_aux(spa->spa_spares.sav_config, 3495185029Spjd ZPOOL_CONFIG_SPARES, spares, nspares, nv); 3496185029Spjd spa_load_spares(spa); 3497185029Spjd spa->spa_spares.sav_sync = B_TRUE; 3498185029Spjd } else { 3499185029Spjd error = EBUSY; 3500168404Spjd } 3501185029Spjd } else if (spa->spa_l2cache.sav_vdevs != NULL && 3502185029Spjd nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3503185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 3504185029Spjd (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 3505185029Spjd /* 3506185029Spjd * Cache devices can always be removed. 3507185029Spjd */ 3508185029Spjd spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 3509185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 3510185029Spjd spa_load_l2cache(spa); 3511185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3512185029Spjd } else if (vd != NULL) { 3513185029Spjd /* 3514185029Spjd * Normal vdevs cannot be removed (yet). 3515185029Spjd */ 3516185029Spjd error = ENOTSUP; 3517168404Spjd } else { 3518185029Spjd /* 3519185029Spjd * There is no vdev of any kind with the specified guid. 3520185029Spjd */ 3521185029Spjd error = ENOENT; 3522168404Spjd } 3523168404Spjd 3524209962Smm if (!locked) 3525209962Smm return (spa_vdev_exit(spa, NULL, txg, error)); 3526209962Smm 3527209962Smm return (error); 3528168404Spjd} 3529168404Spjd 3530168404Spjd/* 3531185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 3532185029Spjd * current spared, so we can detach it. 3533168404Spjd */ 3534168404Spjdstatic vdev_t * 3535185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 3536168404Spjd{ 3537168404Spjd vdev_t *newvd, *oldvd; 3538168404Spjd int c; 3539168404Spjd 3540168404Spjd for (c = 0; c < vd->vdev_children; c++) { 3541185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 3542168404Spjd if (oldvd != NULL) 3543168404Spjd return (oldvd); 3544168404Spjd } 3545168404Spjd 3546185029Spjd /* 3547185029Spjd * Check for a completed replacement. 3548185029Spjd */ 3549168404Spjd if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 3550168404Spjd oldvd = vd->vdev_child[0]; 3551168404Spjd newvd = vd->vdev_child[1]; 3552168404Spjd 3553209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 3554209962Smm !vdev_dtl_required(oldvd)) 3555168404Spjd return (oldvd); 3556168404Spjd } 3557168404Spjd 3558185029Spjd /* 3559185029Spjd * Check for a completed resilver with the 'unspare' flag set. 3560185029Spjd */ 3561185029Spjd if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 3562185029Spjd newvd = vd->vdev_child[0]; 3563185029Spjd oldvd = vd->vdev_child[1]; 3564185029Spjd 3565185029Spjd if (newvd->vdev_unspare && 3566209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 3567209962Smm !vdev_dtl_required(oldvd)) { 3568185029Spjd newvd->vdev_unspare = 0; 3569185029Spjd return (oldvd); 3570185029Spjd } 3571185029Spjd } 3572185029Spjd 3573168404Spjd return (NULL); 3574168404Spjd} 3575168404Spjd 3576168404Spjdstatic void 3577185029Spjdspa_vdev_resilver_done(spa_t *spa) 3578168404Spjd{ 3579209962Smm vdev_t *vd, *pvd, *ppvd; 3580209962Smm uint64_t guid, sguid, pguid, ppguid; 3581168404Spjd 3582209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3583168404Spjd 3584185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 3585209962Smm pvd = vd->vdev_parent; 3586209962Smm ppvd = pvd->vdev_parent; 3587168404Spjd guid = vd->vdev_guid; 3588209962Smm pguid = pvd->vdev_guid; 3589209962Smm ppguid = ppvd->vdev_guid; 3590209962Smm sguid = 0; 3591168404Spjd /* 3592168404Spjd * If we have just finished replacing a hot spared device, then 3593168404Spjd * we need to detach the parent's first child (the original hot 3594168404Spjd * spare) as well. 3595168404Spjd */ 3596209962Smm if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { 3597168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3598209962Smm ASSERT(ppvd->vdev_children == 2); 3599209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 3600168404Spjd } 3601209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 3602209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 3603168404Spjd return; 3604209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 3605168404Spjd return; 3606209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3607168404Spjd } 3608168404Spjd 3609209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 3610168404Spjd} 3611168404Spjd 3612168404Spjd/* 3613209962Smm * Update the stored path or FRU for this vdev. Dirty the vdev configuration, 3614209962Smm * relying on spa_vdev_enter/exit() to synchronize the labels and cache. 3615168404Spjd */ 3616168404Spjdint 3617209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 3618209962Smm boolean_t ispath) 3619168404Spjd{ 3620185029Spjd vdev_t *vd; 3621168404Spjd uint64_t txg; 3622168404Spjd 3623168404Spjd txg = spa_vdev_enter(spa); 3624168404Spjd 3625209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 3626185029Spjd return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3627168404Spjd 3628168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 3629168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3630168404Spjd 3631209962Smm if (ispath) { 3632209962Smm spa_strfree(vd->vdev_path); 3633209962Smm vd->vdev_path = spa_strdup(value); 3634209962Smm } else { 3635209962Smm if (vd->vdev_fru != NULL) 3636209962Smm spa_strfree(vd->vdev_fru); 3637209962Smm vd->vdev_fru = spa_strdup(value); 3638209962Smm } 3639168404Spjd 3640168404Spjd vdev_config_dirty(vd->vdev_top); 3641168404Spjd 3642168404Spjd return (spa_vdev_exit(spa, NULL, txg, 0)); 3643168404Spjd} 3644168404Spjd 3645209962Smmint 3646209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3647209962Smm{ 3648209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 3649209962Smm} 3650209962Smm 3651209962Smmint 3652209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 3653209962Smm{ 3654209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 3655209962Smm} 3656209962Smm 3657168404Spjd/* 3658168404Spjd * ========================================================================== 3659168404Spjd * SPA Scrubbing 3660168404Spjd * ========================================================================== 3661168404Spjd */ 3662168404Spjd 3663168404Spjdint 3664185029Spjdspa_scrub(spa_t *spa, pool_scrub_type_t type) 3665168404Spjd{ 3666185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3667168404Spjd 3668168404Spjd if ((uint_t)type >= POOL_SCRUB_TYPES) 3669168404Spjd return (ENOTSUP); 3670168404Spjd 3671168404Spjd /* 3672185029Spjd * If a resilver was requested, but there is no DTL on a 3673185029Spjd * writeable leaf device, we have nothing to do. 3674168404Spjd */ 3675185029Spjd if (type == POOL_SCRUB_RESILVER && 3676185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 3677185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3678168404Spjd return (0); 3679168404Spjd } 3680168404Spjd 3681185029Spjd if (type == POOL_SCRUB_EVERYTHING && 3682185029Spjd spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 3683185029Spjd spa->spa_dsl_pool->dp_scrub_isresilver) 3684185029Spjd return (EBUSY); 3685168404Spjd 3686185029Spjd if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 3687185029Spjd return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 3688185029Spjd } else if (type == POOL_SCRUB_NONE) { 3689185029Spjd return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 3690168404Spjd } else { 3691185029Spjd return (EINVAL); 3692168404Spjd } 3693168404Spjd} 3694168404Spjd 3695168404Spjd/* 3696168404Spjd * ========================================================================== 3697168404Spjd * SPA async task processing 3698168404Spjd * ========================================================================== 3699168404Spjd */ 3700168404Spjd 3701168404Spjdstatic void 3702185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 3703168404Spjd{ 3704185029Spjd if (vd->vdev_remove_wanted) { 3705185029Spjd vd->vdev_remove_wanted = 0; 3706185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 3707209962Smm 3708209962Smm /* 3709209962Smm * We want to clear the stats, but we don't want to do a full 3710209962Smm * vdev_clear() as that will cause us to throw away 3711209962Smm * degraded/faulted state as well as attempt to reopen the 3712209962Smm * device, all of which is a waste. 3713209962Smm */ 3714209962Smm vd->vdev_stat.vs_read_errors = 0; 3715209962Smm vd->vdev_stat.vs_write_errors = 0; 3716209962Smm vd->vdev_stat.vs_checksum_errors = 0; 3717209962Smm 3718185029Spjd vdev_state_dirty(vd->vdev_top); 3719185029Spjd } 3720168404Spjd 3721185029Spjd for (int c = 0; c < vd->vdev_children; c++) 3722185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 3723185029Spjd} 3724168404Spjd 3725185029Spjdstatic void 3726185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 3727185029Spjd{ 3728185029Spjd if (vd->vdev_probe_wanted) { 3729185029Spjd vd->vdev_probe_wanted = 0; 3730185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 3731168404Spjd } 3732168404Spjd 3733185029Spjd for (int c = 0; c < vd->vdev_children; c++) 3734185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 3735168404Spjd} 3736168404Spjd 3737168404Spjdstatic void 3738168404Spjdspa_async_thread(void *arg) 3739168404Spjd{ 3740168404Spjd spa_t *spa = arg; 3741168404Spjd int tasks; 3742168404Spjd 3743168404Spjd ASSERT(spa->spa_sync_on); 3744168404Spjd 3745168404Spjd mutex_enter(&spa->spa_async_lock); 3746168404Spjd tasks = spa->spa_async_tasks; 3747168404Spjd spa->spa_async_tasks = 0; 3748168404Spjd mutex_exit(&spa->spa_async_lock); 3749168404Spjd 3750168404Spjd /* 3751168404Spjd * See if the config needs to be updated. 3752168404Spjd */ 3753168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3754168404Spjd mutex_enter(&spa_namespace_lock); 3755168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3756168404Spjd mutex_exit(&spa_namespace_lock); 3757168404Spjd } 3758168404Spjd 3759168404Spjd /* 3760185029Spjd * See if any devices need to be marked REMOVED. 3761168404Spjd */ 3762185029Spjd if (tasks & SPA_ASYNC_REMOVE) { 3763185029Spjd spa_vdev_state_enter(spa); 3764185029Spjd spa_async_remove(spa, spa->spa_root_vdev); 3765185029Spjd for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 3766185029Spjd spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 3767185029Spjd for (int i = 0; i < spa->spa_spares.sav_count; i++) 3768185029Spjd spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 3769185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 3770185029Spjd } 3771168404Spjd 3772168404Spjd /* 3773185029Spjd * See if any devices need to be probed. 3774168404Spjd */ 3775185029Spjd if (tasks & SPA_ASYNC_PROBE) { 3776185029Spjd spa_vdev_state_enter(spa); 3777185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 3778185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 3779185029Spjd } 3780168404Spjd 3781168404Spjd /* 3782185029Spjd * If any devices are done replacing, detach them. 3783168404Spjd */ 3784185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 3785185029Spjd spa_vdev_resilver_done(spa); 3786168404Spjd 3787168404Spjd /* 3788168404Spjd * Kick off a resilver. 3789168404Spjd */ 3790168404Spjd if (tasks & SPA_ASYNC_RESILVER) 3791185029Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 3792168404Spjd 3793168404Spjd /* 3794168404Spjd * Let the world know that we're done. 3795168404Spjd */ 3796168404Spjd mutex_enter(&spa->spa_async_lock); 3797168404Spjd spa->spa_async_thread = NULL; 3798168404Spjd cv_broadcast(&spa->spa_async_cv); 3799168404Spjd mutex_exit(&spa->spa_async_lock); 3800168404Spjd thread_exit(); 3801168404Spjd} 3802168404Spjd 3803168404Spjdvoid 3804168404Spjdspa_async_suspend(spa_t *spa) 3805168404Spjd{ 3806168404Spjd mutex_enter(&spa->spa_async_lock); 3807168404Spjd spa->spa_async_suspended++; 3808168404Spjd while (spa->spa_async_thread != NULL) 3809168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3810168404Spjd mutex_exit(&spa->spa_async_lock); 3811168404Spjd} 3812168404Spjd 3813168404Spjdvoid 3814168404Spjdspa_async_resume(spa_t *spa) 3815168404Spjd{ 3816168404Spjd mutex_enter(&spa->spa_async_lock); 3817168404Spjd ASSERT(spa->spa_async_suspended != 0); 3818168404Spjd spa->spa_async_suspended--; 3819168404Spjd mutex_exit(&spa->spa_async_lock); 3820168404Spjd} 3821168404Spjd 3822168404Spjdstatic void 3823168404Spjdspa_async_dispatch(spa_t *spa) 3824168404Spjd{ 3825168404Spjd mutex_enter(&spa->spa_async_lock); 3826168404Spjd if (spa->spa_async_tasks && !spa->spa_async_suspended && 3827168404Spjd spa->spa_async_thread == NULL && 3828168404Spjd rootdir != NULL && !vn_is_readonly(rootdir)) 3829168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 3830168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3831168404Spjd mutex_exit(&spa->spa_async_lock); 3832168404Spjd} 3833168404Spjd 3834168404Spjdvoid 3835168404Spjdspa_async_request(spa_t *spa, int task) 3836168404Spjd{ 3837168404Spjd mutex_enter(&spa->spa_async_lock); 3838168404Spjd spa->spa_async_tasks |= task; 3839168404Spjd mutex_exit(&spa->spa_async_lock); 3840168404Spjd} 3841168404Spjd 3842168404Spjd/* 3843168404Spjd * ========================================================================== 3844168404Spjd * SPA syncing routines 3845168404Spjd * ========================================================================== 3846168404Spjd */ 3847168404Spjd 3848168404Spjdstatic void 3849168404Spjdspa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3850168404Spjd{ 3851168404Spjd bplist_t *bpl = &spa->spa_sync_bplist; 3852168404Spjd dmu_tx_t *tx; 3853168404Spjd blkptr_t blk; 3854168404Spjd uint64_t itor = 0; 3855168404Spjd zio_t *zio; 3856168404Spjd int error; 3857168404Spjd uint8_t c = 1; 3858168404Spjd 3859185029Spjd zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 3860168404Spjd 3861185029Spjd while (bplist_iterate(bpl, &itor, &blk) == 0) { 3862185029Spjd ASSERT(blk.blk_birth < txg); 3863185029Spjd zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, 3864185029Spjd ZIO_FLAG_MUSTSUCCEED)); 3865185029Spjd } 3866168404Spjd 3867168404Spjd error = zio_wait(zio); 3868168404Spjd ASSERT3U(error, ==, 0); 3869168404Spjd 3870168404Spjd tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3871168404Spjd bplist_vacate(bpl, tx); 3872168404Spjd 3873168404Spjd /* 3874168404Spjd * Pre-dirty the first block so we sync to convergence faster. 3875168404Spjd * (Usually only the first block is needed.) 3876168404Spjd */ 3877168404Spjd dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3878168404Spjd dmu_tx_commit(tx); 3879168404Spjd} 3880168404Spjd 3881168404Spjdstatic void 3882168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3883168404Spjd{ 3884168404Spjd char *packed = NULL; 3885185029Spjd size_t bufsize; 3886168404Spjd size_t nvsize = 0; 3887168404Spjd dmu_buf_t *db; 3888168404Spjd 3889168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3890168404Spjd 3891185029Spjd /* 3892185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 3893185029Spjd * information. This avoids the dbuf_will_dirty() path and 3894185029Spjd * saves us a pre-read to get data we don't actually care about. 3895185029Spjd */ 3896185029Spjd bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 3897185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 3898168404Spjd 3899168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3900168404Spjd KM_SLEEP) == 0); 3901185029Spjd bzero(packed + nvsize, bufsize - nvsize); 3902168404Spjd 3903185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 3904168404Spjd 3905185029Spjd kmem_free(packed, bufsize); 3906168404Spjd 3907168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3908168404Spjd dmu_buf_will_dirty(db, tx); 3909168404Spjd *(uint64_t *)db->db_data = nvsize; 3910168404Spjd dmu_buf_rele(db, FTAG); 3911168404Spjd} 3912168404Spjd 3913168404Spjdstatic void 3914185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 3915185029Spjd const char *config, const char *entry) 3916168404Spjd{ 3917168404Spjd nvlist_t *nvroot; 3918185029Spjd nvlist_t **list; 3919168404Spjd int i; 3920168404Spjd 3921185029Spjd if (!sav->sav_sync) 3922168404Spjd return; 3923168404Spjd 3924168404Spjd /* 3925185029Spjd * Update the MOS nvlist describing the list of available devices. 3926185029Spjd * spa_validate_aux() will have already made sure this nvlist is 3927185029Spjd * valid and the vdevs are labeled appropriately. 3928168404Spjd */ 3929185029Spjd if (sav->sav_object == 0) { 3930185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 3931185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 3932185029Spjd sizeof (uint64_t), tx); 3933168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 3934185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 3935185029Spjd &sav->sav_object, tx) == 0); 3936168404Spjd } 3937168404Spjd 3938168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3939185029Spjd if (sav->sav_count == 0) { 3940185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 3941168404Spjd } else { 3942185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 3943185029Spjd for (i = 0; i < sav->sav_count; i++) 3944185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 3945185029Spjd B_FALSE, B_FALSE, B_TRUE); 3946185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 3947185029Spjd sav->sav_count) == 0); 3948185029Spjd for (i = 0; i < sav->sav_count; i++) 3949185029Spjd nvlist_free(list[i]); 3950185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 3951168404Spjd } 3952168404Spjd 3953185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 3954168404Spjd nvlist_free(nvroot); 3955168404Spjd 3956185029Spjd sav->sav_sync = B_FALSE; 3957168404Spjd} 3958168404Spjd 3959168404Spjdstatic void 3960168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 3961168404Spjd{ 3962168404Spjd nvlist_t *config; 3963168404Spjd 3964185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) 3965168404Spjd return; 3966168404Spjd 3967185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 3968168404Spjd 3969185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 3970185029Spjd dmu_tx_get_txg(tx), B_FALSE); 3971185029Spjd 3972185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 3973185029Spjd 3974168404Spjd if (spa->spa_config_syncing) 3975168404Spjd nvlist_free(spa->spa_config_syncing); 3976168404Spjd spa->spa_config_syncing = config; 3977168404Spjd 3978168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 3979168404Spjd} 3980168404Spjd 3981185029Spjd/* 3982185029Spjd * Set zpool properties. 3983185029Spjd */ 3984168404Spjdstatic void 3985185029Spjdspa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 3986168404Spjd{ 3987168404Spjd spa_t *spa = arg1; 3988185029Spjd objset_t *mos = spa->spa_meta_objset; 3989168404Spjd nvlist_t *nvp = arg2; 3990185029Spjd nvpair_t *elem; 3991185029Spjd uint64_t intval; 3992185029Spjd char *strval; 3993185029Spjd zpool_prop_t prop; 3994185029Spjd const char *propname; 3995185029Spjd zprop_type_t proptype; 3996168404Spjd 3997168404Spjd mutex_enter(&spa->spa_props_lock); 3998168404Spjd 3999185029Spjd elem = NULL; 4000185029Spjd while ((elem = nvlist_next_nvpair(nvp, elem))) { 4001185029Spjd switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 4002185029Spjd case ZPOOL_PROP_VERSION: 4003185029Spjd /* 4004185029Spjd * Only set version for non-zpool-creation cases 4005185029Spjd * (set/import). spa_create() needs special care 4006185029Spjd * for version setting. 4007185029Spjd */ 4008185029Spjd if (tx->tx_txg != TXG_INITIAL) { 4009185029Spjd VERIFY(nvpair_value_uint64(elem, 4010185029Spjd &intval) == 0); 4011185029Spjd ASSERT(intval <= SPA_VERSION); 4012185029Spjd ASSERT(intval >= spa_version(spa)); 4013185029Spjd spa->spa_uberblock.ub_version = intval; 4014185029Spjd vdev_config_dirty(spa->spa_root_vdev); 4015185029Spjd } 4016185029Spjd break; 4017168404Spjd 4018185029Spjd case ZPOOL_PROP_ALTROOT: 4019185029Spjd /* 4020185029Spjd * 'altroot' is a non-persistent property. It should 4021185029Spjd * have been set temporarily at creation or import time. 4022185029Spjd */ 4023185029Spjd ASSERT(spa->spa_root != NULL); 4024185029Spjd break; 4025168404Spjd 4026185029Spjd case ZPOOL_PROP_CACHEFILE: 4027185029Spjd /* 4028209962Smm * 'cachefile' is also a non-persisitent property. 4029185029Spjd */ 4030168404Spjd break; 4031185029Spjd default: 4032185029Spjd /* 4033185029Spjd * Set pool property values in the poolprops mos object. 4034185029Spjd */ 4035185029Spjd if (spa->spa_pool_props_object == 0) { 4036185029Spjd objset_t *mos = spa->spa_meta_objset; 4037185029Spjd 4038185029Spjd VERIFY((spa->spa_pool_props_object = 4039185029Spjd zap_create(mos, DMU_OT_POOL_PROPS, 4040185029Spjd DMU_OT_NONE, 0, tx)) > 0); 4041185029Spjd 4042185029Spjd VERIFY(zap_update(mos, 4043185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 4044185029Spjd 8, 1, &spa->spa_pool_props_object, tx) 4045185029Spjd == 0); 4046185029Spjd } 4047185029Spjd 4048185029Spjd /* normalize the property name */ 4049185029Spjd propname = zpool_prop_to_name(prop); 4050185029Spjd proptype = zpool_prop_get_type(prop); 4051185029Spjd 4052185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 4053185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 4054185029Spjd VERIFY(nvpair_value_string(elem, &strval) == 0); 4055185029Spjd VERIFY(zap_update(mos, 4056185029Spjd spa->spa_pool_props_object, propname, 4057185029Spjd 1, strlen(strval) + 1, strval, tx) == 0); 4058185029Spjd 4059185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 4060185029Spjd VERIFY(nvpair_value_uint64(elem, &intval) == 0); 4061185029Spjd 4062185029Spjd if (proptype == PROP_TYPE_INDEX) { 4063185029Spjd const char *unused; 4064185029Spjd VERIFY(zpool_prop_index_to_string( 4065185029Spjd prop, intval, &unused) == 0); 4066185029Spjd } 4067185029Spjd VERIFY(zap_update(mos, 4068185029Spjd spa->spa_pool_props_object, propname, 4069185029Spjd 8, 1, &intval, tx) == 0); 4070185029Spjd } else { 4071185029Spjd ASSERT(0); /* not allowed */ 4072185029Spjd } 4073185029Spjd 4074185029Spjd switch (prop) { 4075185029Spjd case ZPOOL_PROP_DELEGATION: 4076185029Spjd spa->spa_delegation = intval; 4077185029Spjd break; 4078185029Spjd case ZPOOL_PROP_BOOTFS: 4079185029Spjd spa->spa_bootfs = intval; 4080185029Spjd break; 4081185029Spjd case ZPOOL_PROP_FAILUREMODE: 4082185029Spjd spa->spa_failmode = intval; 4083185029Spjd break; 4084185029Spjd default: 4085185029Spjd break; 4086185029Spjd } 4087168404Spjd } 4088185029Spjd 4089185029Spjd /* log internal history if this is not a zpool create */ 4090185029Spjd if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 4091185029Spjd tx->tx_txg != TXG_INITIAL) { 4092185029Spjd spa_history_internal_log(LOG_POOL_PROPSET, 4093185029Spjd spa, tx, cr, "%s %lld %s", 4094185029Spjd nvpair_name(elem), intval, spa_name(spa)); 4095185029Spjd } 4096168404Spjd } 4097185029Spjd 4098185029Spjd mutex_exit(&spa->spa_props_lock); 4099168404Spjd} 4100168404Spjd 4101168404Spjd/* 4102168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 4103168404Spjd * part of the process, so we iterate until it converges. 4104168404Spjd */ 4105168404Spjdvoid 4106168404Spjdspa_sync(spa_t *spa, uint64_t txg) 4107168404Spjd{ 4108168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 4109168404Spjd objset_t *mos = spa->spa_meta_objset; 4110168404Spjd bplist_t *bpl = &spa->spa_sync_bplist; 4111168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4112168404Spjd vdev_t *vd; 4113168404Spjd dmu_tx_t *tx; 4114168404Spjd int dirty_vdevs; 4115185029Spjd int error; 4116168404Spjd 4117168404Spjd /* 4118168404Spjd * Lock out configuration changes. 4119168404Spjd */ 4120185029Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4121168404Spjd 4122168404Spjd spa->spa_syncing_txg = txg; 4123168404Spjd spa->spa_sync_pass = 0; 4124168404Spjd 4125185029Spjd /* 4126185029Spjd * If there are any pending vdev state changes, convert them 4127185029Spjd * into config changes that go out with this transaction group. 4128185029Spjd */ 4129185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4130209962Smm while (list_head(&spa->spa_state_dirty_list) != NULL) { 4131209962Smm /* 4132209962Smm * We need the write lock here because, for aux vdevs, 4133209962Smm * calling vdev_config_dirty() modifies sav_config. 4134209962Smm * This is ugly and will become unnecessary when we 4135209962Smm * eliminate the aux vdev wart by integrating all vdevs 4136209962Smm * into the root vdev tree. 4137209962Smm */ 4138209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4139209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 4140209962Smm while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 4141209962Smm vdev_state_clean(vd); 4142209962Smm vdev_config_dirty(vd); 4143209962Smm } 4144209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4145209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 4146185029Spjd } 4147185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 4148185029Spjd 4149168404Spjd VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 4150168404Spjd 4151168404Spjd tx = dmu_tx_create_assigned(dp, txg); 4152168404Spjd 4153168404Spjd /* 4154185029Spjd * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 4155168404Spjd * set spa_deflate if we have no raid-z vdevs. 4156168404Spjd */ 4157185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 4158185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 4159168404Spjd int i; 4160168404Spjd 4161168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 4162168404Spjd vd = rvd->vdev_child[i]; 4163168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 4164168404Spjd break; 4165168404Spjd } 4166168404Spjd if (i == rvd->vdev_children) { 4167168404Spjd spa->spa_deflate = TRUE; 4168168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 4169168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 4170168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 4171168404Spjd } 4172168404Spjd } 4173168404Spjd 4174185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 4175185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 4176185029Spjd dsl_pool_create_origin(dp, tx); 4177185029Spjd 4178185029Spjd /* Keeping the origin open increases spa_minref */ 4179185029Spjd spa->spa_minref += 3; 4180185029Spjd } 4181185029Spjd 4182185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 4183185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 4184185029Spjd dsl_pool_upgrade_clones(dp, tx); 4185185029Spjd } 4186185029Spjd 4187168404Spjd /* 4188168404Spjd * If anything has changed in this txg, push the deferred frees 4189168404Spjd * from the previous txg. If not, leave them alone so that we 4190168404Spjd * don't generate work on an otherwise idle system. 4191168404Spjd */ 4192168404Spjd if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 4193168404Spjd !txg_list_empty(&dp->dp_dirty_dirs, txg) || 4194168404Spjd !txg_list_empty(&dp->dp_sync_tasks, txg)) 4195168404Spjd spa_sync_deferred_frees(spa, txg); 4196168404Spjd 4197168404Spjd /* 4198168404Spjd * Iterate to convergence. 4199168404Spjd */ 4200168404Spjd do { 4201168404Spjd spa->spa_sync_pass++; 4202168404Spjd 4203168404Spjd spa_sync_config_object(spa, tx); 4204185029Spjd spa_sync_aux_dev(spa, &spa->spa_spares, tx, 4205185029Spjd ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 4206185029Spjd spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 4207185029Spjd ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 4208168404Spjd spa_errlog_sync(spa, txg); 4209168404Spjd dsl_pool_sync(dp, txg); 4210168404Spjd 4211168404Spjd dirty_vdevs = 0; 4212168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 4213168404Spjd vdev_sync(vd, txg); 4214168404Spjd dirty_vdevs++; 4215168404Spjd } 4216168404Spjd 4217168404Spjd bplist_sync(bpl, tx); 4218168404Spjd } while (dirty_vdevs); 4219168404Spjd 4220168404Spjd bplist_close(bpl); 4221168404Spjd 4222168404Spjd dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 4223168404Spjd 4224168404Spjd /* 4225168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 4226168404Spjd * to commit the transaction group. 4227168404Spjd * 4228185029Spjd * If there are no dirty vdevs, we sync the uberblock to a few 4229185029Spjd * random top-level vdevs that are known to be visible in the 4230185029Spjd * config cache (see spa_vdev_add() for a complete description). 4231185029Spjd * If there *are* dirty vdevs, sync the uberblock to all vdevs. 4232168404Spjd */ 4233185029Spjd for (;;) { 4234185029Spjd /* 4235185029Spjd * We hold SCL_STATE to prevent vdev open/close/etc. 4236185029Spjd * while we're attempting to write the vdev labels. 4237185029Spjd */ 4238185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4239168404Spjd 4240185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) { 4241185029Spjd vdev_t *svd[SPA_DVAS_PER_BP]; 4242185029Spjd int svdcount = 0; 4243185029Spjd int children = rvd->vdev_children; 4244185029Spjd int c0 = spa_get_random(children); 4245185029Spjd int c; 4246185029Spjd 4247185029Spjd for (c = 0; c < children; c++) { 4248185029Spjd vd = rvd->vdev_child[(c0 + c) % children]; 4249185029Spjd if (vd->vdev_ms_array == 0 || vd->vdev_islog) 4250185029Spjd continue; 4251185029Spjd svd[svdcount++] = vd; 4252185029Spjd if (svdcount == SPA_DVAS_PER_BP) 4253185029Spjd break; 4254185029Spjd } 4255213198Smm error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 4256213198Smm if (error != 0) 4257213198Smm error = vdev_config_sync(svd, svdcount, txg, 4258213198Smm B_TRUE); 4259185029Spjd } else { 4260185029Spjd error = vdev_config_sync(rvd->vdev_child, 4261213198Smm rvd->vdev_children, txg, B_FALSE); 4262213198Smm if (error != 0) 4263213198Smm error = vdev_config_sync(rvd->vdev_child, 4264213198Smm rvd->vdev_children, txg, B_TRUE); 4265168404Spjd } 4266185029Spjd 4267185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 4268185029Spjd 4269185029Spjd if (error == 0) 4270185029Spjd break; 4271185029Spjd zio_suspend(spa, NULL); 4272185029Spjd zio_resume_wait(spa); 4273168404Spjd } 4274168404Spjd dmu_tx_commit(tx); 4275168404Spjd 4276168404Spjd /* 4277168404Spjd * Clear the dirty config list. 4278168404Spjd */ 4279185029Spjd while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 4280168404Spjd vdev_config_clean(vd); 4281168404Spjd 4282168404Spjd /* 4283168404Spjd * Now that the new config has synced transactionally, 4284168404Spjd * let it become visible to the config cache. 4285168404Spjd */ 4286168404Spjd if (spa->spa_config_syncing != NULL) { 4287168404Spjd spa_config_set(spa, spa->spa_config_syncing); 4288168404Spjd spa->spa_config_txg = txg; 4289168404Spjd spa->spa_config_syncing = NULL; 4290168404Spjd } 4291168404Spjd 4292168404Spjd spa->spa_ubsync = spa->spa_uberblock; 4293168404Spjd 4294168404Spjd /* 4295168404Spjd * Clean up the ZIL records for the synced txg. 4296168404Spjd */ 4297168404Spjd dsl_pool_zil_clean(dp); 4298168404Spjd 4299168404Spjd /* 4300168404Spjd * Update usable space statistics. 4301168404Spjd */ 4302168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 4303168404Spjd vdev_sync_done(vd, txg); 4304168404Spjd 4305168404Spjd /* 4306168404Spjd * It had better be the case that we didn't dirty anything 4307168404Spjd * since vdev_config_sync(). 4308168404Spjd */ 4309168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 4310168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 4311168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 4312168404Spjd ASSERT(bpl->bpl_queue == NULL); 4313168404Spjd 4314185029Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 4315168404Spjd 4316168404Spjd /* 4317168404Spjd * If any async tasks have been requested, kick them off. 4318168404Spjd */ 4319168404Spjd spa_async_dispatch(spa); 4320168404Spjd} 4321168404Spjd 4322168404Spjd/* 4323168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 4324168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 4325168404Spjd * sync. 4326168404Spjd */ 4327168404Spjdvoid 4328168404Spjdspa_sync_allpools(void) 4329168404Spjd{ 4330168404Spjd spa_t *spa = NULL; 4331168404Spjd mutex_enter(&spa_namespace_lock); 4332168404Spjd while ((spa = spa_next(spa)) != NULL) { 4333185029Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) 4334168404Spjd continue; 4335168404Spjd spa_open_ref(spa, FTAG); 4336168404Spjd mutex_exit(&spa_namespace_lock); 4337168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 4338168404Spjd mutex_enter(&spa_namespace_lock); 4339168404Spjd spa_close(spa, FTAG); 4340168404Spjd } 4341168404Spjd mutex_exit(&spa_namespace_lock); 4342168404Spjd} 4343168404Spjd 4344168404Spjd/* 4345168404Spjd * ========================================================================== 4346168404Spjd * Miscellaneous routines 4347168404Spjd * ========================================================================== 4348168404Spjd */ 4349168404Spjd 4350168404Spjd/* 4351168404Spjd * Remove all pools in the system. 4352168404Spjd */ 4353168404Spjdvoid 4354168404Spjdspa_evict_all(void) 4355168404Spjd{ 4356168404Spjd spa_t *spa; 4357168404Spjd 4358168404Spjd /* 4359168404Spjd * Remove all cached state. All pools should be closed now, 4360168404Spjd * so every spa in the AVL tree should be unreferenced. 4361168404Spjd */ 4362168404Spjd mutex_enter(&spa_namespace_lock); 4363168404Spjd while ((spa = spa_next(NULL)) != NULL) { 4364168404Spjd /* 4365168404Spjd * Stop async tasks. The async thread may need to detach 4366168404Spjd * a device that's been replaced, which requires grabbing 4367168404Spjd * spa_namespace_lock, so we must drop it here. 4368168404Spjd */ 4369168404Spjd spa_open_ref(spa, FTAG); 4370168404Spjd mutex_exit(&spa_namespace_lock); 4371168404Spjd spa_async_suspend(spa); 4372168404Spjd mutex_enter(&spa_namespace_lock); 4373168404Spjd spa_close(spa, FTAG); 4374168404Spjd 4375168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4376168404Spjd spa_unload(spa); 4377168404Spjd spa_deactivate(spa); 4378168404Spjd } 4379168404Spjd spa_remove(spa); 4380168404Spjd } 4381168404Spjd mutex_exit(&spa_namespace_lock); 4382168404Spjd} 4383168404Spjd 4384168404Spjdvdev_t * 4385209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 4386168404Spjd{ 4387185029Spjd vdev_t *vd; 4388185029Spjd int i; 4389185029Spjd 4390185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 4391185029Spjd return (vd); 4392185029Spjd 4393209962Smm if (aux) { 4394185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 4395185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 4396185029Spjd if (vd->vdev_guid == guid) 4397185029Spjd return (vd); 4398185029Spjd } 4399209962Smm 4400209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 4401209962Smm vd = spa->spa_spares.sav_vdevs[i]; 4402209962Smm if (vd->vdev_guid == guid) 4403209962Smm return (vd); 4404209962Smm } 4405185029Spjd } 4406185029Spjd 4407185029Spjd return (NULL); 4408168404Spjd} 4409168404Spjd 4410168404Spjdvoid 4411185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 4412168404Spjd{ 4413185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4414168404Spjd 4415168404Spjd /* 4416168404Spjd * This should only be called for a non-faulted pool, and since a 4417168404Spjd * future version would result in an unopenable pool, this shouldn't be 4418168404Spjd * possible. 4419168404Spjd */ 4420185029Spjd ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 4421185029Spjd ASSERT(version >= spa->spa_uberblock.ub_version); 4422168404Spjd 4423185029Spjd spa->spa_uberblock.ub_version = version; 4424168404Spjd vdev_config_dirty(spa->spa_root_vdev); 4425168404Spjd 4426185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4427168404Spjd 4428168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 4429168404Spjd} 4430168404Spjd 4431168404Spjdboolean_t 4432168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 4433168404Spjd{ 4434168404Spjd int i; 4435168404Spjd uint64_t spareguid; 4436185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 4437168404Spjd 4438185029Spjd for (i = 0; i < sav->sav_count; i++) 4439185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 4440168404Spjd return (B_TRUE); 4441168404Spjd 4442185029Spjd for (i = 0; i < sav->sav_npending; i++) { 4443185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 4444185029Spjd &spareguid) == 0 && spareguid == guid) 4445168404Spjd return (B_TRUE); 4446168404Spjd } 4447168404Spjd 4448168404Spjd return (B_FALSE); 4449168404Spjd} 4450168404Spjd 4451185029Spjd/* 4452185029Spjd * Check if a pool has an active shared spare device. 4453185029Spjd * Note: reference count of an active spare is 2, as a spare and as a replace 4454185029Spjd */ 4455185029Spjdstatic boolean_t 4456185029Spjdspa_has_active_shared_spare(spa_t *spa) 4457168404Spjd{ 4458185029Spjd int i, refcnt; 4459185029Spjd uint64_t pool; 4460185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 4461185029Spjd 4462185029Spjd for (i = 0; i < sav->sav_count; i++) { 4463185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 4464185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 4465185029Spjd refcnt > 2) 4466185029Spjd return (B_TRUE); 4467185029Spjd } 4468185029Spjd 4469185029Spjd return (B_FALSE); 4470168404Spjd} 4471168404Spjd 4472185029Spjd/* 4473185029Spjd * Post a sysevent corresponding to the given event. The 'name' must be one of 4474185029Spjd * the event definitions in sys/sysevent/eventdefs.h. The payload will be 4475185029Spjd * filled in from the spa and (optionally) the vdev. This doesn't do anything 4476185029Spjd * in the userland libzpool, as we don't want consumers to misinterpret ztest 4477185029Spjd * or zdb as real changes. 4478185029Spjd */ 4479185029Spjdvoid 4480185029Spjdspa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 4481168404Spjd{ 4482185029Spjd#if 0 4483185029Spjd#ifdef _KERNEL 4484185029Spjd sysevent_t *ev; 4485185029Spjd sysevent_attr_list_t *attr = NULL; 4486185029Spjd sysevent_value_t value; 4487185029Spjd sysevent_id_t eid; 4488168404Spjd 4489185029Spjd ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 4490185029Spjd SE_SLEEP); 4491168404Spjd 4492185029Spjd value.value_type = SE_DATA_TYPE_STRING; 4493185029Spjd value.value.sv_string = spa_name(spa); 4494185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 4495185029Spjd goto done; 4496168404Spjd 4497185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 4498185029Spjd value.value.sv_uint64 = spa_guid(spa); 4499185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 4500185029Spjd goto done; 4501168404Spjd 4502185029Spjd if (vd) { 4503185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 4504185029Spjd value.value.sv_uint64 = vd->vdev_guid; 4505185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 4506185029Spjd SE_SLEEP) != 0) 4507185029Spjd goto done; 4508168404Spjd 4509185029Spjd if (vd->vdev_path) { 4510185029Spjd value.value_type = SE_DATA_TYPE_STRING; 4511185029Spjd value.value.sv_string = vd->vdev_path; 4512185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 4513185029Spjd &value, SE_SLEEP) != 0) 4514185029Spjd goto done; 4515168404Spjd } 4516168404Spjd } 4517168404Spjd 4518185029Spjd if (sysevent_attach_attributes(ev, attr) != 0) 4519185029Spjd goto done; 4520185029Spjd attr = NULL; 4521168404Spjd 4522185029Spjd (void) log_sysevent(ev, SE_SLEEP, &eid); 4523185029Spjd 4524185029Spjddone: 4525185029Spjd if (attr) 4526185029Spjd sysevent_free_attr(attr); 4527185029Spjd sysevent_free(ev); 4528185029Spjd#endif 4529185029Spjd#endif 4530168404Spjd} 4531