spa.c revision 253993
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24249195Smm * Copyright (c) 2013 by Delphix. All rights reserved. 25249188Smm * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 26247265Smm * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27168404Spjd */ 28168404Spjd 29168404Spjd/* 30251629Sdelphij * SPA: Storage Pool Allocator 31251629Sdelphij * 32168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 33168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 34168404Spjd * pool. 
35168404Spjd */ 36168404Spjd 37168404Spjd#include <sys/zfs_context.h> 38168404Spjd#include <sys/fm/fs/zfs.h> 39168404Spjd#include <sys/spa_impl.h> 40168404Spjd#include <sys/zio.h> 41168404Spjd#include <sys/zio_checksum.h> 42168404Spjd#include <sys/dmu.h> 43168404Spjd#include <sys/dmu_tx.h> 44168404Spjd#include <sys/zap.h> 45168404Spjd#include <sys/zil.h> 46219089Spjd#include <sys/ddt.h> 47168404Spjd#include <sys/vdev_impl.h> 48168404Spjd#include <sys/metaslab.h> 49219089Spjd#include <sys/metaslab_impl.h> 50168404Spjd#include <sys/uberblock_impl.h> 51168404Spjd#include <sys/txg.h> 52168404Spjd#include <sys/avl.h> 53168404Spjd#include <sys/dmu_traverse.h> 54168404Spjd#include <sys/dmu_objset.h> 55168404Spjd#include <sys/unique.h> 56168404Spjd#include <sys/dsl_pool.h> 57168404Spjd#include <sys/dsl_dataset.h> 58168404Spjd#include <sys/dsl_dir.h> 59168404Spjd#include <sys/dsl_prop.h> 60168404Spjd#include <sys/dsl_synctask.h> 61168404Spjd#include <sys/fs/zfs.h> 62185029Spjd#include <sys/arc.h> 63168404Spjd#include <sys/callb.h> 64185029Spjd#include <sys/spa_boot.h> 65219089Spjd#include <sys/zfs_ioctl.h> 66219089Spjd#include <sys/dsl_scan.h> 67248571Smm#include <sys/dmu_send.h> 68248571Smm#include <sys/dsl_destroy.h> 69248571Smm#include <sys/dsl_userhold.h> 70236884Smm#include <sys/zfeature.h> 71219089Spjd#include <sys/zvol.h> 72240868Spjd#include <sys/trim_map.h> 73168404Spjd 74219089Spjd#ifdef _KERNEL 75219089Spjd#include <sys/callb.h> 76219089Spjd#include <sys/cpupart.h> 77219089Spjd#include <sys/zone.h> 78219089Spjd#endif /* _KERNEL */ 79219089Spjd 80185029Spjd#include "zfs_prop.h" 81185029Spjd#include "zfs_comutil.h" 82168404Spjd 83204073Spjd/* Check hostid on import? 
*/ 84204073Spjdstatic int check_hostid = 1; 85204073Spjd 86204073SpjdSYSCTL_DECL(_vfs_zfs); 87204073SpjdTUNABLE_INT("vfs.zfs.check_hostid", &check_hostid); 88204073SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0, 89204073Spjd "Check hostid on import?"); 90204073Spjd 91251636Sdelphij/* 92251636Sdelphij * The interval, in seconds, at which failed configuration cache file writes 93251636Sdelphij * should be retried. 94251636Sdelphij */ 95251636Sdelphijstatic int zfs_ccw_retry_interval = 300; 96251636Sdelphij 97219089Spjdtypedef enum zti_modes { 98209962Smm zti_mode_fixed, /* value is # of threads (min 1) */ 99209962Smm zti_mode_online_percent, /* value is % of online CPUs */ 100219089Spjd zti_mode_batch, /* cpu-intensive; value is ignored */ 101211931Smm zti_mode_null, /* don't create a taskq */ 102209962Smm zti_nmodes 103219089Spjd} zti_modes_t; 104168712Spjd 105211931Smm#define ZTI_FIX(n) { zti_mode_fixed, (n) } 106211931Smm#define ZTI_PCT(n) { zti_mode_online_percent, (n) } 107219089Spjd#define ZTI_BATCH { zti_mode_batch, 0 } 108211931Smm#define ZTI_NULL { zti_mode_null, 0 } 109209962Smm 110211931Smm#define ZTI_ONE ZTI_FIX(1) 111209962Smm 112209962Smmtypedef struct zio_taskq_info { 113211931Smm enum zti_modes zti_mode; 114211931Smm uint_t zti_value; 115209962Smm} zio_taskq_info_t; 116209962Smm 117209962Smmstatic const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 118219089Spjd "issue", "issue_high", "intr", "intr_high" 119209962Smm}; 120209962Smm 121211931Smm/* 122211931Smm * Define the taskq threads for the following I/O types: 123211931Smm * NULL, READ, WRITE, FREE, CLAIM, and IOCTL 124211931Smm */ 125211931Smmconst zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 126211931Smm /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 127211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 128219089Spjd { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, 129219089Spjd { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, 130219089Spjd { 
ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, 131211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 132211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 133209962Smm}; 134209962Smm 135248571Smmstatic void spa_sync_version(void *arg, dmu_tx_t *tx); 136248571Smmstatic void spa_sync_props(void *arg, dmu_tx_t *tx); 137185029Spjdstatic boolean_t spa_has_active_shared_spare(spa_t *spa); 138219089Spjdstatic int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 139219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 140219089Spjd char **ereport); 141219089Spjdstatic void spa_vdev_resilver_done(spa_t *spa); 142185029Spjd 143219089Spjduint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ 144219089Spjd#ifdef PSRSET_BIND 145219089Spjdid_t zio_taskq_psrset_bind = PS_NONE; 146219089Spjd#endif 147219089Spjd#ifdef SYSDC 148219089Spjdboolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 149219089Spjd#endif 150219089Spjduint_t zio_taskq_basedc = 80; /* base duty cycle */ 151219089Spjd 152219089Spjdboolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 153243503Smmextern int zfs_sync_pass_deferred_free; 154219089Spjd 155247265Smm#ifndef illumos 156247265Smmextern void spa_deadman(void *arg); 157247265Smm#endif 158247265Smm 159168404Spjd/* 160219089Spjd * This (illegal) pool name is used when temporarily importing a spa_t in order 161219089Spjd * to get the vdev stats associated with the imported devices. 162219089Spjd */ 163219089Spjd#define TRYIMPORT_NAME "$import" 164219089Spjd 165219089Spjd/* 166168404Spjd * ========================================================================== 167185029Spjd * SPA properties routines 168185029Spjd * ========================================================================== 169185029Spjd */ 170185029Spjd 171185029Spjd/* 172185029Spjd * Add a (source=src, propname=propval) list to an nvlist. 
173185029Spjd */ 174185029Spjdstatic void 175185029Spjdspa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 176185029Spjd uint64_t intval, zprop_source_t src) 177185029Spjd{ 178185029Spjd const char *propname = zpool_prop_to_name(prop); 179185029Spjd nvlist_t *propval; 180185029Spjd 181185029Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 182185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 183185029Spjd 184185029Spjd if (strval != NULL) 185185029Spjd VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 186185029Spjd else 187185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 188185029Spjd 189185029Spjd VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 190185029Spjd nvlist_free(propval); 191185029Spjd} 192185029Spjd 193185029Spjd/* 194185029Spjd * Get property values from the spa configuration. 195185029Spjd */ 196185029Spjdstatic void 197185029Spjdspa_prop_get_config(spa_t *spa, nvlist_t **nvp) 198185029Spjd{ 199236155Smm vdev_t *rvd = spa->spa_root_vdev; 200236884Smm dsl_pool_t *pool = spa->spa_dsl_pool; 201209962Smm uint64_t size; 202219089Spjd uint64_t alloc; 203236155Smm uint64_t space; 204185029Spjd uint64_t cap, version; 205185029Spjd zprop_source_t src = ZPROP_SRC_NONE; 206185029Spjd spa_config_dirent_t *dp; 207185029Spjd 208185029Spjd ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 209185029Spjd 210236155Smm if (rvd != NULL) { 211219089Spjd alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 212219089Spjd size = metaslab_class_get_space(spa_normal_class(spa)); 213209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 214209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 215219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 216219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 217219089Spjd size - alloc, src); 218236155Smm 219236155Smm space = 0; 220236155Smm for (int c = 0; c < rvd->vdev_children; 
c++) { 221236155Smm vdev_t *tvd = rvd->vdev_child[c]; 222236155Smm space += tvd->vdev_max_asize - tvd->vdev_asize; 223236155Smm } 224236155Smm spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space, 225236155Smm src); 226236155Smm 227219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 228219089Spjd (spa_mode(spa) == FREAD), src); 229185029Spjd 230219089Spjd cap = (size == 0) ? 0 : (alloc * 100 / size); 231209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 232185029Spjd 233219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 234219089Spjd ddt_get_pool_dedup_ratio(spa), src); 235219089Spjd 236209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 237236155Smm rvd->vdev_state, src); 238209962Smm 239209962Smm version = spa_version(spa); 240209962Smm if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 241209962Smm src = ZPROP_SRC_DEFAULT; 242209962Smm else 243209962Smm src = ZPROP_SRC_LOCAL; 244209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 245209962Smm } 246209962Smm 247236884Smm if (pool != NULL) { 248236884Smm dsl_dir_t *freedir = pool->dp_free_dir; 249236884Smm 250236884Smm /* 251236884Smm * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 252236884Smm * when opening pools before this version freedir will be NULL. 
253236884Smm */ 254236884Smm if (freedir != NULL) { 255236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 256236884Smm freedir->dd_phys->dd_used_bytes, src); 257236884Smm } else { 258236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 259236884Smm NULL, 0, src); 260236884Smm } 261236884Smm } 262236884Smm 263185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 264185029Spjd 265228103Smm if (spa->spa_comment != NULL) { 266228103Smm spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 267228103Smm 0, ZPROP_SRC_LOCAL); 268228103Smm } 269228103Smm 270185029Spjd if (spa->spa_root != NULL) 271185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 272185029Spjd 0, ZPROP_SRC_LOCAL); 273185029Spjd 274185029Spjd if ((dp = list_head(&spa->spa_config_list)) != NULL) { 275185029Spjd if (dp->scd_path == NULL) { 276185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 277185029Spjd "none", 0, ZPROP_SRC_LOCAL); 278185029Spjd } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 279185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 280185029Spjd dp->scd_path, 0, ZPROP_SRC_LOCAL); 281185029Spjd } 282185029Spjd } 283185029Spjd} 284185029Spjd 285185029Spjd/* 286185029Spjd * Get zpool property values. 287185029Spjd */ 288185029Spjdint 289185029Spjdspa_prop_get(spa_t *spa, nvlist_t **nvp) 290185029Spjd{ 291219089Spjd objset_t *mos = spa->spa_meta_objset; 292185029Spjd zap_cursor_t zc; 293185029Spjd zap_attribute_t za; 294185029Spjd int err; 295185029Spjd 296185029Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 297185029Spjd 298185029Spjd mutex_enter(&spa->spa_props_lock); 299185029Spjd 300185029Spjd /* 301185029Spjd * Get properties from the spa config. 302185029Spjd */ 303185029Spjd spa_prop_get_config(spa, nvp); 304185029Spjd 305185029Spjd /* If no pool property object, no more prop to get. 
*/ 306219089Spjd if (mos == NULL || spa->spa_pool_props_object == 0) { 307185029Spjd mutex_exit(&spa->spa_props_lock); 308185029Spjd return (0); 309185029Spjd } 310185029Spjd 311185029Spjd /* 312185029Spjd * Get properties from the MOS pool property object. 313185029Spjd */ 314185029Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 315185029Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 316185029Spjd zap_cursor_advance(&zc)) { 317185029Spjd uint64_t intval = 0; 318185029Spjd char *strval = NULL; 319185029Spjd zprop_source_t src = ZPROP_SRC_DEFAULT; 320185029Spjd zpool_prop_t prop; 321185029Spjd 322185029Spjd if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 323185029Spjd continue; 324185029Spjd 325185029Spjd switch (za.za_integer_length) { 326185029Spjd case 8: 327185029Spjd /* integer property */ 328185029Spjd if (za.za_first_integer != 329185029Spjd zpool_prop_default_numeric(prop)) 330185029Spjd src = ZPROP_SRC_LOCAL; 331185029Spjd 332185029Spjd if (prop == ZPOOL_PROP_BOOTFS) { 333185029Spjd dsl_pool_t *dp; 334185029Spjd dsl_dataset_t *ds = NULL; 335185029Spjd 336185029Spjd dp = spa_get_dsl(spa); 337248571Smm dsl_pool_config_enter(dp, FTAG); 338185029Spjd if (err = dsl_dataset_hold_obj(dp, 339185029Spjd za.za_first_integer, FTAG, &ds)) { 340248571Smm dsl_pool_config_exit(dp, FTAG); 341185029Spjd break; 342185029Spjd } 343185029Spjd 344185029Spjd strval = kmem_alloc( 345185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 346185029Spjd KM_SLEEP); 347185029Spjd dsl_dataset_name(ds, strval); 348185029Spjd dsl_dataset_rele(ds, FTAG); 349248571Smm dsl_pool_config_exit(dp, FTAG); 350185029Spjd } else { 351185029Spjd strval = NULL; 352185029Spjd intval = za.za_first_integer; 353185029Spjd } 354185029Spjd 355185029Spjd spa_prop_add_list(*nvp, prop, strval, intval, src); 356185029Spjd 357185029Spjd if (strval != NULL) 358185029Spjd kmem_free(strval, 359185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 360185029Spjd 361185029Spjd break; 
362185029Spjd 363185029Spjd case 1: 364185029Spjd /* string property */ 365185029Spjd strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 366185029Spjd err = zap_lookup(mos, spa->spa_pool_props_object, 367185029Spjd za.za_name, 1, za.za_num_integers, strval); 368185029Spjd if (err) { 369185029Spjd kmem_free(strval, za.za_num_integers); 370185029Spjd break; 371185029Spjd } 372185029Spjd spa_prop_add_list(*nvp, prop, strval, 0, src); 373185029Spjd kmem_free(strval, za.za_num_integers); 374185029Spjd break; 375185029Spjd 376185029Spjd default: 377185029Spjd break; 378185029Spjd } 379185029Spjd } 380185029Spjd zap_cursor_fini(&zc); 381185029Spjd mutex_exit(&spa->spa_props_lock); 382185029Spjdout: 383185029Spjd if (err && err != ENOENT) { 384185029Spjd nvlist_free(*nvp); 385185029Spjd *nvp = NULL; 386185029Spjd return (err); 387185029Spjd } 388185029Spjd 389185029Spjd return (0); 390185029Spjd} 391185029Spjd 392185029Spjd/* 393185029Spjd * Validate the given pool properties nvlist and modify the list 394185029Spjd * for the property values to be set. 395185029Spjd */ 396185029Spjdstatic int 397185029Spjdspa_prop_validate(spa_t *spa, nvlist_t *props) 398185029Spjd{ 399185029Spjd nvpair_t *elem; 400185029Spjd int error = 0, reset_bootfs = 0; 401247187Smm uint64_t objnum = 0; 402236884Smm boolean_t has_feature = B_FALSE; 403185029Spjd 404185029Spjd elem = NULL; 405185029Spjd while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 406185029Spjd uint64_t intval; 407236884Smm char *strval, *slash, *check, *fname; 408236884Smm const char *propname = nvpair_name(elem); 409236884Smm zpool_prop_t prop = zpool_name_to_prop(propname); 410185029Spjd 411236884Smm switch (prop) { 412236884Smm case ZPROP_INVAL: 413236884Smm if (!zpool_prop_feature(propname)) { 414249195Smm error = SET_ERROR(EINVAL); 415236884Smm break; 416236884Smm } 417185029Spjd 418236884Smm /* 419236884Smm * Sanitize the input. 
420236884Smm */ 421236884Smm if (nvpair_type(elem) != DATA_TYPE_UINT64) { 422249195Smm error = SET_ERROR(EINVAL); 423236884Smm break; 424236884Smm } 425185029Spjd 426236884Smm if (nvpair_value_uint64(elem, &intval) != 0) { 427249195Smm error = SET_ERROR(EINVAL); 428236884Smm break; 429236884Smm } 430236884Smm 431236884Smm if (intval != 0) { 432249195Smm error = SET_ERROR(EINVAL); 433236884Smm break; 434236884Smm } 435236884Smm 436236884Smm fname = strchr(propname, '@') + 1; 437236884Smm if (zfeature_lookup_name(fname, NULL) != 0) { 438249195Smm error = SET_ERROR(EINVAL); 439236884Smm break; 440236884Smm } 441236884Smm 442236884Smm has_feature = B_TRUE; 443236884Smm break; 444236884Smm 445185029Spjd case ZPOOL_PROP_VERSION: 446185029Spjd error = nvpair_value_uint64(elem, &intval); 447185029Spjd if (!error && 448236884Smm (intval < spa_version(spa) || 449236884Smm intval > SPA_VERSION_BEFORE_FEATURES || 450236884Smm has_feature)) 451249195Smm error = SET_ERROR(EINVAL); 452185029Spjd break; 453185029Spjd 454185029Spjd case ZPOOL_PROP_DELEGATION: 455185029Spjd case ZPOOL_PROP_AUTOREPLACE: 456185029Spjd case ZPOOL_PROP_LISTSNAPS: 457219089Spjd case ZPOOL_PROP_AUTOEXPAND: 458185029Spjd error = nvpair_value_uint64(elem, &intval); 459185029Spjd if (!error && intval > 1) 460249195Smm error = SET_ERROR(EINVAL); 461185029Spjd break; 462185029Spjd 463185029Spjd case ZPOOL_PROP_BOOTFS: 464209962Smm /* 465209962Smm * If the pool version is less than SPA_VERSION_BOOTFS, 466209962Smm * or the pool is still being created (version == 0), 467209962Smm * the bootfs property cannot be set. 
468209962Smm */ 469185029Spjd if (spa_version(spa) < SPA_VERSION_BOOTFS) { 470249195Smm error = SET_ERROR(ENOTSUP); 471185029Spjd break; 472185029Spjd } 473185029Spjd 474185029Spjd /* 475185029Spjd * Make sure the vdev config is bootable 476185029Spjd */ 477185029Spjd if (!vdev_is_bootable(spa->spa_root_vdev)) { 478249195Smm error = SET_ERROR(ENOTSUP); 479185029Spjd break; 480185029Spjd } 481185029Spjd 482185029Spjd reset_bootfs = 1; 483185029Spjd 484185029Spjd error = nvpair_value_string(elem, &strval); 485185029Spjd 486185029Spjd if (!error) { 487236884Smm objset_t *os; 488185029Spjd uint64_t compress; 489185029Spjd 490185029Spjd if (strval == NULL || strval[0] == '\0') { 491185029Spjd objnum = zpool_prop_default_numeric( 492185029Spjd ZPOOL_PROP_BOOTFS); 493185029Spjd break; 494185029Spjd } 495185029Spjd 496219089Spjd if (error = dmu_objset_hold(strval, FTAG, &os)) 497185029Spjd break; 498185029Spjd 499219089Spjd /* Must be ZPL and not gzip compressed. */ 500219089Spjd 501219089Spjd if (dmu_objset_type(os) != DMU_OST_ZFS) { 502249195Smm error = SET_ERROR(ENOTSUP); 503248571Smm } else if ((error = 504248571Smm dsl_prop_get_int_ds(dmu_objset_ds(os), 505185029Spjd zfs_prop_to_name(ZFS_PROP_COMPRESSION), 506248571Smm &compress)) == 0 && 507185029Spjd !BOOTFS_COMPRESS_VALID(compress)) { 508249195Smm error = SET_ERROR(ENOTSUP); 509185029Spjd } else { 510185029Spjd objnum = dmu_objset_id(os); 511185029Spjd } 512219089Spjd dmu_objset_rele(os, FTAG); 513185029Spjd } 514185029Spjd break; 515185029Spjd 516185029Spjd case ZPOOL_PROP_FAILUREMODE: 517185029Spjd error = nvpair_value_uint64(elem, &intval); 518185029Spjd if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 519185029Spjd intval > ZIO_FAILURE_MODE_PANIC)) 520249195Smm error = SET_ERROR(EINVAL); 521185029Spjd 522185029Spjd /* 523185029Spjd * This is a special case which only occurs when 524185029Spjd * the pool has completely failed. 
This allows 525185029Spjd * the user to change the in-core failmode property 526185029Spjd * without syncing it out to disk (I/Os might 527185029Spjd * currently be blocked). We do this by returning 528185029Spjd * EIO to the caller (spa_prop_set) to trick it 529185029Spjd * into thinking we encountered a property validation 530185029Spjd * error. 531185029Spjd */ 532185029Spjd if (!error && spa_suspended(spa)) { 533185029Spjd spa->spa_failmode = intval; 534249195Smm error = SET_ERROR(EIO); 535185029Spjd } 536185029Spjd break; 537185029Spjd 538185029Spjd case ZPOOL_PROP_CACHEFILE: 539185029Spjd if ((error = nvpair_value_string(elem, &strval)) != 0) 540185029Spjd break; 541185029Spjd 542185029Spjd if (strval[0] == '\0') 543185029Spjd break; 544185029Spjd 545185029Spjd if (strcmp(strval, "none") == 0) 546185029Spjd break; 547185029Spjd 548185029Spjd if (strval[0] != '/') { 549249195Smm error = SET_ERROR(EINVAL); 550185029Spjd break; 551185029Spjd } 552185029Spjd 553185029Spjd slash = strrchr(strval, '/'); 554185029Spjd ASSERT(slash != NULL); 555185029Spjd 556185029Spjd if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 557185029Spjd strcmp(slash, "/..") == 0) 558249195Smm error = SET_ERROR(EINVAL); 559185029Spjd break; 560219089Spjd 561228103Smm case ZPOOL_PROP_COMMENT: 562228103Smm if ((error = nvpair_value_string(elem, &strval)) != 0) 563228103Smm break; 564228103Smm for (check = strval; *check != '\0'; check++) { 565228103Smm /* 566228103Smm * The kernel doesn't have an easy isprint() 567228103Smm * check. For this kernel check, we merely 568228103Smm * check ASCII apart from DEL. Fix this if 569228103Smm * there is an easy-to-use kernel isprint(). 
570228103Smm */ 571228103Smm if (*check >= 0x7f) { 572249195Smm error = SET_ERROR(EINVAL); 573228103Smm break; 574228103Smm } 575228103Smm check++; 576228103Smm } 577228103Smm if (strlen(strval) > ZPROP_MAX_COMMENT) 578228103Smm error = E2BIG; 579228103Smm break; 580228103Smm 581219089Spjd case ZPOOL_PROP_DEDUPDITTO: 582219089Spjd if (spa_version(spa) < SPA_VERSION_DEDUP) 583249195Smm error = SET_ERROR(ENOTSUP); 584219089Spjd else 585219089Spjd error = nvpair_value_uint64(elem, &intval); 586219089Spjd if (error == 0 && 587219089Spjd intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 588249195Smm error = SET_ERROR(EINVAL); 589219089Spjd break; 590185029Spjd } 591185029Spjd 592185029Spjd if (error) 593185029Spjd break; 594185029Spjd } 595185029Spjd 596185029Spjd if (!error && reset_bootfs) { 597185029Spjd error = nvlist_remove(props, 598185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 599185029Spjd 600185029Spjd if (!error) { 601185029Spjd error = nvlist_add_uint64(props, 602185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 603185029Spjd } 604185029Spjd } 605185029Spjd 606185029Spjd return (error); 607185029Spjd} 608185029Spjd 609209962Smmvoid 610209962Smmspa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 611209962Smm{ 612209962Smm char *cachefile; 613209962Smm spa_config_dirent_t *dp; 614209962Smm 615209962Smm if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 616209962Smm &cachefile) != 0) 617209962Smm return; 618209962Smm 619209962Smm dp = kmem_alloc(sizeof (spa_config_dirent_t), 620209962Smm KM_SLEEP); 621209962Smm 622209962Smm if (cachefile[0] == '\0') 623209962Smm dp->scd_path = spa_strdup(spa_config_path); 624209962Smm else if (strcmp(cachefile, "none") == 0) 625209962Smm dp->scd_path = NULL; 626209962Smm else 627209962Smm dp->scd_path = spa_strdup(cachefile); 628209962Smm 629209962Smm list_insert_head(&spa->spa_config_list, dp); 630209962Smm if (need_sync) 631209962Smm spa_async_request(spa, 
SPA_ASYNC_CONFIG_UPDATE); 632209962Smm} 633209962Smm 634185029Spjdint 635185029Spjdspa_prop_set(spa_t *spa, nvlist_t *nvp) 636185029Spjd{ 637185029Spjd int error; 638236884Smm nvpair_t *elem = NULL; 639209962Smm boolean_t need_sync = B_FALSE; 640185029Spjd 641185029Spjd if ((error = spa_prop_validate(spa, nvp)) != 0) 642185029Spjd return (error); 643185029Spjd 644209962Smm while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 645236884Smm zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 646209962Smm 647219089Spjd if (prop == ZPOOL_PROP_CACHEFILE || 648219089Spjd prop == ZPOOL_PROP_ALTROOT || 649219089Spjd prop == ZPOOL_PROP_READONLY) 650209962Smm continue; 651209962Smm 652236884Smm if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { 653236884Smm uint64_t ver; 654236884Smm 655236884Smm if (prop == ZPOOL_PROP_VERSION) { 656236884Smm VERIFY(nvpair_value_uint64(elem, &ver) == 0); 657236884Smm } else { 658236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 659236884Smm ver = SPA_VERSION_FEATURES; 660236884Smm need_sync = B_TRUE; 661236884Smm } 662236884Smm 663236884Smm /* Save time if the version is already set. */ 664236884Smm if (ver == spa_version(spa)) 665236884Smm continue; 666236884Smm 667236884Smm /* 668236884Smm * In addition to the pool directory object, we might 669236884Smm * create the pool properties object, the features for 670236884Smm * read object, the features for write object, or the 671236884Smm * feature descriptions object. 
672236884Smm */ 673248571Smm error = dsl_sync_task(spa->spa_name, NULL, 674248571Smm spa_sync_version, &ver, 6); 675236884Smm if (error) 676236884Smm return (error); 677236884Smm continue; 678236884Smm } 679236884Smm 680209962Smm need_sync = B_TRUE; 681209962Smm break; 682209962Smm } 683209962Smm 684236884Smm if (need_sync) { 685248571Smm return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 686248571Smm nvp, 6)); 687236884Smm } 688236884Smm 689236884Smm return (0); 690185029Spjd} 691185029Spjd 692185029Spjd/* 693185029Spjd * If the bootfs property value is dsobj, clear it. 694185029Spjd */ 695185029Spjdvoid 696185029Spjdspa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 697185029Spjd{ 698185029Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 699185029Spjd VERIFY(zap_remove(spa->spa_meta_objset, 700185029Spjd spa->spa_pool_props_object, 701185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 702185029Spjd spa->spa_bootfs = 0; 703185029Spjd } 704185029Spjd} 705185029Spjd 706239620Smm/*ARGSUSED*/ 707239620Smmstatic int 708248571Smmspa_change_guid_check(void *arg, dmu_tx_t *tx) 709239620Smm{ 710248571Smm uint64_t *newguid = arg; 711248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 712239620Smm vdev_t *rvd = spa->spa_root_vdev; 713239620Smm uint64_t vdev_state; 714239620Smm 715239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 716239620Smm vdev_state = rvd->vdev_state; 717239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 718239620Smm 719239620Smm if (vdev_state != VDEV_STATE_HEALTHY) 720249195Smm return (SET_ERROR(ENXIO)); 721239620Smm 722239620Smm ASSERT3U(spa_guid(spa), !=, *newguid); 723239620Smm 724239620Smm return (0); 725239620Smm} 726239620Smm 727239620Smmstatic void 728248571Smmspa_change_guid_sync(void *arg, dmu_tx_t *tx) 729239620Smm{ 730248571Smm uint64_t *newguid = arg; 731248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 732239620Smm uint64_t oldguid; 733239620Smm vdev_t *rvd = spa->spa_root_vdev; 
734239620Smm 735239620Smm oldguid = spa_guid(spa); 736239620Smm 737239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 738239620Smm rvd->vdev_guid = *newguid; 739239620Smm rvd->vdev_guid_sum += (*newguid - oldguid); 740239620Smm vdev_config_dirty(rvd); 741239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 742239620Smm 743248571Smm spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 744239620Smm oldguid, *newguid); 745239620Smm} 746239620Smm 747185029Spjd/* 748228103Smm * Change the GUID for the pool. This is done so that we can later 749228103Smm * re-import a pool built from a clone of our own vdevs. We will modify 750228103Smm * the root vdev's guid, our own pool guid, and then mark all of our 751228103Smm * vdevs dirty. Note that we must make sure that all our vdevs are 752228103Smm * online when we do this, or else any vdevs that weren't present 753228103Smm * would be orphaned from our pool. We are also going to issue a 754228103Smm * sysevent to update any watchers. 
755228103Smm */ 756228103Smmint 757228103Smmspa_change_guid(spa_t *spa) 758228103Smm{ 759239620Smm int error; 760239620Smm uint64_t guid; 761228103Smm 762239620Smm mutex_enter(&spa_namespace_lock); 763239620Smm guid = spa_generate_guid(NULL); 764228103Smm 765248571Smm error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 766248571Smm spa_change_guid_sync, &guid, 5); 767228103Smm 768239620Smm if (error == 0) { 769239620Smm spa_config_sync(spa, B_FALSE, B_TRUE); 770239620Smm spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 771239620Smm } 772228103Smm 773239620Smm mutex_exit(&spa_namespace_lock); 774228103Smm 775239620Smm return (error); 776228103Smm} 777228103Smm 778228103Smm/* 779185029Spjd * ========================================================================== 780168404Spjd * SPA state manipulation (open/create/destroy/import/export) 781168404Spjd * ========================================================================== 782168404Spjd */ 783168404Spjd 784168404Spjdstatic int 785168404Spjdspa_error_entry_compare(const void *a, const void *b) 786168404Spjd{ 787168404Spjd spa_error_entry_t *sa = (spa_error_entry_t *)a; 788168404Spjd spa_error_entry_t *sb = (spa_error_entry_t *)b; 789168404Spjd int ret; 790168404Spjd 791168404Spjd ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 792168404Spjd sizeof (zbookmark_t)); 793168404Spjd 794168404Spjd if (ret < 0) 795168404Spjd return (-1); 796168404Spjd else if (ret > 0) 797168404Spjd return (1); 798168404Spjd else 799168404Spjd return (0); 800168404Spjd} 801168404Spjd 802168404Spjd/* 803168404Spjd * Utility function which retrieves copies of the current logs and 804168404Spjd * re-initializes them in the process. 
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	/*
	 * Hand the current error trees to the caller by structure copy,
	 * then re-create empty trees in the spa so error logging can
	 * continue while the caller processes the old contents.
	 */
	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Create one zio taskq for the pool according to 'mode':
 *   zti_mode_null		- no taskq is needed, return NULL
 *   zti_mode_fixed		- 'value' is the fixed thread count
 *   zti_mode_batch		- batch taskq, thread count is a CPU percentage
 *				  (zio_taskq_batch_pct)
 *   zti_mode_online_percent	- 'value' is a percentage of online CPUs
 * Any other mode is a programming error and panics.
 */
static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = TASKQ_PREPOPULATE;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		ASSERT3U(value, >=, 1);
		/* debug builds assert; non-debug builds clamp to >= 1 */
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

#ifdef SYSDC
	/*
	 * If sysdc scheduling is enabled and the pool has its own process,
	 * create the taskq under the sysdc scheduling class instead.
	 */
	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
#endif
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}

/*
 * Create the ZIO_TYPES x ZIO_TASKQ_TYPES grid of taskqs for this pool,
 * named "<ziotype>_<taskqtype>", using the modes/values from zio_taskqs[][].
 */
static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			spa->spa_zio_taskq[t][q] =
			    spa_taskq_create(spa, name, mode, value);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
/*
 * Body of the dedicated per-pool "zpool-<name>" process.  Sets up the
 * process name, optionally binds to a psrset and enters sysdc scheduling,
 * creates the pool's zio taskqs, then parks on spa_proc_cv (CPR-safe)
 * until spa_deactivate() asks it to tear down and exit.
 */
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0)  {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	/* tell spa_activate() we are up and running */
	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	/* park (CPR-safe) until spa_deactivate() requests teardown */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			/* wait for spa_thread() to finish initialization */
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			/* best effort: fall back to taskqs under p0 below */
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	/*
	 * NOTE(review): the ASSERT and the if-condition are the same test;
	 * presumably the ASSERT documents that this build never reaches here
	 * with a dedicated process (spa_thread() already created the taskqs
	 * in that case) -- confirm against the SPA_PROCESS configuration.
	 */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate(): tear down taskqs, metaslab classes, dirty
 * lists and error trees, and (if one exists) ask the covering process to
 * exit and wait for it to be gone.
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		/* wake spa_thread() and wait for it to acknowledge */
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	/* leaf vdevs have no children to recurse into */
	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	/* an interior vdev with no children array is acceptable */
	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	/* recursively build each child; on failure free the whole subtree */
	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/* free the hot-spare aux vdev list and its cached config */
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	/* likewise for the l2cache aux vdev list */
	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.
 * We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there is potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in
	 * the active configuration, then we also mark this vdev as an active
	 * spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pool would think the spare
			 * is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	/* detach the old list; matched entries are moved to newvdevs below */
	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	/* with no cached config there is nothing to regenerate */
	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	/* if we jumped here, sav_count is 0 and the loop/free are no-ops */
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

/*
 * Read a packed nvlist from the MOS object 'obj' (size is stored in the
 * object's bonus buffer) and unpack it into '*value'.  On success the
 * caller owns the returned nvlist.
 */
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	/* recurse over the whole subtree, acting on failed leaf vdevs */
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    !vd->vdev_ishole) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Validate the current config against the MOS config.  Returns B_TRUE when
 * the label-derived root vdev's guid sum matches the uberblock's, i.e. the
 * config accounts for every device.  May splice more accurate top-level
 * vdevs from the MOS config into the live tree along the way.
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		/*
		 * NOTE(review): allocated with sizeof (nvlist_t **) but freed
		 * below with sizeof (char **); the sizes are identical on all
		 * supported platforms, but the expressions should match.
		 */
		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			/* report log devices present in MOS but missing here */
			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd).  Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev then use that vdev instead.
		 */
		if (tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) {

			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
				continue;

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else {
				/*
				 * XXX - once we have 'readonly' pool
				 * support we should be able to handle
				 * missing data devices by transitioning
				 * the pool to readonly.
				 */
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			/* vdev_load() may sleep; drop the config locks */
			spa_config_exit(spa, SCL_ALL, FTAG);
			vdev_load(mtvd);
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

			vdev_reopen(rvd);
		} else if (mtvd->vdev_islog) {
			/*
			 * Load the slog device's state from the MOS config
			 * since it's possible that the label does not
			 * contain the most up-to-date information.
			 */
			vdev_load_log_state(tvd, mtvd);
			vdev_reopen(tvd);
		}
	}
	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices.  Returns B_TRUE (and records
 * SPA_LOG_MISSING) if any dataset's log chain fails verification.
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;

	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
		    NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

/*
 * Passivate the metaslab groups of all top-level log vdevs so no new
 * allocations land on them.  Returns B_TRUE if any slog was found.
 */
static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

/*
 * Opposite of spa_passivate_log(): re-enable allocations on all log vdevs.
 */
static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

int
spa_offline_log(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

/*
 * Run spa_check_removed() over every vdev in an aux list (spares/l2cache).
 */
static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	int i;

	for (i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

/*
 * zio completion callback: record the highest block birth txg seen while
 * claiming log blocks, so the pool knows how far it must sync.
 */
void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

/* error counters accumulated by spa_load_verify_done() */
typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

/*
 * Read-completion callback for spa_load_verify(): classify any I/O error
 * as a metadata or data error and free the scratch buffer.
 */
static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;

	if (error) {
		/* intent-log blocks are counted as data, not metadata */
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_add_64(&sle->sle_meta_count, 1);
		else
			atomic_add_64(&sle->sle_data_count, 1);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);
}

/*
 * Pool-traversal callback for spa_load_verify(): issue a speculative,
 * can-fail scrub read for each block pointer under the root zio 'arg'.
 */
/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp != NULL) {
		zio_t *rio = arg;
		size_t size = BP_GET_PSIZE(bp);
		void *data = zio_data_buf_alloc(size);

		zio_nowait(zio_read(rio, spa, bp, data, size,
		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	}
	return (0);
}

/*
 * Verify the pool by reading everything reachable since spa_verify_min_txg,
 * honoring the rewind policy.  On acceptable error counts, record load/rewind
 * info for the caller; otherwise note the max rewind txg and return EIO.
 */
static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa->spa_verify_min_txg,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		/* normalize unexpected traversal errors to EIO */
		if (error != ENXIO && error != EIO)
			error = SET_ERROR(EIO);
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	/* lookup failure leaves *val untouched; return value ignored */
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}

/*
 * Find a value in the pool directory object.
1874219089Spjd */ 1875168404Spjdstatic int 1876219089Spjdspa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1877168404Spjd{ 1878219089Spjd return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1879219089Spjd name, sizeof (uint64_t), 1, val)); 1880219089Spjd} 1881168404Spjd 1882219089Spjdstatic int 1883219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1884219089Spjd{ 1885219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1886219089Spjd return (err); 1887219089Spjd} 1888219089Spjd 1889219089Spjd/* 1890219089Spjd * Fix up config after a partly-completed split. This is done with the 1891219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1892219089Spjd * pool have that entry in their config, but only the splitting one contains 1893219089Spjd * a list of all the guids of the vdevs that are being split off. 1894219089Spjd * 1895219089Spjd * This function determines what to do with that list: either rejoin 1896219089Spjd * all the disks to the pool, or complete the splitting process. To attempt 1897219089Spjd * the rejoin, each disk that is offlined is marked online again, and 1898219089Spjd * we do a reopen() call. If the vdev label for every disk that was 1899219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1900219089Spjd * then we call vdev_split() on each disk, and complete the split. 1901219089Spjd * 1902219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 1903219089Spjd * the original pool. 
1904219089Spjd */ 1905219089Spjdstatic void 1906219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 1907219089Spjd{ 1908219089Spjd uint_t extracted; 1909219089Spjd uint64_t *glist; 1910219089Spjd uint_t i, gcount; 1911219089Spjd nvlist_t *nvl; 1912219089Spjd vdev_t **vd; 1913219089Spjd boolean_t attempt_reopen; 1914219089Spjd 1915219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 1916219089Spjd return; 1917219089Spjd 1918219089Spjd /* check that the config is complete */ 1919219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 1920219089Spjd &glist, &gcount) != 0) 1921219089Spjd return; 1922219089Spjd 1923219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 1924219089Spjd 1925219089Spjd /* attempt to online all the vdevs & validate */ 1926219089Spjd attempt_reopen = B_TRUE; 1927219089Spjd for (i = 0; i < gcount; i++) { 1928219089Spjd if (glist[i] == 0) /* vdev is hole */ 1929219089Spjd continue; 1930219089Spjd 1931219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 1932219089Spjd if (vd[i] == NULL) { 1933219089Spjd /* 1934219089Spjd * Don't bother attempting to reopen the disks; 1935219089Spjd * just do the split. 
1936219089Spjd */ 1937219089Spjd attempt_reopen = B_FALSE; 1938219089Spjd } else { 1939219089Spjd /* attempt to re-online it */ 1940219089Spjd vd[i]->vdev_offline = B_FALSE; 1941219089Spjd } 1942219089Spjd } 1943219089Spjd 1944219089Spjd if (attempt_reopen) { 1945219089Spjd vdev_reopen(spa->spa_root_vdev); 1946219089Spjd 1947219089Spjd /* check each device to see what state it's in */ 1948219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 1949219089Spjd if (vd[i] != NULL && 1950219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 1951219089Spjd break; 1952219089Spjd ++extracted; 1953219089Spjd } 1954219089Spjd } 1955219089Spjd 1956209962Smm /* 1957219089Spjd * If every disk has been moved to the new pool, or if we never 1958219089Spjd * even attempted to look at them, then we split them off for 1959219089Spjd * good. 1960209962Smm */ 1961219089Spjd if (!attempt_reopen || gcount == extracted) { 1962219089Spjd for (i = 0; i < gcount; i++) 1963219089Spjd if (vd[i] != NULL) 1964219089Spjd vdev_split(vd[i]); 1965219089Spjd vdev_reopen(spa->spa_root_vdev); 1966219089Spjd } 1967209962Smm 1968219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 1969219089Spjd} 1970185029Spjd 1971219089Spjdstatic int 1972219089Spjdspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 1973219089Spjd boolean_t mosconfig) 1974219089Spjd{ 1975219089Spjd nvlist_t *config = spa->spa_config; 1976219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 1977228103Smm char *comment; 1978219089Spjd int error; 1979219089Spjd uint64_t pool_guid; 1980219089Spjd nvlist_t *nvl; 1981168404Spjd 1982219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 1983249195Smm return (SET_ERROR(EINVAL)); 1984168404Spjd 1985228103Smm ASSERT(spa->spa_comment == NULL); 1986228103Smm if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 1987228103Smm spa->spa_comment = spa_strdup(comment); 1988228103Smm 1989168404Spjd /* 1990168404Spjd * Versioning wasn't 
explicitly added to the label until later, so if 1991168404Spjd * it's not present treat it as the initial version. 1992168404Spjd */ 1993219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 1994219089Spjd &spa->spa_ubsync.ub_version) != 0) 1995219089Spjd spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 1996168404Spjd 1997168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1998168404Spjd &spa->spa_config_txg); 1999168404Spjd 2000168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2001168404Spjd spa_guid_exists(pool_guid, 0)) { 2002249195Smm error = SET_ERROR(EEXIST); 2003219089Spjd } else { 2004228103Smm spa->spa_config_guid = pool_guid; 2005219089Spjd 2006219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2007219089Spjd &nvl) == 0) { 2008219089Spjd VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2009219089Spjd KM_SLEEP) == 0); 2010219089Spjd } 2011219089Spjd 2012236884Smm nvlist_free(spa->spa_load_info); 2013236884Smm spa->spa_load_info = fnvlist_alloc(); 2014236884Smm 2015219089Spjd gethrestime(&spa->spa_loaded_ts); 2016219089Spjd error = spa_load_impl(spa, pool_guid, config, state, type, 2017219089Spjd mosconfig, &ereport); 2018168404Spjd } 2019168404Spjd 2020219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 2021219089Spjd if (error) { 2022219089Spjd if (error != EEXIST) { 2023219089Spjd spa->spa_loaded_ts.tv_sec = 0; 2024219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 2025219089Spjd } 2026219089Spjd if (error != EBADF) { 2027219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2028219089Spjd } 2029219089Spjd } 2030219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2031219089Spjd spa->spa_ena = 0; 2032168404Spjd 2033219089Spjd return (error); 2034219089Spjd} 2035219089Spjd 2036219089Spjd/* 2037219089Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 2038219089Spjd * source of configuration information. 
2039219089Spjd */ 2040219089Spjdstatic int 2041219089Spjdspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2042219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2043219089Spjd char **ereport) 2044219089Spjd{ 2045219089Spjd int error = 0; 2046219089Spjd nvlist_t *nvroot = NULL; 2047236884Smm nvlist_t *label; 2048219089Spjd vdev_t *rvd; 2049219089Spjd uberblock_t *ub = &spa->spa_uberblock; 2050219089Spjd uint64_t children, config_cache_txg = spa->spa_config_txg; 2051219089Spjd int orig_mode = spa->spa_mode; 2052219089Spjd int parse; 2053219089Spjd uint64_t obj; 2054236884Smm boolean_t missing_feat_write = B_FALSE; 2055219089Spjd 2056168404Spjd /* 2057219089Spjd * If this is an untrusted config, access the pool in read-only mode. 2058219089Spjd * This prevents things like resilvering recently removed devices. 2059219089Spjd */ 2060219089Spjd if (!mosconfig) 2061219089Spjd spa->spa_mode = FREAD; 2062219089Spjd 2063219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2064219089Spjd 2065219089Spjd spa->spa_load_state = state; 2066219089Spjd 2067219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2068249195Smm return (SET_ERROR(EINVAL)); 2069219089Spjd 2070219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 2071219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2072219089Spjd 2073219089Spjd /* 2074209962Smm * Create "The Godfather" zio to hold all async IOs 2075209962Smm */ 2076209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2077209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2078209962Smm 2079209962Smm /* 2080168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2081168404Spjd * value that will be returned by spa_version() since parsing the 2082168404Spjd * configuration requires knowing the version number. 
2083168404Spjd */ 2084185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2085219089Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2086185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2087168404Spjd 2088168404Spjd if (error != 0) 2089219089Spjd return (error); 2090168404Spjd 2091168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2092168404Spjd 2093219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2094219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2095219089Spjd } 2096219089Spjd 2097168404Spjd /* 2098168404Spjd * Try to open all vdevs, loading each label in the process. 2099168404Spjd */ 2100185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2101168926Spjd error = vdev_open(rvd); 2102185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2103168926Spjd if (error != 0) 2104219089Spjd return (error); 2105168404Spjd 2106168404Spjd /* 2107209962Smm * We need to validate the vdev labels against the configuration that 2108209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 2109209962Smm * mosconfig is true then we're validating the vdev labels based on 2110219089Spjd * that config. Otherwise, we're validating against the cached config 2111209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 2112209962Smm * later we will recursively call spa_load() and validate against 2113209962Smm * the vdev config. 2114219089Spjd * 2115219089Spjd * If we're assembling a new pool that's been split off from an 2116219089Spjd * existing pool, the labels haven't yet been updated so we skip 2117219089Spjd * validation for now. 
2118168404Spjd */ 2119219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2120219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2121230514Smm error = vdev_validate(rvd, mosconfig); 2122219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2123168404Spjd 2124219089Spjd if (error != 0) 2125219089Spjd return (error); 2126219089Spjd 2127219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2128249195Smm return (SET_ERROR(ENXIO)); 2129168404Spjd } 2130168404Spjd 2131168404Spjd /* 2132168404Spjd * Find the best uberblock. 2133168404Spjd */ 2134236884Smm vdev_uberblock_load(rvd, ub, &label); 2135168404Spjd 2136168404Spjd /* 2137168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2138168404Spjd */ 2139236884Smm if (ub->ub_txg == 0) { 2140236884Smm nvlist_free(label); 2141219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2142236884Smm } 2143168404Spjd 2144168404Spjd /* 2145236884Smm * If the pool has an unsupported version we can't open it. 2146168404Spjd */ 2147236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2148236884Smm nvlist_free(label); 2149219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2150236884Smm } 2151168404Spjd 2152236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2153236884Smm nvlist_t *features; 2154236884Smm 2155236884Smm /* 2156236884Smm * If we weren't able to find what's necessary for reading the 2157236884Smm * MOS in the label, return failure. 2158236884Smm */ 2159236884Smm if (label == NULL || nvlist_lookup_nvlist(label, 2160236884Smm ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2161236884Smm nvlist_free(label); 2162236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2163236884Smm ENXIO)); 2164236884Smm } 2165236884Smm 2166236884Smm /* 2167236884Smm * Update our in-core representation with the definitive values 2168236884Smm * from the label. 
2169236884Smm */ 2170236884Smm nvlist_free(spa->spa_label_features); 2171236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2172236884Smm } 2173236884Smm 2174236884Smm nvlist_free(label); 2175236884Smm 2176168404Spjd /* 2177236884Smm * Look through entries in the label nvlist's features_for_read. If 2178236884Smm * there is a feature listed there which we don't understand then we 2179236884Smm * cannot open a pool. 2180236884Smm */ 2181236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2182236884Smm nvlist_t *unsup_feat; 2183236884Smm 2184236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2185236884Smm 0); 2186236884Smm 2187236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2188236884Smm NULL); nvp != NULL; 2189236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2190236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2191236884Smm VERIFY(nvlist_add_string(unsup_feat, 2192236884Smm nvpair_name(nvp), "") == 0); 2193236884Smm } 2194236884Smm } 2195236884Smm 2196236884Smm if (!nvlist_empty(unsup_feat)) { 2197236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2198236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2199236884Smm nvlist_free(unsup_feat); 2200236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2201236884Smm ENOTSUP)); 2202236884Smm } 2203236884Smm 2204236884Smm nvlist_free(unsup_feat); 2205236884Smm } 2206236884Smm 2207236884Smm /* 2208168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 2209219089Spjd * incomplete configuration. We first check to see if the pool 2210219089Spjd * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2211219089Spjd * If it is, defer the vdev_guid_sum check till later so we 2212219089Spjd * can handle missing vdevs. 
2213168404Spjd */ 2214219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2215219089Spjd &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2216219089Spjd rvd->vdev_guid_sum != ub->ub_guid_sum) 2217219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2218219089Spjd 2219219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2220219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2221219089Spjd spa_try_repair(spa, config); 2222219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2223219089Spjd nvlist_free(spa->spa_config_splitting); 2224219089Spjd spa->spa_config_splitting = NULL; 2225168404Spjd } 2226168404Spjd 2227168404Spjd /* 2228168404Spjd * Initialize internal SPA structures. 2229168404Spjd */ 2230168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 2231168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2232219089Spjd spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2233219089Spjd TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2234219089Spjd spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2235219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2236219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2237219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2238219089Spjd 2239236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2240219089Spjd if (error) 2241219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2242168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2243168404Spjd 2244219089Spjd if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2245219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2246168404Spjd 2247236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2248236884Smm boolean_t missing_feat_read = B_FALSE; 2249238926Smm nvlist_t *unsup_feat, *enabled_feat; 2250236884Smm 2251236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2252236884Smm &spa->spa_feat_for_read_obj) != 0) { 2253236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2254236884Smm } 2255236884Smm 2256236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2257236884Smm &spa->spa_feat_for_write_obj) != 0) { 2258236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2259236884Smm } 2260236884Smm 2261236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2262236884Smm &spa->spa_feat_desc_obj) != 0) { 2263236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2264236884Smm } 2265236884Smm 2266238926Smm enabled_feat = fnvlist_alloc(); 2267238926Smm unsup_feat = fnvlist_alloc(); 2268236884Smm 2269236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2270236884Smm spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, 2271238926Smm unsup_feat, enabled_feat)) 2272236884Smm missing_feat_read = B_TRUE; 2273236884Smm 2274236884Smm if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2275236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2276236884Smm spa->spa_feat_for_write_obj, 
spa->spa_feat_desc_obj, 2277238926Smm unsup_feat, enabled_feat)) { 2278236884Smm missing_feat_write = B_TRUE; 2279238926Smm } 2280236884Smm } 2281236884Smm 2282238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2283238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2284238926Smm 2285236884Smm if (!nvlist_empty(unsup_feat)) { 2286238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2287238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2288236884Smm } 2289236884Smm 2290238926Smm fnvlist_free(enabled_feat); 2291238926Smm fnvlist_free(unsup_feat); 2292236884Smm 2293236884Smm if (!missing_feat_read) { 2294236884Smm fnvlist_add_boolean(spa->spa_load_info, 2295236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2296236884Smm } 2297236884Smm 2298236884Smm /* 2299236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2300236884Smm * twofold: to determine whether the pool is available for 2301236884Smm * import in read-write mode and (if it is not) whether the 2302236884Smm * pool is available for import in read-only mode. If the pool 2303236884Smm * is available for import in read-write mode, it is displayed 2304236884Smm * as available in userland; if it is not available for import 2305236884Smm * in read-only mode, it is displayed as unavailable in 2306236884Smm * userland. If the pool is available for import in read-only 2307236884Smm * mode but not read-write mode, it is displayed as unavailable 2308236884Smm * in userland with a special note that the pool is actually 2309236884Smm * available for open in read-only mode. 2310236884Smm * 2311236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2312236884Smm * missing a feature for write, we must first determine whether 2313236884Smm * the pool can be opened read-only before returning to 2314236884Smm * userland in order to know whether to display the 2315236884Smm * abovementioned note. 
2316236884Smm */ 2317236884Smm if (missing_feat_read || (missing_feat_write && 2318236884Smm spa_writeable(spa))) { 2319236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2320236884Smm ENOTSUP)); 2321236884Smm } 2322236884Smm } 2323236884Smm 2324236884Smm spa->spa_is_initializing = B_TRUE; 2325236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2326236884Smm spa->spa_is_initializing = B_FALSE; 2327236884Smm if (error != 0) 2328236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2329236884Smm 2330168404Spjd if (!mosconfig) { 2331168498Spjd uint64_t hostid; 2332219089Spjd nvlist_t *policy = NULL, *nvconfig; 2333168404Spjd 2334219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2335219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2336168404Spjd 2337219089Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2338185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2339168498Spjd char *hostname; 2340168498Spjd unsigned long myhostid = 0; 2341168498Spjd 2342219089Spjd VERIFY(nvlist_lookup_string(nvconfig, 2343168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2344168498Spjd 2345219089Spjd#ifdef _KERNEL 2346219089Spjd myhostid = zone_get_hostid(NULL); 2347219089Spjd#else /* _KERNEL */ 2348219089Spjd /* 2349219089Spjd * We're emulating the system's hostid in userland, so 2350219089Spjd * we can't use zone_get_hostid(). 2351219089Spjd */ 2352168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2353219089Spjd#endif /* _KERNEL */ 2354204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 2355219089Spjd hostid != myhostid) { 2356219089Spjd nvlist_free(nvconfig); 2357168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 2358168498Spjd "loaded as it was last accessed by " 2359185029Spjd "another system (host: %s hostid: 0x%lx). 
" 2360236146Smm "See: http://illumos.org/msg/ZFS-8000-EY", 2361185029Spjd spa_name(spa), hostname, 2362168498Spjd (unsigned long)hostid); 2363249195Smm return (SET_ERROR(EBADF)); 2364168498Spjd } 2365168498Spjd } 2366219089Spjd if (nvlist_lookup_nvlist(spa->spa_config, 2367219089Spjd ZPOOL_REWIND_POLICY, &policy) == 0) 2368219089Spjd VERIFY(nvlist_add_nvlist(nvconfig, 2369219089Spjd ZPOOL_REWIND_POLICY, policy) == 0); 2370168498Spjd 2371219089Spjd spa_config_set(spa, nvconfig); 2372168404Spjd spa_unload(spa); 2373168404Spjd spa_deactivate(spa); 2374209962Smm spa_activate(spa, orig_mode); 2375168404Spjd 2376219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2377168404Spjd } 2378168404Spjd 2379219089Spjd if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2380219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2381219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2382219089Spjd if (error != 0) 2383219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2384168404Spjd 2385168404Spjd /* 2386168404Spjd * Load the bit that tells us to use the new accounting function 2387168404Spjd * (raid-z deflation). If we have an older pool, this will not 2388168404Spjd * be present. 2389168404Spjd */ 2390219089Spjd error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2391219089Spjd if (error != 0 && error != ENOENT) 2392219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2393168404Spjd 2394219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2395219089Spjd &spa->spa_creation_version); 2396219089Spjd if (error != 0 && error != ENOENT) 2397219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2398219089Spjd 2399168404Spjd /* 2400168404Spjd * Load the persistent error log. If we have an older pool, this will 2401168404Spjd * not be present. 
2402168404Spjd */ 2403219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2404219089Spjd if (error != 0 && error != ENOENT) 2405219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2406168404Spjd 2407219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2408219089Spjd &spa->spa_errlog_scrub); 2409219089Spjd if (error != 0 && error != ENOENT) 2410219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2411168404Spjd 2412168404Spjd /* 2413168404Spjd * Load the history object. If we have an older pool, this 2414168404Spjd * will not be present. 2415168404Spjd */ 2416219089Spjd error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2417219089Spjd if (error != 0 && error != ENOENT) 2418219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2419168404Spjd 2420168404Spjd /* 2421219089Spjd * If we're assembling the pool from the split-off vdevs of 2422219089Spjd * an existing pool, we don't want to attach the spares & cache 2423219089Spjd * devices. 2424219089Spjd */ 2425219089Spjd 2426219089Spjd /* 2427168404Spjd * Load any hot spares for this pool. 
2428168404Spjd */ 2429219089Spjd error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2430219089Spjd if (error != 0 && error != ENOENT) 2431219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2432219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2433185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2434185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 2435219089Spjd &spa->spa_spares.sav_config) != 0) 2436219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2437168404Spjd 2438185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2439168404Spjd spa_load_spares(spa); 2440185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2441219089Spjd } else if (error == 0) { 2442219089Spjd spa->spa_spares.sav_sync = B_TRUE; 2443168404Spjd } 2444168404Spjd 2445185029Spjd /* 2446185029Spjd * Load any level 2 ARC devices for this pool. 2447185029Spjd */ 2448219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2449185029Spjd &spa->spa_l2cache.sav_object); 2450219089Spjd if (error != 0 && error != ENOENT) 2451219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2452219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2453185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2454185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2455219089Spjd &spa->spa_l2cache.sav_config) != 0) 2456219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2457185029Spjd 2458185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2459185029Spjd spa_load_l2cache(spa); 2460185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2461219089Spjd } else if (error == 0) { 2462219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2463185029Spjd } 2464185029Spjd 2465219089Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2466213197Smm 2467219089Spjd error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2468219089Spjd if (error && error != ENOENT) 
2469219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2470185029Spjd 2471219089Spjd if (error == 0) { 2472219089Spjd uint64_t autoreplace; 2473185029Spjd 2474219089Spjd spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2475219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2476219089Spjd spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2477219089Spjd spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2478219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2479219089Spjd spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2480219089Spjd &spa->spa_dedup_ditto); 2481185029Spjd 2482219089Spjd spa->spa_autoreplace = (autoreplace != 0); 2483168404Spjd } 2484168404Spjd 2485168404Spjd /* 2486185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 2487185029Spjd * the ZFS DE that it should not issue any faults for unopenable 2488185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 2489185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 2490185029Spjd * over. 2491185029Spjd */ 2492219089Spjd if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2493185029Spjd spa_check_removed(spa->spa_root_vdev); 2494219089Spjd /* 2495219089Spjd * For the import case, this is done in spa_import(), because 2496219089Spjd * at this point we're using the spare definitions from 2497219089Spjd * the MOS config, not necessarily from the userland config. 2498219089Spjd */ 2499219089Spjd if (state != SPA_LOAD_IMPORT) { 2500219089Spjd spa_aux_check_removed(&spa->spa_spares); 2501219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 2502219089Spjd } 2503219089Spjd } 2504185029Spjd 2505185029Spjd /* 2506168404Spjd * Load the vdev state for all toplevel vdevs. 2507168404Spjd */ 2508168404Spjd vdev_load(rvd); 2509168404Spjd 2510168404Spjd /* 2511168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 
2512168404Spjd */ 2513185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2514168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2515185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2516168404Spjd 2517168404Spjd /* 2518219089Spjd * Load the DDTs (dedup tables). 2519168404Spjd */ 2520219089Spjd error = ddt_load(spa); 2521219089Spjd if (error != 0) 2522219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2523219089Spjd 2524219089Spjd spa_update_dspace(spa); 2525219089Spjd 2526219089Spjd /* 2527219089Spjd * Validate the config, using the MOS config to fill in any 2528219089Spjd * information which might be missing. If we fail to validate 2529219089Spjd * the config then declare the pool unfit for use. If we're 2530219089Spjd * assembling a pool from a split, the log is not transferred 2531219089Spjd * over. 2532219089Spjd */ 2533219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2534219089Spjd nvlist_t *nvconfig; 2535219089Spjd 2536219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2537219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2538219089Spjd 2539219089Spjd if (!spa_config_valid(spa, nvconfig)) { 2540219089Spjd nvlist_free(nvconfig); 2541219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2542219089Spjd ENXIO)); 2543219089Spjd } 2544219089Spjd nvlist_free(nvconfig); 2545219089Spjd 2546219089Spjd /* 2547236884Smm * Now that we've validated the config, check the state of the 2548219089Spjd * root vdev. If it can't be opened, it indicates one or 2549219089Spjd * more toplevel vdevs are faulted. 
2550219089Spjd */ 2551219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2552249195Smm return (SET_ERROR(ENXIO)); 2553219089Spjd 2554219089Spjd if (spa_check_logs(spa)) { 2555219089Spjd *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2556219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2557219089Spjd } 2558168404Spjd } 2559168404Spjd 2560236884Smm if (missing_feat_write) { 2561236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 2562236884Smm 2563236884Smm /* 2564236884Smm * At this point, we know that we can open the pool in 2565236884Smm * read-only mode but not read-write mode. We now have enough 2566236884Smm * information and can return to userland. 2567236884Smm */ 2568236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2569236884Smm } 2570236884Smm 2571219089Spjd /* 2572219089Spjd * We've successfully opened the pool, verify that we're ready 2573219089Spjd * to start pushing transactions. 2574219089Spjd */ 2575219089Spjd if (state != SPA_LOAD_TRYIMPORT) { 2576219089Spjd if (error = spa_load_verify(spa)) 2577219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2578219089Spjd error)); 2579219089Spjd } 2580219089Spjd 2581219089Spjd if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2582219089Spjd spa->spa_load_max_txg == UINT64_MAX)) { 2583168404Spjd dmu_tx_t *tx; 2584168404Spjd int need_update = B_FALSE; 2585168404Spjd 2586209962Smm ASSERT(state != SPA_LOAD_TRYIMPORT); 2587209962Smm 2588168404Spjd /* 2589168404Spjd * Claim log blocks that haven't been committed yet. 2590168404Spjd * This must all happen in a single txg. 2591219089Spjd * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2592219089Spjd * invoked from zil_claim_log_block()'s i/o done callback. 2593219089Spjd * Price of rollback is that we abandon the log. 
2594168404Spjd */ 2595219089Spjd spa->spa_claiming = B_TRUE; 2596219089Spjd 2597168404Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2598168404Spjd spa_first_txg(spa)); 2599185029Spjd (void) dmu_objset_find(spa_name(spa), 2600168404Spjd zil_claim, tx, DS_FIND_CHILDREN); 2601168404Spjd dmu_tx_commit(tx); 2602168404Spjd 2603219089Spjd spa->spa_claiming = B_FALSE; 2604219089Spjd 2605219089Spjd spa_set_log_state(spa, SPA_LOG_GOOD); 2606168404Spjd spa->spa_sync_on = B_TRUE; 2607168404Spjd txg_sync_start(spa->spa_dsl_pool); 2608168404Spjd 2609168404Spjd /* 2610219089Spjd * Wait for all claims to sync. We sync up to the highest 2611219089Spjd * claimed log block birth time so that claimed log blocks 2612219089Spjd * don't appear to be from the future. spa_claim_max_txg 2613219089Spjd * will have been set for us by either zil_check_log_chain() 2614219089Spjd * (invoked from spa_check_logs()) or zil_claim() above. 2615168404Spjd */ 2616219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2617168404Spjd 2618168404Spjd /* 2619168404Spjd * If the config cache is stale, or we have uninitialized 2620168404Spjd * metaslabs (see spa_vdev_add()), then update the config. 2621209962Smm * 2622219089Spjd * If this is a verbatim import, trust the current 2623209962Smm * in-core spa_config and update the disk labels. 2624168404Spjd */ 2625168404Spjd if (config_cache_txg != spa->spa_config_txg || 2626219089Spjd state == SPA_LOAD_IMPORT || 2627219089Spjd state == SPA_LOAD_RECOVER || 2628219089Spjd (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2629168404Spjd need_update = B_TRUE; 2630168404Spjd 2631209962Smm for (int c = 0; c < rvd->vdev_children; c++) 2632168404Spjd if (rvd->vdev_child[c]->vdev_ms_array == 0) 2633168404Spjd need_update = B_TRUE; 2634168404Spjd 2635168404Spjd /* 2636168404Spjd * Update the config cache asychronously in case we're the 2637168404Spjd * root pool, in which case the config cache isn't writable yet. 
2638168404Spjd */ 2639168404Spjd if (need_update) 2640168404Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2641208683Spjd 2642208683Spjd /* 2643208683Spjd * Check all DTLs to see if anything needs resilvering. 2644208683Spjd */ 2645219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2646219089Spjd vdev_resilver_needed(rvd, NULL, NULL)) 2647208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 2648219089Spjd 2649219089Spjd /* 2650248571Smm * Log the fact that we booted up (so that we can detect if 2651248571Smm * we rebooted in the middle of an operation). 2652248571Smm */ 2653248571Smm spa_history_log_version(spa, "open"); 2654248571Smm 2655248571Smm /* 2656219089Spjd * Delete any inconsistent datasets. 2657219089Spjd */ 2658219089Spjd (void) dmu_objset_find(spa_name(spa), 2659219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2660219089Spjd 2661219089Spjd /* 2662219089Spjd * Clean up any stale temporary dataset userrefs. 2663219089Spjd */ 2664219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2665168404Spjd } 2666168404Spjd 2667219089Spjd return (0); 2668219089Spjd} 2669168404Spjd 2670219089Spjdstatic int 2671219089Spjdspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2672219089Spjd{ 2673219089Spjd int mode = spa->spa_mode; 2674219089Spjd 2675219089Spjd spa_unload(spa); 2676219089Spjd spa_deactivate(spa); 2677219089Spjd 2678219089Spjd spa->spa_load_max_txg--; 2679219089Spjd 2680219089Spjd spa_activate(spa, mode); 2681219089Spjd spa_async_suspend(spa); 2682219089Spjd 2683219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2684168404Spjd} 2685168404Spjd 2686236884Smm/* 2687236884Smm * If spa_load() fails this function will try loading prior txg's. If 2688236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2689236884Smm * will be rewound to that txg. 
If 'state' is not SPA_LOAD_RECOVER this 2690236884Smm * function will not rewind the pool and will return the same error as 2691236884Smm * spa_load(). 2692236884Smm */ 2693219089Spjdstatic int 2694219089Spjdspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2695219089Spjd uint64_t max_request, int rewind_flags) 2696219089Spjd{ 2697236884Smm nvlist_t *loadinfo = NULL; 2698219089Spjd nvlist_t *config = NULL; 2699219089Spjd int load_error, rewind_error; 2700219089Spjd uint64_t safe_rewind_txg; 2701219089Spjd uint64_t min_txg; 2702219089Spjd 2703219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2704219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 2705219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2706219089Spjd } else { 2707219089Spjd spa->spa_load_max_txg = max_request; 2708219089Spjd } 2709219089Spjd 2710219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2711219089Spjd mosconfig); 2712219089Spjd if (load_error == 0) 2713219089Spjd return (0); 2714219089Spjd 2715219089Spjd if (spa->spa_root_vdev != NULL) 2716219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2717219089Spjd 2718219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2719219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2720219089Spjd 2721219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 2722219089Spjd nvlist_free(config); 2723219089Spjd return (load_error); 2724219089Spjd } 2725219089Spjd 2726236884Smm if (state == SPA_LOAD_RECOVER) { 2727236884Smm /* Price of rolling back is discarding txgs, including log */ 2728219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2729236884Smm } else { 2730236884Smm /* 2731236884Smm * If we aren't rolling back save the load info from our first 2732236884Smm * import attempt so that we can restore it after attempting 2733236884Smm * to rewind. 
2734236884Smm */ 2735236884Smm loadinfo = spa->spa_load_info; 2736236884Smm spa->spa_load_info = fnvlist_alloc(); 2737236884Smm } 2738219089Spjd 2739219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2740219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2741219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 2742219089Spjd TXG_INITIAL : safe_rewind_txg; 2743219089Spjd 2744219089Spjd /* 2745219089Spjd * Continue as long as we're finding errors, we're still within 2746219089Spjd * the acceptable rewind range, and we're still finding uberblocks 2747219089Spjd */ 2748219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2749219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2750219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 2751219089Spjd spa->spa_extreme_rewind = B_TRUE; 2752219089Spjd rewind_error = spa_load_retry(spa, state, mosconfig); 2753219089Spjd } 2754219089Spjd 2755219089Spjd spa->spa_extreme_rewind = B_FALSE; 2756219089Spjd spa->spa_load_max_txg = UINT64_MAX; 2757219089Spjd 2758219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2759219089Spjd spa_config_set(spa, config); 2760219089Spjd 2761236884Smm if (state == SPA_LOAD_RECOVER) { 2762236884Smm ASSERT3P(loadinfo, ==, NULL); 2763236884Smm return (rewind_error); 2764236884Smm } else { 2765236884Smm /* Store the rewind info as part of the initial load info */ 2766236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2767236884Smm spa->spa_load_info); 2768236884Smm 2769236884Smm /* Restore the initial load info */ 2770236884Smm fnvlist_free(spa->spa_load_info); 2771236884Smm spa->spa_load_info = loadinfo; 2772236884Smm 2773236884Smm return (load_error); 2774236884Smm } 2775219089Spjd} 2776219089Spjd 2777168404Spjd/* 2778168404Spjd * Pool Open/Import 2779168404Spjd * 2780168404Spjd * The import case is identical to an open except that the configuration is sent 2781168404Spjd * down from userland, 
instead of grabbed from the configuration cache. For the 2782168404Spjd * case of an open, the pool configuration will exist in the 2783185029Spjd * POOL_STATE_UNINITIALIZED state. 2784168404Spjd * 2785168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 2786168404Spjd * the same time open the pool, without having to keep around the spa_t in some 2787168404Spjd * ambiguous state. 2788168404Spjd */ 2789168404Spjdstatic int 2790219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2791219089Spjd nvlist_t **config) 2792168404Spjd{ 2793168404Spjd spa_t *spa; 2794219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 2795168404Spjd int error; 2796168404Spjd int locked = B_FALSE; 2797219089Spjd int firstopen = B_FALSE; 2798168404Spjd 2799168404Spjd *spapp = NULL; 2800168404Spjd 2801168404Spjd /* 2802168404Spjd * As disgusting as this is, we need to support recursive calls to this 2803168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 2804168404Spjd * up calling spa_open() again. The real fix is to figure out how to 2805168404Spjd * avoid dsl_dir_open() calling this in the first place. 2806168404Spjd */ 2807168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 2808168404Spjd mutex_enter(&spa_namespace_lock); 2809168404Spjd locked = B_TRUE; 2810168404Spjd } 2811168404Spjd 2812168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 2813168404Spjd if (locked) 2814168404Spjd mutex_exit(&spa_namespace_lock); 2815249195Smm return (SET_ERROR(ENOENT)); 2816168404Spjd } 2817219089Spjd 2818168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2819219089Spjd zpool_rewind_policy_t policy; 2820168404Spjd 2821219089Spjd firstopen = B_TRUE; 2822219089Spjd 2823219089Spjd zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 2824219089Spjd &policy); 2825219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 2826219089Spjd state = SPA_LOAD_RECOVER; 2827219089Spjd 2828209962Smm spa_activate(spa, spa_mode_global); 2829168404Spjd 2830219089Spjd if (state != SPA_LOAD_RECOVER) 2831219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2832168404Spjd 2833219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2834219089Spjd policy.zrp_request); 2835219089Spjd 2836168404Spjd if (error == EBADF) { 2837168404Spjd /* 2838168404Spjd * If vdev_validate() returns failure (indicated by 2839168404Spjd * EBADF), it indicates that one of the vdevs indicates 2840168404Spjd * that the pool has been exported or destroyed. If 2841168404Spjd * this is the case, the config cache is out of sync and 2842168404Spjd * we should remove the pool from the namespace. 2843168404Spjd */ 2844168404Spjd spa_unload(spa); 2845168404Spjd spa_deactivate(spa); 2846185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE); 2847168404Spjd spa_remove(spa); 2848168404Spjd if (locked) 2849168404Spjd mutex_exit(&spa_namespace_lock); 2850249195Smm return (SET_ERROR(ENOENT)); 2851168404Spjd } 2852168404Spjd 2853168404Spjd if (error) { 2854168404Spjd /* 2855168404Spjd * We can't open the pool, but we still have useful 2856168404Spjd * information: the state of each vdev after the 2857168404Spjd * attempted vdev_open(). Return this to the user. 
2858168404Spjd */ 2859219089Spjd if (config != NULL && spa->spa_config) { 2860219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 2861219089Spjd KM_SLEEP) == 0); 2862219089Spjd VERIFY(nvlist_add_nvlist(*config, 2863219089Spjd ZPOOL_CONFIG_LOAD_INFO, 2864219089Spjd spa->spa_load_info) == 0); 2865219089Spjd } 2866168404Spjd spa_unload(spa); 2867168404Spjd spa_deactivate(spa); 2868219089Spjd spa->spa_last_open_failed = error; 2869168404Spjd if (locked) 2870168404Spjd mutex_exit(&spa_namespace_lock); 2871168404Spjd *spapp = NULL; 2872168404Spjd return (error); 2873168404Spjd } 2874168404Spjd } 2875168404Spjd 2876168404Spjd spa_open_ref(spa, tag); 2877185029Spjd 2878219089Spjd if (config != NULL) 2879219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2880219089Spjd 2881219089Spjd /* 2882219089Spjd * If we've recovered the pool, pass back any information we 2883219089Spjd * gathered while doing the load. 2884219089Spjd */ 2885219089Spjd if (state == SPA_LOAD_RECOVER) { 2886219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2887219089Spjd spa->spa_load_info) == 0); 2888219089Spjd } 2889219089Spjd 2890219089Spjd if (locked) { 2891219089Spjd spa->spa_last_open_failed = 0; 2892219089Spjd spa->spa_last_ubsync_txg = 0; 2893219089Spjd spa->spa_load_txg = 0; 2894168404Spjd mutex_exit(&spa_namespace_lock); 2895219089Spjd#ifdef __FreeBSD__ 2896219089Spjd#ifdef _KERNEL 2897219089Spjd if (firstopen) 2898249047Savg zvol_create_minors(spa->spa_name); 2899219089Spjd#endif 2900219089Spjd#endif 2901219089Spjd } 2902168404Spjd 2903168404Spjd *spapp = spa; 2904168404Spjd 2905168404Spjd return (0); 2906168404Spjd} 2907168404Spjd 2908168404Spjdint 2909219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2910219089Spjd nvlist_t **config) 2911219089Spjd{ 2912219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 2913219089Spjd} 2914219089Spjd 2915219089Spjdint 2916168404Spjdspa_open(const char *name, spa_t 
**spapp, void *tag) 2917168404Spjd{ 2918219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 2919168404Spjd} 2920168404Spjd 2921168404Spjd/* 2922168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 2923168404Spjd * preventing it from being exported or destroyed. 2924168404Spjd */ 2925168404Spjdspa_t * 2926168404Spjdspa_inject_addref(char *name) 2927168404Spjd{ 2928168404Spjd spa_t *spa; 2929168404Spjd 2930168404Spjd mutex_enter(&spa_namespace_lock); 2931168404Spjd if ((spa = spa_lookup(name)) == NULL) { 2932168404Spjd mutex_exit(&spa_namespace_lock); 2933168404Spjd return (NULL); 2934168404Spjd } 2935168404Spjd spa->spa_inject_ref++; 2936168404Spjd mutex_exit(&spa_namespace_lock); 2937168404Spjd 2938168404Spjd return (spa); 2939168404Spjd} 2940168404Spjd 2941168404Spjdvoid 2942168404Spjdspa_inject_delref(spa_t *spa) 2943168404Spjd{ 2944168404Spjd mutex_enter(&spa_namespace_lock); 2945168404Spjd spa->spa_inject_ref--; 2946168404Spjd mutex_exit(&spa_namespace_lock); 2947168404Spjd} 2948168404Spjd 2949185029Spjd/* 2950185029Spjd * Add spares device information to the nvlist. 
2951185029Spjd */ 2952168404Spjdstatic void 2953168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 2954168404Spjd{ 2955168404Spjd nvlist_t **spares; 2956168404Spjd uint_t i, nspares; 2957168404Spjd nvlist_t *nvroot; 2958168404Spjd uint64_t guid; 2959168404Spjd vdev_stat_t *vs; 2960168404Spjd uint_t vsc; 2961168404Spjd uint64_t pool; 2962168404Spjd 2963209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2964209962Smm 2965185029Spjd if (spa->spa_spares.sav_count == 0) 2966168404Spjd return; 2967168404Spjd 2968168404Spjd VERIFY(nvlist_lookup_nvlist(config, 2969168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2970185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2971168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2972168404Spjd if (nspares != 0) { 2973168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 2974168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2975168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 2976168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2977168404Spjd 2978168404Spjd /* 2979168404Spjd * Go through and find any spares which have since been 2980168404Spjd * repurposed as an active spare. If this is the case, update 2981168404Spjd * their status appropriately. 2982168404Spjd */ 2983168404Spjd for (i = 0; i < nspares; i++) { 2984168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 2985168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 2986185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 2987185029Spjd pool != 0ULL) { 2988168404Spjd VERIFY(nvlist_lookup_uint64_array( 2989219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 2990168404Spjd (uint64_t **)&vs, &vsc) == 0); 2991168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 2992168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 2993168404Spjd } 2994168404Spjd } 2995168404Spjd } 2996168404Spjd} 2997168404Spjd 2998185029Spjd/* 2999185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 
3000185029Spjd */ 3001185029Spjdstatic void 3002185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 3003185029Spjd{ 3004185029Spjd nvlist_t **l2cache; 3005185029Spjd uint_t i, j, nl2cache; 3006185029Spjd nvlist_t *nvroot; 3007185029Spjd uint64_t guid; 3008185029Spjd vdev_t *vd; 3009185029Spjd vdev_stat_t *vs; 3010185029Spjd uint_t vsc; 3011185029Spjd 3012209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3013209962Smm 3014185029Spjd if (spa->spa_l2cache.sav_count == 0) 3015185029Spjd return; 3016185029Spjd 3017185029Spjd VERIFY(nvlist_lookup_nvlist(config, 3018185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3019185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3020185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3021185029Spjd if (nl2cache != 0) { 3022185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3023185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3024185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3025185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3026185029Spjd 3027185029Spjd /* 3028185029Spjd * Update level 2 cache device stats. 
3029185029Spjd */ 3030185029Spjd 3031185029Spjd for (i = 0; i < nl2cache; i++) { 3032185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 3033185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3034185029Spjd 3035185029Spjd vd = NULL; 3036185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3037185029Spjd if (guid == 3038185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3039185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 3040185029Spjd break; 3041185029Spjd } 3042185029Spjd } 3043185029Spjd ASSERT(vd != NULL); 3044185029Spjd 3045185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3046219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3047219089Spjd == 0); 3048185029Spjd vdev_get_stats(vd, vs); 3049185029Spjd } 3050185029Spjd } 3051185029Spjd} 3052185029Spjd 3053236884Smmstatic void 3054236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 3055236884Smm{ 3056236884Smm nvlist_t *features; 3057236884Smm zap_cursor_t zc; 3058236884Smm zap_attribute_t za; 3059236884Smm 3060236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3061236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3062236884Smm 3063253993Smav /* We may be unable to read features if pool is suspended. 
*/ 3064253993Smav if (spa_suspended(spa)) 3065253993Smav goto out; 3066253993Smav 3067236884Smm if (spa->spa_feat_for_read_obj != 0) { 3068236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3069236884Smm spa->spa_feat_for_read_obj); 3070236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3071236884Smm zap_cursor_advance(&zc)) { 3072236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3073236884Smm za.za_num_integers == 1); 3074236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3075236884Smm za.za_first_integer)); 3076236884Smm } 3077236884Smm zap_cursor_fini(&zc); 3078236884Smm } 3079236884Smm 3080236884Smm if (spa->spa_feat_for_write_obj != 0) { 3081236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3082236884Smm spa->spa_feat_for_write_obj); 3083236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3084236884Smm zap_cursor_advance(&zc)) { 3085236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3086236884Smm za.za_num_integers == 1); 3087236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3088236884Smm za.za_first_integer)); 3089236884Smm } 3090236884Smm zap_cursor_fini(&zc); 3091236884Smm } 3092236884Smm 3093253993Smavout: 3094236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3095236884Smm features) == 0); 3096236884Smm nvlist_free(features); 3097236884Smm} 3098236884Smm 3099168404Spjdint 3100236884Smmspa_get_stats(const char *name, nvlist_t **config, 3101236884Smm char *altroot, size_t buflen) 3102168404Spjd{ 3103168404Spjd int error; 3104168404Spjd spa_t *spa; 3105168404Spjd 3106168404Spjd *config = NULL; 3107219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 3108168404Spjd 3109209962Smm if (spa != NULL) { 3110209962Smm /* 3111209962Smm * This still leaves a window of inconsistency where the spares 3112209962Smm * or l2cache devices could change and the config would be 3113209962Smm * self-inconsistent. 
3114209962Smm */ 3115209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3116168404Spjd 3117209962Smm if (*config != NULL) { 3118219089Spjd uint64_t loadtimes[2]; 3119219089Spjd 3120219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3121219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3122219089Spjd VERIFY(nvlist_add_uint64_array(*config, 3123219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3124219089Spjd 3125185029Spjd VERIFY(nvlist_add_uint64(*config, 3126209962Smm ZPOOL_CONFIG_ERRCOUNT, 3127209962Smm spa_get_errlog_size(spa)) == 0); 3128185029Spjd 3129209962Smm if (spa_suspended(spa)) 3130209962Smm VERIFY(nvlist_add_uint64(*config, 3131209962Smm ZPOOL_CONFIG_SUSPENDED, 3132209962Smm spa->spa_failmode) == 0); 3133209962Smm 3134209962Smm spa_add_spares(spa, *config); 3135209962Smm spa_add_l2cache(spa, *config); 3136236884Smm spa_add_feature_stats(spa, *config); 3137209962Smm } 3138168404Spjd } 3139168404Spjd 3140168404Spjd /* 3141168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 3142168404Spjd * and call spa_lookup() directly. 3143168404Spjd */ 3144168404Spjd if (altroot) { 3145168404Spjd if (spa == NULL) { 3146168404Spjd mutex_enter(&spa_namespace_lock); 3147168404Spjd spa = spa_lookup(name); 3148168404Spjd if (spa) 3149168404Spjd spa_altroot(spa, altroot, buflen); 3150168404Spjd else 3151168404Spjd altroot[0] = '\0'; 3152168404Spjd spa = NULL; 3153168404Spjd mutex_exit(&spa_namespace_lock); 3154168404Spjd } else { 3155168404Spjd spa_altroot(spa, altroot, buflen); 3156168404Spjd } 3157168404Spjd } 3158168404Spjd 3159209962Smm if (spa != NULL) { 3160209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 3161168404Spjd spa_close(spa, FTAG); 3162209962Smm } 3163168404Spjd 3164168404Spjd return (error); 3165168404Spjd} 3166168404Spjd 3167168404Spjd/* 3168185029Spjd * Validate that the auxiliary device array is well formed. We must have an 3169185029Spjd * array of nvlists, each which describes a valid leaf vdev. 
If this is an 3170185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3171185029Spjd * specified, as long as they are well-formed. 3172168404Spjd */ 3173168404Spjdstatic int 3174185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3175185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 3176185029Spjd vdev_labeltype_t label) 3177168404Spjd{ 3178185029Spjd nvlist_t **dev; 3179185029Spjd uint_t i, ndev; 3180168404Spjd vdev_t *vd; 3181168404Spjd int error; 3182168404Spjd 3183185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3184185029Spjd 3185168404Spjd /* 3186185029Spjd * It's acceptable to have no devs specified. 3187168404Spjd */ 3188185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3189168404Spjd return (0); 3190168404Spjd 3191185029Spjd if (ndev == 0) 3192249195Smm return (SET_ERROR(EINVAL)); 3193168404Spjd 3194168404Spjd /* 3195185029Spjd * Make sure the pool is formatted with a version that supports this 3196185029Spjd * device type. 3197168404Spjd */ 3198185029Spjd if (spa_version(spa) < version) 3199249195Smm return (SET_ERROR(ENOTSUP)); 3200168404Spjd 3201168404Spjd /* 3202185029Spjd * Set the pending device list so we correctly handle device in-use 3203168404Spjd * checking. 3204168404Spjd */ 3205185029Spjd sav->sav_pending = dev; 3206185029Spjd sav->sav_npending = ndev; 3207168404Spjd 3208185029Spjd for (i = 0; i < ndev; i++) { 3209185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3210168404Spjd mode)) != 0) 3211168404Spjd goto out; 3212168404Spjd 3213168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 3214168404Spjd vdev_free(vd); 3215249195Smm error = SET_ERROR(EINVAL); 3216168404Spjd goto out; 3217168404Spjd } 3218168404Spjd 3219185029Spjd /* 3220185029Spjd * The L2ARC currently only supports disk devices in 3221185029Spjd * kernel context. For user-level testing, we allow it. 
3222185029Spjd */ 3223185029Spjd#ifdef _KERNEL 3224185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3225185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3226249195Smm error = SET_ERROR(ENOTBLK); 3227230514Smm vdev_free(vd); 3228185029Spjd goto out; 3229185029Spjd } 3230185029Spjd#endif 3231168404Spjd vd->vdev_top = vd; 3232168404Spjd 3233168404Spjd if ((error = vdev_open(vd)) == 0 && 3234185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 3235185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3236168404Spjd vd->vdev_guid) == 0); 3237168404Spjd } 3238168404Spjd 3239168404Spjd vdev_free(vd); 3240168404Spjd 3241185029Spjd if (error && 3242185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3243168404Spjd goto out; 3244168404Spjd else 3245168404Spjd error = 0; 3246168404Spjd } 3247168404Spjd 3248168404Spjdout: 3249185029Spjd sav->sav_pending = NULL; 3250185029Spjd sav->sav_npending = 0; 3251168404Spjd return (error); 3252168404Spjd} 3253168404Spjd 3254185029Spjdstatic int 3255185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3256185029Spjd{ 3257185029Spjd int error; 3258185029Spjd 3259185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3260185029Spjd 3261185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3262185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3263185029Spjd VDEV_LABEL_SPARE)) != 0) { 3264185029Spjd return (error); 3265185029Spjd } 3266185029Spjd 3267185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3268185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3269185029Spjd VDEV_LABEL_L2CACHE)); 3270185029Spjd} 3271185029Spjd 3272185029Spjdstatic void 3273185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3274185029Spjd const char *config) 3275185029Spjd{ 3276185029Spjd int i; 3277185029Spjd 3278185029Spjd if (sav->sav_config != 
NULL) { 3279185029Spjd nvlist_t **olddevs; 3280185029Spjd uint_t oldndevs; 3281185029Spjd nvlist_t **newdevs; 3282185029Spjd 3283185029Spjd /* 3284185029Spjd * Generate new dev list by concatentating with the 3285185029Spjd * current dev list. 3286185029Spjd */ 3287185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3288185029Spjd &olddevs, &oldndevs) == 0); 3289185029Spjd 3290185029Spjd newdevs = kmem_alloc(sizeof (void *) * 3291185029Spjd (ndevs + oldndevs), KM_SLEEP); 3292185029Spjd for (i = 0; i < oldndevs; i++) 3293185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3294185029Spjd KM_SLEEP) == 0); 3295185029Spjd for (i = 0; i < ndevs; i++) 3296185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3297185029Spjd KM_SLEEP) == 0); 3298185029Spjd 3299185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 3300185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 3301185029Spjd 3302185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3303185029Spjd config, newdevs, ndevs + oldndevs) == 0); 3304185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 3305185029Spjd nvlist_free(newdevs[i]); 3306185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3307185029Spjd } else { 3308185029Spjd /* 3309185029Spjd * Generate a new dev list. 
	 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}

/*
 * Stop and drop level 2 ARC devices.
 *
 * Walks this pool's l2cache aux vdev list and removes from the l2arc
 * every cache vdev that the l2arc currently knows about.  Vdevs whose
 * guid is not registered with the l2arc (spa_l2cache_exists() fails or
 * reports owning pool 0) or that are not l2arc-present are skipped.
 */
void
spa_l2cache_drop(spa_t *spa)
{
	vdev_t *vd;
	int i;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	for (i = 0; i < sav->sav_count; i++) {
		uint64_t pool;

		vd = sav->sav_vdevs[i];
		ASSERT(vd != NULL);

		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
		    pool != 0ULL && l2arc_vdev_present(vd))
			l2arc_remove_vdev(vd);
	}
}

/*
 * Pool Creation
 *
 * Create a new pool named 'pool' from the vdev tree 'nvroot', with
 * optional pool properties 'props' and root-dataset (zpl) properties
 * 'zplprops'.  Returns 0 on success or an errno on failure (EEXIST if a
 * pool of that name already exists, EINVAL if 'nvroot' contains no
 * allocatable devices, or whatever vdev/aux validation returns).
 * Acquires and releases spa_namespace_lock internally.
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    nvlist_t *zplprops)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t version, obj;
	boolean_t has_features;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EEXIST));
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, NULL, altroot);
	spa_activate(spa, spa_mode_global);

	if (props && (error = spa_prop_validate(spa, props))) {
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * If any requested property is a feature@... property, force the
	 * pool version to SPA_VERSION (features imply the newest version);
	 * otherwise honor an explicitly requested ZPOOL_PROP_VERSION.
	 */
	has_features = B_FALSE;
	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
		if (zpool_prop_feature(nvpair_name(elem)))
			has_features = B_TRUE;
	}

	if (has_features || nvlist_lookup_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
		version = SPA_VERSION;
	}
	ASSERT(SPA_VERSION_IS_SUPPORTED(version));

	/* ub_txg starts one behind spa_first_txg; the first sync advances it. */
	spa->spa_first_txg = txg;
	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
		error = SET_ERROR(EINVAL);

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_aux(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_metaslab_set_size(rvd->vdev_child[c]);
			vdev_expand(rvd->vdev_child[c], txg);
		}
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Get the list of level 2 cache devices, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_is_initializing = B_TRUE;
	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;
	spa->spa_is_initializing = B_FALSE;

	/*
	 * Create DDTs (dedup tables).
	 */
	ddt_create(spa);

	spa_update_dspace(spa);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	if (spa_version(spa) >= SPA_VERSION_FEATURES)
		spa_feature_create_zap_objects(spa, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
	    sizeof (uint64_t), 1, &version, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool version");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bpobj.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, obj,
	    ZIO_COMPRESS_OFF, tx);
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
	    sizeof (uint64_t), 1, &obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bpobj");
	}
	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
	    spa->spa_meta_objset, obj));

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);

	if (props != NULL) {
		spa_configfile_set(spa, props, B_FALSE);
		spa_sync_props(props, tx);
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync(spa, B_FALSE, B_TRUE);

	spa_history_log_version(spa, "create");

	/*
	 * Snapshot the current refcount as the floor for "pool is busy"
	 * checks (references taken during create don't count as users).
	 */
	spa->spa_minref = refcount_count(&spa->spa_refcount);

	mutex_exit(&spa_namespace_lock);

	return (0);
}

#ifdef _KERNEL
#if defined(sun)
/*
 * Get the root pool information from the root disk, then import the root pool
 * during the system boot up time.
 */
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);

/*
 * Build a pool config nvlist from the label of the boot device.
 * On success *guid receives the boot vdev's guid; returns NULL if the
 * root label cannot be read.  Caller frees the returned nvlist.
 */
static nvlist_t *
spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
	nvlist_t *config;
	nvlist_t *nvtop, *nvroot;
	uint64_t pgid;

	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
		return (NULL);

	/*
	 * Add this top-level vdev to the child array.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pgid) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);

	/*
	 * Put this pool's top-level vdevs into a root vdev.
	 */
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &nvtop, 1) == 0);

	/*
	 * Replace the existing vdev_tree with the new root vdev in
	 * this pool's configuration (remove the old, add the new).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	nvlist_free(nvroot);
	return (config);
}

/*
 * Walk the vdev tree and see if we can find a device with "better"
 * configuration. A configuration is "better" if the label on that
 * device has a more recent txg.
 *
 * On return, *avd points at the best leaf found so far and *txg holds
 * its label txg; both are in/out parameters seeded by the caller.
 */
static void
spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);

	if (vd->vdev_ops->vdev_op_leaf) {
		nvlist_t *label;
		uint64_t label_txg;

		/* A leaf whose label can't be read simply doesn't compete. */
		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
		    &label) != 0)
			return;

		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
		    &label_txg) == 0);

		/*
		 * Do we have a better boot device?
		 */
		if (label_txg > *txg) {
			*txg = label_txg;
			*avd = vd;
		}
		nvlist_free(label);
	}
}

/*
 * Import a root pool.
 *
 * For x86. devpath_list will consist of devid and/or physpath name of
 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
 * The GRUB "findroot" command will return the vdev we should boot.
 *
 * For Sparc, devpath_list consists the physpath name of the booting device
 * no matter the rootpool is a single device pool or a mirrored pool.
 * e.g.
 *	"/pci@1f,0/ide@d/disk@0,0:a"
 *
 * Returns 0 on success; EIO if the pool label cannot be read, ENOENT if
 * the boot vdev guid is not found in the parsed tree, or EINVAL if a
 * more recent label exists on another device or the boot device is an
 * inactive spare.
 */
int
spa_import_rootpool(char *devpath, char *devid)
{
	spa_t *spa;
	vdev_t *rvd, *bvd, *avd = NULL;
	nvlist_t *config, *nvtop;
	uint64_t guid, txg;
	char *pname;
	int error;

	/*
	 * Read the label from the boot device and generate a configuration.
	 */
	config = spa_generate_rootconf(devpath, devid, &guid);
#if defined(_OBP) && defined(_KERNEL)
	if (config == NULL) {
		if (strstr(devpath, "/iscsi/ssd") != NULL) {
			/* iscsi boot */
			get_iscsi_bootpath_phy(devpath);
			config = spa_generate_rootconf(devpath, devid, &guid);
		}
	}
#endif
	if (config == NULL) {
		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
		    devpath);
		return (SET_ERROR(EIO));
	}

	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &pname) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pname)) != NULL) {
		/*
		 * Remove the existing root pool from the namespace so that we
		 * can replace it with the correct config we just read in.
		 */
		spa_remove(spa);
	}

	spa = spa_add(pname, config, NULL);
	spa->spa_is_root = B_TRUE;
	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;

	/*
	 * Build up a vdev tree based on the boot device's label config.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
	    VDEV_ALLOC_ROOTPOOL);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error) {
		mutex_exit(&spa_namespace_lock);
		nvlist_free(config);
		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
		    pname);
		return (error);
	}

	/*
	 * Get the boot vdev.
	 */
	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
		    (u_longlong_t)guid);
		error = SET_ERROR(ENOENT);
		goto out;
	}

	/*
	 * Determine if there is a better boot device.
	 */
	avd = bvd;
	spa_alt_rootvdev(rvd, &avd, &txg);
	if (avd != bvd) {
		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
		    "try booting from '%s'", avd->vdev_path);
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/*
	 * If the boot device is part of a spare vdev then ensure that
	 * we're booting off the active spare.
	 */
	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    !bvd->vdev_isspare) {
		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
		    "try booting from '%s'",
		    bvd->vdev_parent->
		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
		error = SET_ERROR(EINVAL);
		goto out;
	}

	error = 0;
out:
	/* Common exit: tear down the temporary vdev tree and unlock. */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_free(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	mutex_exit(&spa_namespace_lock);

	nvlist_free(config);
	return (error);
}

#else

extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
    uint64_t *count);

/*
 * FreeBSD variant: build a pool config for pool 'name' from the labels
 * that GEOM can find, choosing the config with the most recent txg.
 * Returns NULL if no label is found; caller frees the returned nvlist.
 */
static nvlist_t *
spa_generate_rootconf(const char *name)
{
	nvlist_t **configs, **tops;
	nvlist_t *config;
	nvlist_t *best_cfg, *nvtop, *nvroot;
	uint64_t *holes;
	uint64_t best_txg;
	uint64_t nchildren;
	uint64_t pgid;
	uint64_t count;
	uint64_t i;
	uint_t nholes;

	if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
		return (NULL);

	/* Pick the label copy with the highest (most recent) pool txg. */
	ASSERT3U(count, !=, 0);
	best_txg = 0;
	for (i = 0; i < count; i++) {
		uint64_t txg;

		VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
		    &txg) == 0);
		if (txg > best_txg) {
			best_txg = txg;
			best_cfg = configs[i];
		}
	}

	/*
	 * Multi-vdev root pool configuration discovery is not supported yet.
3812243213Savg */ 3813245945Savg nchildren = 1; 3814245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 3815243502Savg holes = NULL; 3816243502Savg nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 3817243502Savg &holes, &nholes); 3818243502Savg 3819244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 3820243502Savg for (i = 0; i < nchildren; i++) { 3821243502Savg if (i >= count) 3822243502Savg break; 3823243502Savg if (configs[i] == NULL) 3824243502Savg continue; 3825243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 3826243502Savg &nvtop) == 0); 3827243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP); 3828243213Savg } 3829243502Savg for (i = 0; holes != NULL && i < nholes; i++) { 3830243502Savg if (i >= nchildren) 3831243502Savg continue; 3832243502Savg if (tops[holes[i]] != NULL) 3833243502Savg continue; 3834243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 3835243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 3836243502Savg VDEV_TYPE_HOLE) == 0); 3837243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 3838243502Savg holes[i]) == 0); 3839243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 3840243502Savg 0) == 0); 3841243502Savg } 3842243502Savg for (i = 0; i < nchildren; i++) { 3843243502Savg if (tops[i] != NULL) 3844243502Savg continue; 3845243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 3846243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 3847243502Savg VDEV_TYPE_MISSING) == 0); 3848243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 3849243502Savg i) == 0); 3850243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 3851243502Savg 0) == 0); 3852243502Savg } 3853243213Savg 3854243213Savg /* 3855243502Savg * Create pool config based on the best vdev config. 
3856241286Savg */ 3857243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 3858241286Savg 3859241286Savg /* 3860241286Savg * Put this pool's top-level vdevs into a root vdev. 3861241286Savg */ 3862243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3863243502Savg &pgid) == 0); 3864241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3865241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3866241286Savg VDEV_TYPE_ROOT) == 0); 3867241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3868241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3869241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3870243502Savg tops, nchildren) == 0); 3871241286Savg 3872241286Savg /* 3873241286Savg * Replace the existing vdev_tree with the new root vdev in 3874241286Savg * this pool's configuration (remove the old, add the new). 3875241286Savg */ 3876241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3877243502Savg 3878243502Savg /* 3879243502Savg * Drop vdev config elements that should not be present at pool level. 
3880243502Savg */ 3881243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 3882243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 3883243502Savg 3884243502Savg for (i = 0; i < count; i++) 3885243502Savg nvlist_free(configs[i]); 3886243502Savg kmem_free(configs, count * sizeof(void *)); 3887243502Savg for (i = 0; i < nchildren; i++) 3888243502Savg nvlist_free(tops[i]); 3889243502Savg kmem_free(tops, nchildren * sizeof(void *)); 3890241286Savg nvlist_free(nvroot); 3891241286Savg return (config); 3892241286Savg} 3893241286Savg 3894241286Savgint 3895241286Savgspa_import_rootpool(const char *name) 3896241286Savg{ 3897241286Savg spa_t *spa; 3898241286Savg vdev_t *rvd, *bvd, *avd = NULL; 3899241286Savg nvlist_t *config, *nvtop; 3900241286Savg uint64_t txg; 3901241286Savg char *pname; 3902241286Savg int error; 3903241286Savg 3904241286Savg /* 3905241286Savg * Read the label from the boot device and generate a configuration. 3906241286Savg */ 3907241286Savg config = spa_generate_rootconf(name); 3908243213Savg 3909243213Savg mutex_enter(&spa_namespace_lock); 3910243213Savg if (config != NULL) { 3911243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3912243213Savg &pname) == 0 && strcmp(name, pname) == 0); 3913243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 3914243213Savg == 0); 3915243213Savg 3916243213Savg if ((spa = spa_lookup(pname)) != NULL) { 3917243213Savg /* 3918243213Savg * Remove the existing root pool from the namespace so 3919243213Savg * that we can replace it with the correct config 3920243213Savg * we just read in. 3921243213Savg */ 3922243213Savg spa_remove(spa); 3923243213Savg } 3924243213Savg spa = spa_add(pname, config, NULL); 3925243501Savg 3926243501Savg /* 3927243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 3928243501Savg * via spa_version(). 
3929243501Savg */ 3930243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3931243501Savg &spa->spa_ubsync.ub_version) != 0) 3932243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3933243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 3934241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 3935241286Savg name); 3936241286Savg return (EIO); 3937243213Savg } else { 3938243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 3939241286Savg } 3940241286Savg spa->spa_is_root = B_TRUE; 3941241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3942241286Savg 3943241286Savg /* 3944241286Savg * Build up a vdev tree based on the boot device's label config. 3945241286Savg */ 3946241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3947241286Savg &nvtop) == 0); 3948241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3949241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3950241286Savg VDEV_ALLOC_ROOTPOOL); 3951241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 3952241286Savg if (error) { 3953241286Savg mutex_exit(&spa_namespace_lock); 3954241286Savg nvlist_free(config); 3955241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3956241286Savg pname); 3957241286Savg return (error); 3958241286Savg } 3959241286Savg 3960241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3961241286Savg vdev_free(rvd); 3962241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 3963241286Savg mutex_exit(&spa_namespace_lock); 3964241286Savg 3965243213Savg nvlist_free(config); 3966243213Savg return (0); 3967241286Savg} 3968241286Savg 3969241286Savg#endif /* sun */ 3970219089Spjd#endif 3971219089Spjd 3972209962Smm/* 3973209962Smm * Import a non-root pool into the system. 
 */
/*
 * Import the pool 'pool' using the caller-supplied 'config' (which is
 * the authoritative config for an import), applying optional properties
 * 'props' and ZFS_IMPORT_* 'flags'.  Returns 0 on success, EEXIST if a
 * pool of that name is already present, or the error from loading /
 * validating / property-setting.  Acquires spa_namespace_lock
 * internally.
 */
int
spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
{
	spa_t *spa;
	char *altroot = NULL;
	spa_load_state_t state = SPA_LOAD_IMPORT;
	zpool_rewind_policy_t policy;
	uint64_t mode = spa_mode_global;
	uint64_t readonly = B_FALSE;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EEXIST));
	}

	/*
	 * Create and initialize the spa structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	(void) nvlist_lookup_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
	if (readonly)
		mode = FREAD;
	spa = spa_add(pool, config, altroot);
	spa->spa_import_flags = flags;

	/*
	 * Verbatim import - Take a pool and insert it into the namespace
	 * as if it had been loaded at boot.  No load/validation is done;
	 * the config cache is synced and we return immediately.
	 */
	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
		if (props != NULL)
			spa_configfile_set(spa, props, B_FALSE);

		spa_config_sync(spa, B_FALSE, B_TRUE);

		mutex_exit(&spa_namespace_lock);
		spa_history_log_version(spa, "import");

		return (0);
	}

	spa_activate(spa, mode);

	/*
	 * Don't start async tasks until we know everything is healthy.
	 */
	spa_async_suspend(spa);

	zpool_get_rewind_policy(config, &policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
		state = SPA_LOAD_RECOVER;

	/*
	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
	 * because the user-supplied config is actually the one to trust when
	 * doing an import.
	 */
	if (state != SPA_LOAD_RECOVER)
		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
	    policy.zrp_request);

	/*
	 * Propagate anything learned while loading the pool and pass it
	 * back to caller (i.e. rewind info, missing devices, etc).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
	    spa->spa_load_info) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity
	 * anymore, and conflicts with spa_has_spare().
	 */
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
		spa_load_spares(spa);
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
		spa_load_l2cache(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_L2CACHE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (props != NULL)
		spa_configfile_set(spa, props, B_FALSE);

	/* Any load/validate/prop failure tears the half-built spa down. */
	if (error != 0 || (props && spa_writeable(spa) &&
	    (error = spa_prop_set(spa, props)))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa_async_resume(spa);

	/*
	 * Override any spares and level 2 cache devices as specified by
	 * the user, as these may have correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_spares.sav_config)
			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		if (spa->spa_l2cache.sav_config)
			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * Check for any removed devices.
	 */
	if (spa->spa_autoreplace) {
		spa_aux_check_removed(&spa->spa_spares);
		spa_aux_check_removed(&spa->spa_l2cache);
	}

	if (spa_writeable(spa)) {
		/*
		 * Update the config cache to include the newly-imported pool.
		 */
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	}

	/*
	 * It's possible that the pool was expanded while it was exported.
	 * We kick off an async task to handle this for us.
	 */
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	mutex_exit(&spa_namespace_lock);
	spa_history_log_version(spa, "import");

#ifdef __FreeBSD__
#ifdef _KERNEL
	zvol_create_minors(pool);
#endif
#endif
	return (0);
}

/*
 * Probe-import: load the pool described by 'tryconfig' under the
 * temporary name TRYIMPORT_NAME and return the resulting config nvlist
 * (caller frees), or NULL if 'tryconfig' is unusable.  The temporary
 * spa is unloaded and removed again before returning.
 */
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;
	int error;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
	spa_activate(spa, FREAD);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
4193168404Spjd */ 4194168404Spjd if (spa->spa_root_vdev != NULL) { 4195168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4196168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4197168404Spjd poolname) == 0); 4198168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4199168404Spjd state) == 0); 4200168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4201168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 4202236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4203236884Smm spa->spa_load_info) == 0); 4204168404Spjd 4205168404Spjd /* 4206185029Spjd * If the bootfs property exists on this pool then we 4207185029Spjd * copy it out so that external consumers can tell which 4208185029Spjd * pools are bootable. 4209168404Spjd */ 4210208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 4211185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4212185029Spjd 4213185029Spjd /* 4214185029Spjd * We have to play games with the name since the 4215185029Spjd * pool was opened as TRYIMPORT_NAME. 4216185029Spjd */ 4217185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 4218185029Spjd spa->spa_bootfs, tmpname) == 0) { 4219185029Spjd char *cp; 4220185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4221185029Spjd 4222185029Spjd cp = strchr(tmpname, '/'); 4223185029Spjd if (cp == NULL) { 4224185029Spjd (void) strlcpy(dsname, tmpname, 4225185029Spjd MAXPATHLEN); 4226185029Spjd } else { 4227185029Spjd (void) snprintf(dsname, MAXPATHLEN, 4228185029Spjd "%s/%s", poolname, ++cp); 4229185029Spjd } 4230185029Spjd VERIFY(nvlist_add_string(config, 4231185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4232185029Spjd kmem_free(dsname, MAXPATHLEN); 4233185029Spjd } 4234185029Spjd kmem_free(tmpname, MAXPATHLEN); 4235185029Spjd } 4236185029Spjd 4237185029Spjd /* 4238185029Spjd * Add the list of hot spares and level 2 cache devices. 
/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
 * we don't sync the labels or remove the configuration cache.
 *
 * Common implementation for spa_export(), spa_destroy() and spa_reset():
 *
 *	pool		name of the pool
 *	new_state	POOL_STATE_EXPORTED, POOL_STATE_DESTROYED, or
 *			POOL_STATE_UNINITIALIZED (reset)
 *	oldconfig	if non-NULL, receives a duplicate of the pool's
 *			config nvlist (caller frees)
 *	force		permit export even with an active shared spare
 *	hardforce	skip the final label sync and cache-file update
 *
 * Returns 0 on success, or EROFS, ENOENT, EBUSY or EXDEV on failure.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode_global & FWRITE))
		return (SET_ERROR(EROFS));

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENOENT));
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(EBUSY));
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool. At user's own will, such pool can
		 * be forcedly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(EXDEV));
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) +
			    TXG_DEFER_SIZE + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	/*
	 * NOTE(review): this event fires unconditionally, i.e. also for a
	 * pool reset (new_state == POOL_STATE_UNINITIALIZED) — confirm
	 * that consumers of ESC_ZFS_POOL_DESTROY expect that.
	 */
	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_config_sync(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}
spa_unload() will do the 4331168404Spjd * final sync that pushes these changes out. 4332168404Spjd */ 4333207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4334185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4335168404Spjd spa->spa_state = new_state; 4336219089Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 4337219089Spjd TXG_DEFER_SIZE + 1; 4338168404Spjd vdev_config_dirty(spa->spa_root_vdev); 4339185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4340168404Spjd } 4341168404Spjd } 4342168404Spjd 4343185029Spjd spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4344185029Spjd 4345168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4346168404Spjd spa_unload(spa); 4347168404Spjd spa_deactivate(spa); 4348168404Spjd } 4349168404Spjd 4350168404Spjd if (oldconfig && spa->spa_config) 4351168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4352168404Spjd 4353168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 4354207670Smm if (!hardforce) 4355207670Smm spa_config_sync(spa, B_TRUE, B_TRUE); 4356168404Spjd spa_remove(spa); 4357168404Spjd } 4358168404Spjd mutex_exit(&spa_namespace_lock); 4359168404Spjd 4360168404Spjd return (0); 4361168404Spjd} 4362168404Spjd 4363168404Spjd/* 4364168404Spjd * Destroy a storage pool. 4365168404Spjd */ 4366168404Spjdint 4367168404Spjdspa_destroy(char *pool) 4368168404Spjd{ 4369207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4370207670Smm B_FALSE, B_FALSE)); 4371168404Spjd} 4372168404Spjd 4373168404Spjd/* 4374168404Spjd * Export a storage pool. 
4375168404Spjd */ 4376168404Spjdint 4377207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4378207670Smm boolean_t hardforce) 4379168404Spjd{ 4380207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4381207670Smm force, hardforce)); 4382168404Spjd} 4383168404Spjd 4384168404Spjd/* 4385168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 4386168404Spjd * from the namespace in any way. 4387168404Spjd */ 4388168404Spjdint 4389168404Spjdspa_reset(char *pool) 4390168404Spjd{ 4391185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4392207670Smm B_FALSE, B_FALSE)); 4393168404Spjd} 4394168404Spjd 4395168404Spjd/* 4396168404Spjd * ========================================================================== 4397168404Spjd * Device manipulation 4398168404Spjd * ========================================================================== 4399168404Spjd */ 4400168404Spjd 4401168404Spjd/* 4402185029Spjd * Add a device to a storage pool. 
/*
 * Add a device to a storage pool.
 *
 * 'nvroot' describes the new top-level vdevs and/or hot spare and level 2
 * cache devices to add.  Returns 0 on success, or an errno delivered
 * through spa_vdev_exit() on failure.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg, id;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

	/* Missing spare/l2cache arrays simply mean "none to add". */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares) != 0)
		nspares = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	    &nl2cache) != 0)
		nl2cache = 0;

	/* Adding nothing at all is an error. */
	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0 &&
	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * We must validate the spares and l2cache devices after checking the
	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {

		/*
		 * Set the vdev id to the first hole, if one exists.
		 */
		for (id = 0; id < rvd->vdev_children; id++) {
			if (rvd->vdev_child[id]->vdev_ishole) {
				vdev_free(rvd->vdev_child[id]);
				break;
			}
		}
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = id;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
		    ZPOOL_CONFIG_SPARES);
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
		    ZPOOL_CONFIG_L2CACHE);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}
/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 *
 * Returns 0 on success, or an errno delivered through spa_vdev_exit().
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, dtl_max_txg;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	char *oldvdpath, *newvdpath;
	int newvd_isspare;
	int error;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ATTACH)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* The new device must be a single leaf under a root nvlist. */
	if (newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Spares can't replace logs
	 */
	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    oldvd->vdev_isspare &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise, we
		 * want to create a replacing vdev.  The user is not allowed to
		 * attach to a spared vdev child unless the 'isspare' state is
		 * the same (spare replaces spare, non-spare replaces
		 * non-spare).
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops &&
		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		} else if (pvd->vdev_ops == &vdev_spare_ops &&
		    newvd->vdev_isspare != oldvd->vdev_isspare) {
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		}

		if (newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Make sure the new device is big enough.
	 */
	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		/* +5 covers the "/old" suffix plus the terminating NUL. */
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/* mark the device being resilvered */
	newvd->vdev_resilvering = B_TRUE;

	/*
	 * If the parent is not a mirror, or if we're replacing, insert the new
	 * mirror/replacing/spare vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	newvd->vdev_crtxg = oldvd->vdev_crtxg;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
	 * for any dmu_sync-ed blocks.  It will propagate upward when
	 * spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	dtl_max_txg = txg + TXG_CONCURRENT_STATES;

	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
	    dtl_max_txg - TXG_INITIAL);

	if (newvd->vdev_isspare) {
		spa_spare_activate(newvd);
		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
	}

	/* Copy the paths now; the vdevs may be gone after spa_vdev_exit(). */
	oldvdpath = spa_strdup(oldvd->vdev_path);
	newvdpath = spa_strdup(newvd->vdev_path);
	newvd_isspare = newvd->vdev_isspare;

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	/*
	 * Restart the resilver
	 */
	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);

	/*
	 * Commit the config
	 */
	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);

	spa_history_log_internal(spa, "vdev attach", NULL,
	    "%s vdev=%s %s vdev=%s",
	    replacing && newvd_isspare ? "spare in" :
	    replacing ? "replace" : "attach", newvdpath,
	    replacing ? "for" : "to", oldvdpath);

	spa_strfree(oldvdpath);
	spa_strfree(newvdpath);

	if (spa->spa_bootfs)
		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);

	return (0);
}
"for" : "to", oldvdpath); 4700219089Spjd 4701185029Spjd spa_strfree(oldvdpath); 4702185029Spjd spa_strfree(newvdpath); 4703185029Spjd 4704219089Spjd if (spa->spa_bootfs) 4705219089Spjd spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4706168404Spjd 4707168404Spjd return (0); 4708168404Spjd} 4709168404Spjd 4710168404Spjd/* 4711168404Spjd * Detach a device from a mirror or replacing vdev. 4712251631Sdelphij * 4713168404Spjd * If 'replace_done' is specified, only detach if the parent 4714168404Spjd * is a replacing vdev. 4715168404Spjd */ 4716168404Spjdint 4717209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4718168404Spjd{ 4719168404Spjd uint64_t txg; 4720209962Smm int error; 4721168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4722168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 4723168404Spjd boolean_t unspare = B_FALSE; 4724247187Smm uint64_t unspare_guid = 0; 4725219089Spjd char *vdpath; 4726168404Spjd 4727219089Spjd ASSERT(spa_writeable(spa)); 4728219089Spjd 4729168404Spjd txg = spa_vdev_enter(spa); 4730168404Spjd 4731185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4732168404Spjd 4733168404Spjd if (vd == NULL) 4734168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4735168404Spjd 4736168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 4737168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4738168404Spjd 4739168404Spjd pvd = vd->vdev_parent; 4740168404Spjd 4741168404Spjd /* 4742209962Smm * If the parent/child relationship is not as expected, don't do it. 4743209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4744209962Smm * vdev that's replacing B with C. The user's intent in replacing 4745209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 4746209962Smm * the replace by detaching C, the expected behavior is to end up 4747209962Smm * M(A,B). But suppose that right after deciding to detach C, 4748209962Smm * the replacement of B completes. 
We would have M(A,C), and then 4749209962Smm * ask to detach C, which would leave us with just A -- not what 4750209962Smm * the user wanted. To prevent this, we make sure that the 4751209962Smm * parent/child relationship hasn't changed -- in this example, 4752209962Smm * that C's parent is still the replacing vdev R. 4753209962Smm */ 4754209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 4755209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4756209962Smm 4757209962Smm /* 4758219089Spjd * Only 'replacing' or 'spare' vdevs can be replaced. 4759168404Spjd */ 4760219089Spjd if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4761219089Spjd pvd->vdev_ops != &vdev_spare_ops) 4762219089Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4763168404Spjd 4764168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4765185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 4766168404Spjd 4767168404Spjd /* 4768168404Spjd * Only mirror, replacing, and spare vdevs support detach. 4769168404Spjd */ 4770168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 4771168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 4772168404Spjd pvd->vdev_ops != &vdev_spare_ops) 4773168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4774168404Spjd 4775168404Spjd /* 4776209962Smm * If this device has the only valid copy of some data, 4777209962Smm * we cannot safely detach it. 4778168404Spjd */ 4779209962Smm if (vdev_dtl_required(vd)) 4780168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4781168404Spjd 4782209962Smm ASSERT(pvd->vdev_children >= 2); 4783168404Spjd 4784168404Spjd /* 4785185029Spjd * If we are detaching the second disk from a replacing vdev, then 4786185029Spjd * check to see if we changed the original vdev's path to have "/old" 4787185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 
4788168404Spjd */ 4789219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4790219089Spjd vd->vdev_path != NULL) { 4791219089Spjd size_t len = strlen(vd->vdev_path); 4792219089Spjd 4793219089Spjd for (int c = 0; c < pvd->vdev_children; c++) { 4794219089Spjd cvd = pvd->vdev_child[c]; 4795219089Spjd 4796219089Spjd if (cvd == vd || cvd->vdev_path == NULL) 4797219089Spjd continue; 4798219089Spjd 4799219089Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4800219089Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 4801219089Spjd spa_strfree(cvd->vdev_path); 4802219089Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 4803219089Spjd break; 4804219089Spjd } 4805185029Spjd } 4806185029Spjd } 4807168404Spjd 4808168404Spjd /* 4809168404Spjd * If we are detaching the original disk from a spare, then it implies 4810168404Spjd * that the spare should become a real disk, and be removed from the 4811168404Spjd * active spare list for the pool. 4812168404Spjd */ 4813168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4814219089Spjd vd->vdev_id == 0 && 4815219089Spjd pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4816168404Spjd unspare = B_TRUE; 4817168404Spjd 4818168404Spjd /* 4819168404Spjd * Erase the disk labels so the disk can be used for other things. 4820168404Spjd * This must be done after all other error cases are handled, 4821168404Spjd * but before we disembowel vd (so we can still do I/O to it). 4822168404Spjd * But if we can't do it, don't treat the error as fatal -- 4823168404Spjd * it may be that the unwritability of the disk is the reason 4824168404Spjd * it's being detached! 4825168404Spjd */ 4826168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4827168404Spjd 4828168404Spjd /* 4829168404Spjd * Remove vd from its parent and compact the parent's children. 
4830168404Spjd */ 4831168404Spjd vdev_remove_child(pvd, vd); 4832168404Spjd vdev_compact_children(pvd); 4833168404Spjd 4834168404Spjd /* 4835168404Spjd * Remember one of the remaining children so we can get tvd below. 4836168404Spjd */ 4837219089Spjd cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4838168404Spjd 4839168404Spjd /* 4840168404Spjd * If we need to remove the remaining child from the list of hot spares, 4841209962Smm * do it now, marking the vdev as no longer a spare in the process. 4842209962Smm * We must do this before vdev_remove_parent(), because that can 4843209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 4844209962Smm * reason, we must remove the spare now, in the same txg as the detach; 4845209962Smm * otherwise someone could attach a new sibling, change the GUID, and 4846209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4847168404Spjd */ 4848168404Spjd if (unspare) { 4849168404Spjd ASSERT(cvd->vdev_isspare); 4850168404Spjd spa_spare_remove(cvd); 4851168404Spjd unspare_guid = cvd->vdev_guid; 4852209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4853219089Spjd cvd->vdev_unspare = B_TRUE; 4854168404Spjd } 4855168404Spjd 4856168404Spjd /* 4857168404Spjd * If the parent mirror/replacing vdev only has one child, 4858168404Spjd * the parent is no longer needed. Remove it from the tree. 4859168404Spjd */ 4860219089Spjd if (pvd->vdev_children == 1) { 4861219089Spjd if (pvd->vdev_ops == &vdev_spare_ops) 4862219089Spjd cvd->vdev_unspare = B_FALSE; 4863168404Spjd vdev_remove_parent(cvd); 4864219089Spjd cvd->vdev_resilvering = B_FALSE; 4865219089Spjd } 4866168404Spjd 4867219089Spjd 4868168404Spjd /* 4869168404Spjd * We don't set tvd until now because the parent we just removed 4870168404Spjd * may have been the previous top-level vdev. 
4871168404Spjd */ 4872168404Spjd tvd = cvd->vdev_top; 4873168404Spjd ASSERT(tvd->vdev_parent == rvd); 4874168404Spjd 4875168404Spjd /* 4876168404Spjd * Reevaluate the parent vdev state. 4877168404Spjd */ 4878185029Spjd vdev_propagate_state(cvd); 4879168404Spjd 4880168404Spjd /* 4881219089Spjd * If the 'autoexpand' property is set on the pool then automatically 4882219089Spjd * try to expand the size of the pool. For example if the device we 4883219089Spjd * just detached was smaller than the others, it may be possible to 4884219089Spjd * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4885219089Spjd * first so that we can obtain the updated sizes of the leaf vdevs. 4886168404Spjd */ 4887219089Spjd if (spa->spa_autoexpand) { 4888219089Spjd vdev_reopen(tvd); 4889219089Spjd vdev_expand(tvd, txg); 4890219089Spjd } 4891168404Spjd 4892168404Spjd vdev_config_dirty(tvd); 4893168404Spjd 4894168404Spjd /* 4895168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4896168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 4897168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 4898168404Spjd * prevent vd from being accessed after it's freed. 
4899168404Spjd */ 4900219089Spjd vdpath = spa_strdup(vd->vdev_path); 4901209962Smm for (int t = 0; t < TXG_SIZE; t++) 4902168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4903168404Spjd vd->vdev_detached = B_TRUE; 4904168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 4905168404Spjd 4906185029Spjd spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4907185029Spjd 4908219089Spjd /* hang on to the spa before we release the lock */ 4909219089Spjd spa_open_ref(spa, FTAG); 4910219089Spjd 4911168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 4912168404Spjd 4913248571Smm spa_history_log_internal(spa, "detach", NULL, 4914219089Spjd "vdev=%s", vdpath); 4915219089Spjd spa_strfree(vdpath); 4916219089Spjd 4917168404Spjd /* 4918168404Spjd * If this was the removal of the original device in a hot spare vdev, 4919168404Spjd * then we want to go through and remove the device from the hot spare 4920168404Spjd * list of every other pool. 4921168404Spjd */ 4922168404Spjd if (unspare) { 4923219089Spjd spa_t *altspa = NULL; 4924219089Spjd 4925168404Spjd mutex_enter(&spa_namespace_lock); 4926219089Spjd while ((altspa = spa_next(altspa)) != NULL) { 4927219089Spjd if (altspa->spa_state != POOL_STATE_ACTIVE || 4928219089Spjd altspa == spa) 4929168404Spjd continue; 4930219089Spjd 4931219089Spjd spa_open_ref(altspa, FTAG); 4932185029Spjd mutex_exit(&spa_namespace_lock); 4933219089Spjd (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4934185029Spjd mutex_enter(&spa_namespace_lock); 4935219089Spjd spa_close(altspa, FTAG); 4936168404Spjd } 4937168404Spjd mutex_exit(&spa_namespace_lock); 4938219089Spjd 4939219089Spjd /* search the rest of the vdevs for spares to remove */ 4940219089Spjd spa_vdev_resilver_done(spa); 4941168404Spjd } 4942168404Spjd 4943219089Spjd /* all done with the spa; OK to release */ 4944219089Spjd mutex_enter(&spa_namespace_lock); 4945219089Spjd spa_close(spa, FTAG); 4946219089Spjd mutex_exit(&spa_namespace_lock); 4947219089Spjd 4948168404Spjd return (error); 
4949168404Spjd} 4950168404Spjd 4951219089Spjd/* 4952219089Spjd * Split a set of devices from their mirrors, and create a new pool from them. 4953219089Spjd */ 4954219089Spjdint 4955219089Spjdspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4956219089Spjd nvlist_t *props, boolean_t exp) 4957219089Spjd{ 4958219089Spjd int error = 0; 4959219089Spjd uint64_t txg, *glist; 4960219089Spjd spa_t *newspa; 4961219089Spjd uint_t c, children, lastlog; 4962219089Spjd nvlist_t **child, *nvl, *tmp; 4963219089Spjd dmu_tx_t *tx; 4964219089Spjd char *altroot = NULL; 4965219089Spjd vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4966219089Spjd boolean_t activate_slog; 4967219089Spjd 4968219089Spjd ASSERT(spa_writeable(spa)); 4969219089Spjd 4970219089Spjd txg = spa_vdev_enter(spa); 4971219089Spjd 4972219089Spjd /* clear the log and flush everything up to now */ 4973219089Spjd activate_slog = spa_passivate_log(spa); 4974219089Spjd (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4975219089Spjd error = spa_offline_log(spa); 4976219089Spjd txg = spa_vdev_config_enter(spa); 4977219089Spjd 4978219089Spjd if (activate_slog) 4979219089Spjd spa_activate_log(spa); 4980219089Spjd 4981219089Spjd if (error != 0) 4982219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 4983219089Spjd 4984219089Spjd /* check new spa name before going any further */ 4985219089Spjd if (spa_lookup(newname) != NULL) 4986219089Spjd return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4987219089Spjd 4988219089Spjd /* 4989219089Spjd * scan through all the children to ensure they're all mirrors 4990219089Spjd */ 4991219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4992219089Spjd nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4993219089Spjd &children) != 0) 4994219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4995219089Spjd 4996219089Spjd /* first, check to ensure we've got the right child count */ 4997219089Spjd rvd = spa->spa_root_vdev; 
4998219089Spjd lastlog = 0; 4999219089Spjd for (c = 0; c < rvd->vdev_children; c++) { 5000219089Spjd vdev_t *vd = rvd->vdev_child[c]; 5001219089Spjd 5002219089Spjd /* don't count the holes & logs as children */ 5003219089Spjd if (vd->vdev_islog || vd->vdev_ishole) { 5004219089Spjd if (lastlog == 0) 5005219089Spjd lastlog = c; 5006219089Spjd continue; 5007219089Spjd } 5008219089Spjd 5009219089Spjd lastlog = 0; 5010219089Spjd } 5011219089Spjd if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 5012219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5013219089Spjd 5014219089Spjd /* next, ensure no spare or cache devices are part of the split */ 5015219089Spjd if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5016219089Spjd nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5017219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5018219089Spjd 5019219089Spjd vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5020219089Spjd glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5021219089Spjd 5022219089Spjd /* then, loop over each vdev and validate it */ 5023219089Spjd for (c = 0; c < children; c++) { 5024219089Spjd uint64_t is_hole = 0; 5025219089Spjd 5026219089Spjd (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5027219089Spjd &is_hole); 5028219089Spjd 5029219089Spjd if (is_hole != 0) { 5030219089Spjd if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5031219089Spjd spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5032219089Spjd continue; 5033219089Spjd } else { 5034249195Smm error = SET_ERROR(EINVAL); 5035219089Spjd break; 5036219089Spjd } 5037219089Spjd } 5038219089Spjd 5039219089Spjd /* which disk is going to be split? 
*/ 5040219089Spjd if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5041219089Spjd &glist[c]) != 0) { 5042249195Smm error = SET_ERROR(EINVAL); 5043219089Spjd break; 5044219089Spjd } 5045219089Spjd 5046219089Spjd /* look it up in the spa */ 5047219089Spjd vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5048219089Spjd if (vml[c] == NULL) { 5049249195Smm error = SET_ERROR(ENODEV); 5050219089Spjd break; 5051219089Spjd } 5052219089Spjd 5053219089Spjd /* make sure there's nothing stopping the split */ 5054219089Spjd if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5055219089Spjd vml[c]->vdev_islog || 5056219089Spjd vml[c]->vdev_ishole || 5057219089Spjd vml[c]->vdev_isspare || 5058219089Spjd vml[c]->vdev_isl2cache || 5059219089Spjd !vdev_writeable(vml[c]) || 5060219089Spjd vml[c]->vdev_children != 0 || 5061219089Spjd vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5062219089Spjd c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5063249195Smm error = SET_ERROR(EINVAL); 5064219089Spjd break; 5065219089Spjd } 5066219089Spjd 5067219089Spjd if (vdev_dtl_required(vml[c])) { 5068249195Smm error = SET_ERROR(EBUSY); 5069219089Spjd break; 5070219089Spjd } 5071219089Spjd 5072219089Spjd /* we need certain info from the top level */ 5073219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5074219089Spjd vml[c]->vdev_top->vdev_ms_array) == 0); 5075219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5076219089Spjd vml[c]->vdev_top->vdev_ms_shift) == 0); 5077219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5078219089Spjd vml[c]->vdev_top->vdev_asize) == 0); 5079219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5080219089Spjd vml[c]->vdev_top->vdev_ashift) == 0); 5081219089Spjd } 5082219089Spjd 5083219089Spjd if (error != 0) { 5084219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5085219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5086219089Spjd return (spa_vdev_exit(spa, NULL, 
txg, error)); 5087219089Spjd } 5088219089Spjd 5089219089Spjd /* stop writers from using the disks */ 5090219089Spjd for (c = 0; c < children; c++) { 5091219089Spjd if (vml[c] != NULL) 5092219089Spjd vml[c]->vdev_offline = B_TRUE; 5093219089Spjd } 5094219089Spjd vdev_reopen(spa->spa_root_vdev); 5095219089Spjd 5096219089Spjd /* 5097219089Spjd * Temporarily record the splitting vdevs in the spa config. This 5098219089Spjd * will disappear once the config is regenerated. 5099219089Spjd */ 5100219089Spjd VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5101219089Spjd VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5102219089Spjd glist, children) == 0); 5103219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5104219089Spjd 5105219089Spjd mutex_enter(&spa->spa_props_lock); 5106219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5107219089Spjd nvl) == 0); 5108219089Spjd mutex_exit(&spa->spa_props_lock); 5109219089Spjd spa->spa_config_splitting = nvl; 5110219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5111219089Spjd 5112219089Spjd /* configure and create the new pool */ 5113219089Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5114219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5115219089Spjd exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5116219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5117219089Spjd spa_version(spa)) == 0); 5118219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5119219089Spjd spa->spa_config_txg) == 0); 5120219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5121219089Spjd spa_generate_guid(NULL)) == 0); 5122219089Spjd (void) nvlist_lookup_string(props, 5123219089Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5124219089Spjd 5125219089Spjd /* add the new pool to the namespace */ 5126219089Spjd newspa = spa_add(newname, config, altroot); 5127219089Spjd newspa->spa_config_txg = spa->spa_config_txg; 5128219089Spjd spa_set_log_state(newspa, SPA_LOG_CLEAR); 5129219089Spjd 5130219089Spjd /* release the spa config lock, retaining the namespace lock */ 5131219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5132219089Spjd 5133219089Spjd if (zio_injection_enabled) 5134219089Spjd zio_handle_panic_injection(spa, FTAG, 1); 5135219089Spjd 5136219089Spjd spa_activate(newspa, spa_mode_global); 5137219089Spjd spa_async_suspend(newspa); 5138219089Spjd 5139219089Spjd#ifndef sun 5140219089Spjd /* mark that we are creating new spa by splitting */ 5141219089Spjd newspa->spa_splitting_newspa = B_TRUE; 5142219089Spjd#endif 5143219089Spjd /* create the new pool from the disks of the original pool */ 5144219089Spjd error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5145219089Spjd#ifndef sun 5146219089Spjd newspa->spa_splitting_newspa = B_FALSE; 5147219089Spjd#endif 5148219089Spjd if (error) 5149219089Spjd goto out; 5150219089Spjd 5151219089Spjd /* if that worked, generate a real config for the new pool */ 5152219089Spjd if (newspa->spa_root_vdev != NULL) { 5153219089Spjd VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5154219089Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 5155219089Spjd VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5156219089Spjd 
ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5157219089Spjd spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5158219089Spjd B_TRUE)); 5159219089Spjd } 5160219089Spjd 5161219089Spjd /* set the props */ 5162219089Spjd if (props != NULL) { 5163219089Spjd spa_configfile_set(newspa, props, B_FALSE); 5164219089Spjd error = spa_prop_set(newspa, props); 5165219089Spjd if (error) 5166219089Spjd goto out; 5167219089Spjd } 5168219089Spjd 5169219089Spjd /* flush everything */ 5170219089Spjd txg = spa_vdev_config_enter(newspa); 5171219089Spjd vdev_config_dirty(newspa->spa_root_vdev); 5172219089Spjd (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5173219089Spjd 5174219089Spjd if (zio_injection_enabled) 5175219089Spjd zio_handle_panic_injection(spa, FTAG, 2); 5176219089Spjd 5177219089Spjd spa_async_resume(newspa); 5178219089Spjd 5179219089Spjd /* finally, update the original pool's config */ 5180219089Spjd txg = spa_vdev_config_enter(spa); 5181219089Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5182219089Spjd error = dmu_tx_assign(tx, TXG_WAIT); 5183219089Spjd if (error != 0) 5184219089Spjd dmu_tx_abort(tx); 5185219089Spjd for (c = 0; c < children; c++) { 5186219089Spjd if (vml[c] != NULL) { 5187219089Spjd vdev_split(vml[c]); 5188219089Spjd if (error == 0) 5189248571Smm spa_history_log_internal(spa, "detach", tx, 5190248571Smm "vdev=%s", vml[c]->vdev_path); 5191219089Spjd vdev_free(vml[c]); 5192219089Spjd } 5193219089Spjd } 5194219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5195219089Spjd spa->spa_config_splitting = NULL; 5196219089Spjd nvlist_free(nvl); 5197219089Spjd if (error == 0) 5198219089Spjd dmu_tx_commit(tx); 5199219089Spjd (void) spa_vdev_exit(spa, NULL, txg, 0); 5200219089Spjd 5201219089Spjd if (zio_injection_enabled) 5202219089Spjd zio_handle_panic_injection(spa, FTAG, 3); 5203219089Spjd 5204219089Spjd /* split is complete; log a history record */ 5205248571Smm spa_history_log_internal(newspa, "split", NULL, 5206248571Smm "from 
pool %s", spa_name(spa)); 5207219089Spjd 5208219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5209219089Spjd 5210219089Spjd /* if we're not going to mount the filesystems in userland, export */ 5211219089Spjd if (exp) 5212219089Spjd error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5213219089Spjd B_FALSE, B_FALSE); 5214219089Spjd 5215219089Spjd return (error); 5216219089Spjd 5217219089Spjdout: 5218219089Spjd spa_unload(newspa); 5219219089Spjd spa_deactivate(newspa); 5220219089Spjd spa_remove(newspa); 5221219089Spjd 5222219089Spjd txg = spa_vdev_config_enter(spa); 5223219089Spjd 5224219089Spjd /* re-online all offlined disks */ 5225219089Spjd for (c = 0; c < children; c++) { 5226219089Spjd if (vml[c] != NULL) 5227219089Spjd vml[c]->vdev_offline = B_FALSE; 5228219089Spjd } 5229219089Spjd vdev_reopen(spa->spa_root_vdev); 5230219089Spjd 5231219089Spjd nvlist_free(spa->spa_config_splitting); 5232219089Spjd spa->spa_config_splitting = NULL; 5233219089Spjd (void) spa_vdev_exit(spa, NULL, txg, error); 5234219089Spjd 5235219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5236219089Spjd return (error); 5237219089Spjd} 5238219089Spjd 5239185029Spjdstatic nvlist_t * 5240185029Spjdspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5241185029Spjd{ 5242185029Spjd for (int i = 0; i < count; i++) { 5243185029Spjd uint64_t guid; 5244185029Spjd 5245185029Spjd VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5246185029Spjd &guid) == 0); 5247185029Spjd 5248185029Spjd if (guid == target_guid) 5249185029Spjd return (nvpp[i]); 5250185029Spjd } 5251185029Spjd 5252185029Spjd return (NULL); 5253185029Spjd} 5254185029Spjd 5255185029Spjdstatic void 5256185029Spjdspa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5257185029Spjd nvlist_t *dev_to_remove) 5258185029Spjd{ 5259185029Spjd nvlist_t **newdev = NULL; 5260185029Spjd 5261185029Spjd if (count > 1) 5262185029Spjd newdev = kmem_alloc((count - 1) * sizeof 
(void *), KM_SLEEP); 5263185029Spjd 5264185029Spjd for (int i = 0, j = 0; i < count; i++) { 5265185029Spjd if (dev[i] == dev_to_remove) 5266185029Spjd continue; 5267185029Spjd VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5268185029Spjd } 5269185029Spjd 5270185029Spjd VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5271185029Spjd VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5272185029Spjd 5273185029Spjd for (int i = 0; i < count - 1; i++) 5274185029Spjd nvlist_free(newdev[i]); 5275185029Spjd 5276185029Spjd if (count > 1) 5277185029Spjd kmem_free(newdev, (count - 1) * sizeof (void *)); 5278185029Spjd} 5279185029Spjd 5280168404Spjd/* 5281219089Spjd * Evacuate the device. 5282219089Spjd */ 5283219089Spjdstatic int 5284219089Spjdspa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5285219089Spjd{ 5286219089Spjd uint64_t txg; 5287219089Spjd int error = 0; 5288219089Spjd 5289219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5290219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5291219089Spjd ASSERT(vd == vd->vdev_top); 5292219089Spjd 5293219089Spjd /* 5294219089Spjd * Evacuate the device. We don't hold the config lock as writer 5295219089Spjd * since we need to do I/O but we do keep the 5296219089Spjd * spa_namespace_lock held. Once this completes the device 5297219089Spjd * should no longer have any blocks allocated on it. 5298219089Spjd */ 5299219089Spjd if (vd->vdev_islog) { 5300219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5301219089Spjd error = spa_offline_log(spa); 5302219089Spjd } else { 5303249195Smm error = SET_ERROR(ENOTSUP); 5304219089Spjd } 5305219089Spjd 5306219089Spjd if (error) 5307219089Spjd return (error); 5308219089Spjd 5309219089Spjd /* 5310219089Spjd * The evacuation succeeded. Remove any remaining MOS metadata 5311219089Spjd * associated with this vdev, and wait for these changes to sync. 
5312219089Spjd */ 5313240415Smm ASSERT0(vd->vdev_stat.vs_alloc); 5314219089Spjd txg = spa_vdev_config_enter(spa); 5315219089Spjd vd->vdev_removing = B_TRUE; 5316219089Spjd vdev_dirty(vd, 0, NULL, txg); 5317219089Spjd vdev_config_dirty(vd); 5318219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5319219089Spjd 5320219089Spjd return (0); 5321219089Spjd} 5322219089Spjd 5323219089Spjd/* 5324219089Spjd * Complete the removal by cleaning up the namespace. 5325219089Spjd */ 5326219089Spjdstatic void 5327219089Spjdspa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5328219089Spjd{ 5329219089Spjd vdev_t *rvd = spa->spa_root_vdev; 5330219089Spjd uint64_t id = vd->vdev_id; 5331219089Spjd boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5332219089Spjd 5333219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5334219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5335219089Spjd ASSERT(vd == vd->vdev_top); 5336219089Spjd 5337219089Spjd /* 5338219089Spjd * Only remove any devices which are empty. 5339219089Spjd */ 5340219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5341219089Spjd return; 5342219089Spjd 5343219089Spjd (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5344219089Spjd 5345219089Spjd if (list_link_active(&vd->vdev_state_dirty_node)) 5346219089Spjd vdev_state_clean(vd); 5347219089Spjd if (list_link_active(&vd->vdev_config_dirty_node)) 5348219089Spjd vdev_config_clean(vd); 5349219089Spjd 5350219089Spjd vdev_free(vd); 5351219089Spjd 5352219089Spjd if (last_vdev) { 5353219089Spjd vdev_compact_children(rvd); 5354219089Spjd } else { 5355219089Spjd vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5356219089Spjd vdev_add_child(rvd, vd); 5357219089Spjd } 5358219089Spjd vdev_config_dirty(rvd); 5359219089Spjd 5360219089Spjd /* 5361219089Spjd * Reassess the health of our root vdev. 
5362219089Spjd */ 5363219089Spjd vdev_reopen(rvd); 5364219089Spjd} 5365219089Spjd 5366219089Spjd/* 5367219089Spjd * Remove a device from the pool - 5368219089Spjd * 5369219089Spjd * Removing a device from the vdev namespace requires several steps 5370219089Spjd * and can take a significant amount of time. As a result we use 5371219089Spjd * the spa_vdev_config_[enter/exit] functions which allow us to 5372219089Spjd * grab and release the spa_config_lock while still holding the namespace 5373219089Spjd * lock. During each step the configuration is synced out. 5374251631Sdelphij * 5375251631Sdelphij * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5376251631Sdelphij * devices. 5377219089Spjd */ 5378168404Spjdint 5379168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5380168404Spjd{ 5381168404Spjd vdev_t *vd; 5382219089Spjd metaslab_group_t *mg; 5383185029Spjd nvlist_t **spares, **l2cache, *nv; 5384219089Spjd uint64_t txg = 0; 5385185029Spjd uint_t nspares, nl2cache; 5386185029Spjd int error = 0; 5387209962Smm boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5388168404Spjd 5389219089Spjd ASSERT(spa_writeable(spa)); 5390219089Spjd 5391209962Smm if (!locked) 5392209962Smm txg = spa_vdev_enter(spa); 5393168404Spjd 5394185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5395168404Spjd 5396185029Spjd if (spa->spa_spares.sav_vdevs != NULL && 5397185029Spjd nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5398185029Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5399185029Spjd (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5400185029Spjd /* 5401185029Spjd * Only remove the hot spare if it's not currently in use 5402185029Spjd * in this pool. 
5403185029Spjd */ 5404185029Spjd if (vd == NULL || unspare) { 5405185029Spjd spa_vdev_remove_aux(spa->spa_spares.sav_config, 5406185029Spjd ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5407185029Spjd spa_load_spares(spa); 5408185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5409185029Spjd } else { 5410249195Smm error = SET_ERROR(EBUSY); 5411168404Spjd } 5412185029Spjd } else if (spa->spa_l2cache.sav_vdevs != NULL && 5413185029Spjd nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5414185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5415185029Spjd (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5416185029Spjd /* 5417185029Spjd * Cache devices can always be removed. 5418185029Spjd */ 5419185029Spjd spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5420185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5421185029Spjd spa_load_l2cache(spa); 5422185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5423219089Spjd } else if (vd != NULL && vd->vdev_islog) { 5424219089Spjd ASSERT(!locked); 5425219089Spjd ASSERT(vd == vd->vdev_top); 5426219089Spjd 5427219089Spjd /* 5428219089Spjd * XXX - Once we have bp-rewrite this should 5429219089Spjd * become the common case. 5430219089Spjd */ 5431219089Spjd 5432219089Spjd mg = vd->vdev_mg; 5433219089Spjd 5434219089Spjd /* 5435219089Spjd * Stop allocating from this vdev. 5436219089Spjd */ 5437219089Spjd metaslab_group_passivate(mg); 5438219089Spjd 5439219089Spjd /* 5440219089Spjd * Wait for the youngest allocations and frees to sync, 5441219089Spjd * and then wait for the deferral of those frees to finish. 5442219089Spjd */ 5443219089Spjd spa_vdev_config_exit(spa, NULL, 5444219089Spjd txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5445219089Spjd 5446219089Spjd /* 5447219089Spjd * Attempt to evacuate the vdev. 
5448219089Spjd */ 5449219089Spjd error = spa_vdev_remove_evacuate(spa, vd); 5450219089Spjd 5451219089Spjd txg = spa_vdev_config_enter(spa); 5452219089Spjd 5453219089Spjd /* 5454219089Spjd * If we couldn't evacuate the vdev, unwind. 5455219089Spjd */ 5456219089Spjd if (error) { 5457219089Spjd metaslab_group_activate(mg); 5458219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5459219089Spjd } 5460219089Spjd 5461219089Spjd /* 5462219089Spjd * Clean up the vdev namespace. 5463219089Spjd */ 5464219089Spjd spa_vdev_remove_from_namespace(spa, vd); 5465219089Spjd 5466185029Spjd } else if (vd != NULL) { 5467185029Spjd /* 5468185029Spjd * Normal vdevs cannot be removed (yet). 5469185029Spjd */ 5470249195Smm error = SET_ERROR(ENOTSUP); 5471168404Spjd } else { 5472185029Spjd /* 5473185029Spjd * There is no vdev of any kind with the specified guid. 5474185029Spjd */ 5475249195Smm error = SET_ERROR(ENOENT); 5476168404Spjd } 5477168404Spjd 5478209962Smm if (!locked) 5479209962Smm return (spa_vdev_exit(spa, NULL, txg, error)); 5480209962Smm 5481209962Smm return (error); 5482168404Spjd} 5483168404Spjd 5484168404Spjd/* 5485185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 5486251631Sdelphij * currently spared, so we can detach it. 5487168404Spjd */ 5488168404Spjdstatic vdev_t * 5489185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 5490168404Spjd{ 5491168404Spjd vdev_t *newvd, *oldvd; 5492168404Spjd 5493219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5494185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5495168404Spjd if (oldvd != NULL) 5496168404Spjd return (oldvd); 5497168404Spjd } 5498168404Spjd 5499185029Spjd /* 5500219089Spjd * Check for a completed replacement. We always consider the first 5501219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 5502219089Spjd * the newest (see spa_vdev_attach() for how that works). 
In 5503219089Spjd * the case where the newest vdev is faulted, we will not automatically 5504219089Spjd * remove it after a resilver completes. This is OK as it will require 5505219089Spjd * user intervention to determine which disk the admin wishes to keep. 5506185029Spjd */ 5507219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 5508219089Spjd ASSERT(vd->vdev_children > 1); 5509219089Spjd 5510219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 5511168404Spjd oldvd = vd->vdev_child[0]; 5512168404Spjd 5513209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 5514219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5515209962Smm !vdev_dtl_required(oldvd)) 5516168404Spjd return (oldvd); 5517168404Spjd } 5518168404Spjd 5519185029Spjd /* 5520185029Spjd * Check for a completed resilver with the 'unspare' flag set. 5521185029Spjd */ 5522219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 5523219089Spjd vdev_t *first = vd->vdev_child[0]; 5524219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5525185029Spjd 5526219089Spjd if (last->vdev_unspare) { 5527219089Spjd oldvd = first; 5528219089Spjd newvd = last; 5529219089Spjd } else if (first->vdev_unspare) { 5530219089Spjd oldvd = last; 5531219089Spjd newvd = first; 5532219089Spjd } else { 5533219089Spjd oldvd = NULL; 5534219089Spjd } 5535219089Spjd 5536219089Spjd if (oldvd != NULL && 5537209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 5538219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5539219089Spjd !vdev_dtl_required(oldvd)) 5540185029Spjd return (oldvd); 5541219089Spjd 5542219089Spjd /* 5543219089Spjd * If there are more than two spares attached to a disk, 5544219089Spjd * and those spares are not required, then we want to 5545219089Spjd * attempt to free them up now so that they can be used 5546219089Spjd * by other pools. Once we're back down to a single 5547219089Spjd * disk+spare, we stop removing them. 
5548219089Spjd */ 5549219089Spjd if (vd->vdev_children > 2) { 5550219089Spjd newvd = vd->vdev_child[1]; 5551219089Spjd 5552219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 5553219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 5554219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 5555219089Spjd !vdev_dtl_required(newvd)) 5556219089Spjd return (newvd); 5557185029Spjd } 5558185029Spjd } 5559185029Spjd 5560168404Spjd return (NULL); 5561168404Spjd} 5562168404Spjd 5563168404Spjdstatic void 5564185029Spjdspa_vdev_resilver_done(spa_t *spa) 5565168404Spjd{ 5566209962Smm vdev_t *vd, *pvd, *ppvd; 5567209962Smm uint64_t guid, sguid, pguid, ppguid; 5568168404Spjd 5569209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5570168404Spjd 5571185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5572209962Smm pvd = vd->vdev_parent; 5573209962Smm ppvd = pvd->vdev_parent; 5574168404Spjd guid = vd->vdev_guid; 5575209962Smm pguid = pvd->vdev_guid; 5576209962Smm ppguid = ppvd->vdev_guid; 5577209962Smm sguid = 0; 5578168404Spjd /* 5579168404Spjd * If we have just finished replacing a hot spared device, then 5580168404Spjd * we need to detach the parent's first child (the original hot 5581168404Spjd * spare) as well. 
5582168404Spjd */ 5583219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5584219089Spjd ppvd->vdev_children == 2) { 5585168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5586209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 5587168404Spjd } 5588209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5589209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5590168404Spjd return; 5591209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5592168404Spjd return; 5593209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5594168404Spjd } 5595168404Spjd 5596209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5597168404Spjd} 5598168404Spjd 5599168404Spjd/* 5600219089Spjd * Update the stored path or FRU for this vdev. 5601168404Spjd */ 5602168404Spjdint 5603209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5604209962Smm boolean_t ispath) 5605168404Spjd{ 5606185029Spjd vdev_t *vd; 5607219089Spjd boolean_t sync = B_FALSE; 5608168404Spjd 5609219089Spjd ASSERT(spa_writeable(spa)); 5610168404Spjd 5611219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 5612219089Spjd 5613209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5614219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5615168404Spjd 5616168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 5617219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5618168404Spjd 5619209962Smm if (ispath) { 5620219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 5621219089Spjd spa_strfree(vd->vdev_path); 5622219089Spjd vd->vdev_path = spa_strdup(value); 5623219089Spjd sync = B_TRUE; 5624219089Spjd } 5625209962Smm } else { 5626219089Spjd if (vd->vdev_fru == NULL) { 5627219089Spjd vd->vdev_fru = spa_strdup(value); 5628219089Spjd sync = B_TRUE; 5629219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 5630209962Smm spa_strfree(vd->vdev_fru); 5631219089Spjd vd->vdev_fru = spa_strdup(value); 5632219089Spjd sync = B_TRUE; 5633219089Spjd } 5634209962Smm } 
5635168404Spjd 5636219089Spjd return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5637168404Spjd} 5638168404Spjd 5639209962Smmint 5640209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5641209962Smm{ 5642209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5643209962Smm} 5644209962Smm 5645209962Smmint 5646209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5647209962Smm{ 5648209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5649209962Smm} 5650209962Smm 5651168404Spjd/* 5652168404Spjd * ========================================================================== 5653219089Spjd * SPA Scanning 5654168404Spjd * ========================================================================== 5655168404Spjd */ 5656168404Spjd 5657168404Spjdint 5658219089Spjdspa_scan_stop(spa_t *spa) 5659168404Spjd{ 5660185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5661219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5662249195Smm return (SET_ERROR(EBUSY)); 5663219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 5664219089Spjd} 5665168404Spjd 5666219089Spjdint 5667219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 5668219089Spjd{ 5669219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5670219089Spjd 5671219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5672249195Smm return (SET_ERROR(ENOTSUP)); 5673168404Spjd 5674168404Spjd /* 5675185029Spjd * If a resilver was requested, but there is no DTL on a 5676185029Spjd * writeable leaf device, we have nothing to do. 
5677168404Spjd */ 5678219089Spjd if (func == POOL_SCAN_RESILVER && 5679185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5680185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5681168404Spjd return (0); 5682168404Spjd } 5683168404Spjd 5684219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 5685168404Spjd} 5686168404Spjd 5687168404Spjd/* 5688168404Spjd * ========================================================================== 5689168404Spjd * SPA async task processing 5690168404Spjd * ========================================================================== 5691168404Spjd */ 5692168404Spjd 5693168404Spjdstatic void 5694185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 5695168404Spjd{ 5696185029Spjd if (vd->vdev_remove_wanted) { 5697219089Spjd vd->vdev_remove_wanted = B_FALSE; 5698219089Spjd vd->vdev_delayed_close = B_FALSE; 5699185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5700209962Smm 5701209962Smm /* 5702209962Smm * We want to clear the stats, but we don't want to do a full 5703209962Smm * vdev_clear() as that will cause us to throw away 5704209962Smm * degraded/faulted state as well as attempt to reopen the 5705209962Smm * device, all of which is a waste. 
5706209962Smm */ 5707209962Smm vd->vdev_stat.vs_read_errors = 0; 5708209962Smm vd->vdev_stat.vs_write_errors = 0; 5709209962Smm vd->vdev_stat.vs_checksum_errors = 0; 5710209962Smm 5711185029Spjd vdev_state_dirty(vd->vdev_top); 5712185029Spjd } 5713168404Spjd 5714185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5715185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 5716185029Spjd} 5717168404Spjd 5718185029Spjdstatic void 5719185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 5720185029Spjd{ 5721185029Spjd if (vd->vdev_probe_wanted) { 5722219089Spjd vd->vdev_probe_wanted = B_FALSE; 5723185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 5724168404Spjd } 5725168404Spjd 5726185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5727185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 5728168404Spjd} 5729168404Spjd 5730168404Spjdstatic void 5731219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 5732219089Spjd{ 5733219089Spjd sysevent_id_t eid; 5734219089Spjd nvlist_t *attr; 5735219089Spjd char *physpath; 5736219089Spjd 5737219089Spjd if (!spa->spa_autoexpand) 5738219089Spjd return; 5739219089Spjd 5740219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5741219089Spjd vdev_t *cvd = vd->vdev_child[c]; 5742219089Spjd spa_async_autoexpand(spa, cvd); 5743219089Spjd } 5744219089Spjd 5745219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5746219089Spjd return; 5747219089Spjd 5748219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5749219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5750219089Spjd 5751219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5752219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5753219089Spjd 5754219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5755219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5756219089Spjd 5757219089Spjd nvlist_free(attr); 5758219089Spjd kmem_free(physpath, 
MAXPATHLEN); 5759219089Spjd} 5760219089Spjd 5761219089Spjdstatic void 5762168404Spjdspa_async_thread(void *arg) 5763168404Spjd{ 5764168404Spjd spa_t *spa = arg; 5765168404Spjd int tasks; 5766168404Spjd 5767168404Spjd ASSERT(spa->spa_sync_on); 5768168404Spjd 5769168404Spjd mutex_enter(&spa->spa_async_lock); 5770168404Spjd tasks = spa->spa_async_tasks; 5771253990Smav spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 5772168404Spjd mutex_exit(&spa->spa_async_lock); 5773168404Spjd 5774168404Spjd /* 5775168404Spjd * See if the config needs to be updated. 5776168404Spjd */ 5777168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5778219089Spjd uint64_t old_space, new_space; 5779219089Spjd 5780168404Spjd mutex_enter(&spa_namespace_lock); 5781219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 5782168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5783219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 5784168404Spjd mutex_exit(&spa_namespace_lock); 5785219089Spjd 5786219089Spjd /* 5787219089Spjd * If the pool grew as a result of the config update, 5788219089Spjd * then log an internal history event. 5789219089Spjd */ 5790219089Spjd if (new_space != old_space) { 5791248571Smm spa_history_log_internal(spa, "vdev online", NULL, 5792219089Spjd "pool '%s' size: %llu(+%llu)", 5793219089Spjd spa_name(spa), new_space, new_space - old_space); 5794219089Spjd } 5795168404Spjd } 5796168404Spjd 5797219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5798219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5799219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 5800219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 5801219089Spjd } 5802219089Spjd 5803168404Spjd /* 5804185029Spjd * See if any devices need to be probed. 
5805168404Spjd */ 5806185029Spjd if (tasks & SPA_ASYNC_PROBE) { 5807219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 5808185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 5809185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 5810185029Spjd } 5811168404Spjd 5812168404Spjd /* 5813185029Spjd * If any devices are done replacing, detach them. 5814168404Spjd */ 5815185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 5816185029Spjd spa_vdev_resilver_done(spa); 5817168404Spjd 5818168404Spjd /* 5819168404Spjd * Kick off a resilver. 5820168404Spjd */ 5821168404Spjd if (tasks & SPA_ASYNC_RESILVER) 5822219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 5823168404Spjd 5824168404Spjd /* 5825168404Spjd * Let the world know that we're done. 5826168404Spjd */ 5827168404Spjd mutex_enter(&spa->spa_async_lock); 5828168404Spjd spa->spa_async_thread = NULL; 5829168404Spjd cv_broadcast(&spa->spa_async_cv); 5830168404Spjd mutex_exit(&spa->spa_async_lock); 5831168404Spjd thread_exit(); 5832168404Spjd} 5833168404Spjd 5834253990Smavstatic void 5835253990Smavspa_async_thread_vd(void *arg) 5836253990Smav{ 5837253990Smav spa_t *spa = arg; 5838253990Smav int tasks; 5839253990Smav 5840253990Smav ASSERT(spa->spa_sync_on); 5841253990Smav 5842253990Smav mutex_enter(&spa->spa_async_lock); 5843253990Smav tasks = spa->spa_async_tasks; 5844253990Smavretry: 5845253990Smav spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 5846253990Smav mutex_exit(&spa->spa_async_lock); 5847253990Smav 5848253990Smav /* 5849253990Smav * See if any devices need to be marked REMOVED. 
5850253990Smav */ 5851253990Smav if (tasks & SPA_ASYNC_REMOVE) { 5852253990Smav spa_vdev_state_enter(spa, SCL_NONE); 5853253990Smav spa_async_remove(spa, spa->spa_root_vdev); 5854253990Smav for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5855253990Smav spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5856253990Smav for (int i = 0; i < spa->spa_spares.sav_count; i++) 5857253990Smav spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5858253990Smav (void) spa_vdev_state_exit(spa, NULL, 0); 5859253990Smav } 5860253990Smav 5861253990Smav /* 5862253990Smav * Let the world know that we're done. 5863253990Smav */ 5864253990Smav mutex_enter(&spa->spa_async_lock); 5865253990Smav tasks = spa->spa_async_tasks; 5866253990Smav if ((tasks & SPA_ASYNC_REMOVE) != 0) 5867253990Smav goto retry; 5868253990Smav spa->spa_async_thread_vd = NULL; 5869253990Smav cv_broadcast(&spa->spa_async_cv); 5870253990Smav mutex_exit(&spa->spa_async_lock); 5871253990Smav thread_exit(); 5872253990Smav} 5873253990Smav 5874168404Spjdvoid 5875168404Spjdspa_async_suspend(spa_t *spa) 5876168404Spjd{ 5877168404Spjd mutex_enter(&spa->spa_async_lock); 5878168404Spjd spa->spa_async_suspended++; 5879253990Smav while (spa->spa_async_thread != NULL && 5880253990Smav spa->spa_async_thread_vd != NULL) 5881168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5882168404Spjd mutex_exit(&spa->spa_async_lock); 5883168404Spjd} 5884168404Spjd 5885168404Spjdvoid 5886168404Spjdspa_async_resume(spa_t *spa) 5887168404Spjd{ 5888168404Spjd mutex_enter(&spa->spa_async_lock); 5889168404Spjd ASSERT(spa->spa_async_suspended != 0); 5890168404Spjd spa->spa_async_suspended--; 5891168404Spjd mutex_exit(&spa->spa_async_lock); 5892168404Spjd} 5893168404Spjd 5894251636Sdelphijstatic boolean_t 5895251636Sdelphijspa_async_tasks_pending(spa_t *spa) 5896251636Sdelphij{ 5897251636Sdelphij uint_t non_config_tasks; 5898251636Sdelphij uint_t config_task; 5899251636Sdelphij boolean_t config_task_suspended; 5900251636Sdelphij 
5901253990Smav non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 5902253990Smav SPA_ASYNC_REMOVE); 5903251636Sdelphij config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 5904251636Sdelphij if (spa->spa_ccw_fail_time == 0) { 5905251636Sdelphij config_task_suspended = B_FALSE; 5906251636Sdelphij } else { 5907251636Sdelphij config_task_suspended = 5908251636Sdelphij (gethrtime() - spa->spa_ccw_fail_time) < 5909251636Sdelphij (zfs_ccw_retry_interval * NANOSEC); 5910251636Sdelphij } 5911251636Sdelphij 5912251636Sdelphij return (non_config_tasks || (config_task && !config_task_suspended)); 5913251636Sdelphij} 5914251636Sdelphij 5915168404Spjdstatic void 5916168404Spjdspa_async_dispatch(spa_t *spa) 5917168404Spjd{ 5918168404Spjd mutex_enter(&spa->spa_async_lock); 5919251636Sdelphij if (spa_async_tasks_pending(spa) && 5920251636Sdelphij !spa->spa_async_suspended && 5921168404Spjd spa->spa_async_thread == NULL && 5922251636Sdelphij rootdir != NULL) 5923168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 5924168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5925168404Spjd mutex_exit(&spa->spa_async_lock); 5926168404Spjd} 5927168404Spjd 5928253990Smavstatic void 5929253990Smavspa_async_dispatch_vd(spa_t *spa) 5930253990Smav{ 5931253990Smav mutex_enter(&spa->spa_async_lock); 5932253990Smav if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 5933253990Smav !spa->spa_async_suspended && 5934253990Smav spa->spa_async_thread_vd == NULL && 5935253990Smav rootdir != NULL) 5936253990Smav spa->spa_async_thread_vd = thread_create(NULL, 0, 5937253990Smav spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); 5938253990Smav mutex_exit(&spa->spa_async_lock); 5939253990Smav} 5940253990Smav 5941168404Spjdvoid 5942168404Spjdspa_async_request(spa_t *spa, int task) 5943168404Spjd{ 5944219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5945168404Spjd mutex_enter(&spa->spa_async_lock); 5946168404Spjd 
spa->spa_async_tasks |= task; 5947168404Spjd mutex_exit(&spa->spa_async_lock); 5948253990Smav spa_async_dispatch_vd(spa); 5949168404Spjd} 5950168404Spjd 5951168404Spjd/* 5952168404Spjd * ========================================================================== 5953168404Spjd * SPA syncing routines 5954168404Spjd * ========================================================================== 5955168404Spjd */ 5956168404Spjd 5957219089Spjdstatic int 5958219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5959168404Spjd{ 5960219089Spjd bpobj_t *bpo = arg; 5961219089Spjd bpobj_enqueue(bpo, bp, tx); 5962219089Spjd return (0); 5963219089Spjd} 5964168404Spjd 5965219089Spjdstatic int 5966219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5967219089Spjd{ 5968219089Spjd zio_t *zio = arg; 5969168404Spjd 5970219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5971240868Spjd BP_GET_PSIZE(bp), zio->io_flags)); 5972219089Spjd return (0); 5973168404Spjd} 5974168404Spjd 5975168404Spjdstatic void 5976168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5977168404Spjd{ 5978168404Spjd char *packed = NULL; 5979185029Spjd size_t bufsize; 5980168404Spjd size_t nvsize = 0; 5981168404Spjd dmu_buf_t *db; 5982168404Spjd 5983168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5984168404Spjd 5985185029Spjd /* 5986185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5987185029Spjd * information. This avoids the dbuf_will_dirty() path and 5988185029Spjd * saves us a pre-read to get data we don't actually care about. 
5989185029Spjd */ 5990236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 5991185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 5992168404Spjd 5993168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5994168404Spjd KM_SLEEP) == 0); 5995185029Spjd bzero(packed + nvsize, bufsize - nvsize); 5996168404Spjd 5997185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5998168404Spjd 5999185029Spjd kmem_free(packed, bufsize); 6000168404Spjd 6001168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6002168404Spjd dmu_buf_will_dirty(db, tx); 6003168404Spjd *(uint64_t *)db->db_data = nvsize; 6004168404Spjd dmu_buf_rele(db, FTAG); 6005168404Spjd} 6006168404Spjd 6007168404Spjdstatic void 6008185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6009185029Spjd const char *config, const char *entry) 6010168404Spjd{ 6011168404Spjd nvlist_t *nvroot; 6012185029Spjd nvlist_t **list; 6013168404Spjd int i; 6014168404Spjd 6015185029Spjd if (!sav->sav_sync) 6016168404Spjd return; 6017168404Spjd 6018168404Spjd /* 6019185029Spjd * Update the MOS nvlist describing the list of available devices. 6020185029Spjd * spa_validate_aux() will have already made sure this nvlist is 6021185029Spjd * valid and the vdevs are labeled appropriately. 
6022168404Spjd */ 6023185029Spjd if (sav->sav_object == 0) { 6024185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6025185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6026185029Spjd sizeof (uint64_t), tx); 6027168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 6028185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6029185029Spjd &sav->sav_object, tx) == 0); 6030168404Spjd } 6031168404Spjd 6032168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6033185029Spjd if (sav->sav_count == 0) { 6034185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6035168404Spjd } else { 6036185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6037185029Spjd for (i = 0; i < sav->sav_count; i++) 6038185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6039219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 6040185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6041185029Spjd sav->sav_count) == 0); 6042185029Spjd for (i = 0; i < sav->sav_count; i++) 6043185029Spjd nvlist_free(list[i]); 6044185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 6045168404Spjd } 6046168404Spjd 6047185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6048168404Spjd nvlist_free(nvroot); 6049168404Spjd 6050185029Spjd sav->sav_sync = B_FALSE; 6051168404Spjd} 6052168404Spjd 6053168404Spjdstatic void 6054168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6055168404Spjd{ 6056168404Spjd nvlist_t *config; 6057168404Spjd 6058185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) 6059168404Spjd return; 6060168404Spjd 6061185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6062168404Spjd 6063185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 6064185029Spjd dmu_tx_get_txg(tx), B_FALSE); 6065185029Spjd 6066243505Smm /* 6067243505Smm * If we're upgrading the spa version then make sure that 6068243505Smm * the config object gets updated with 
the correct version. 6069243505Smm */ 6070243505Smm if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6071243505Smm fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6072243505Smm spa->spa_uberblock.ub_version); 6073243505Smm 6074185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6075185029Spjd 6076168404Spjd if (spa->spa_config_syncing) 6077168404Spjd nvlist_free(spa->spa_config_syncing); 6078168404Spjd spa->spa_config_syncing = config; 6079168404Spjd 6080168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6081168404Spjd} 6082168404Spjd 6083236884Smmstatic void 6084248571Smmspa_sync_version(void *arg, dmu_tx_t *tx) 6085236884Smm{ 6086248571Smm uint64_t *versionp = arg; 6087248571Smm uint64_t version = *versionp; 6088248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6089236884Smm 6090236884Smm /* 6091236884Smm * Setting the version is special cased when first creating the pool. 6092236884Smm */ 6093236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 6094236884Smm 6095247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6096236884Smm ASSERT(version >= spa_version(spa)); 6097236884Smm 6098236884Smm spa->spa_uberblock.ub_version = version; 6099236884Smm vdev_config_dirty(spa->spa_root_vdev); 6100248571Smm spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6101236884Smm} 6102236884Smm 6103185029Spjd/* 6104185029Spjd * Set zpool properties. 
 */
static void
spa_sync_props(void *arg, dmu_tx_t *tx)
{
	nvlist_t *nvp = arg;	/* validated earlier by spa_prop_validate() */
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	nvpair_t *elem = NULL;

	mutex_enter(&spa->spa_props_lock);

	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		uint64_t intval;
		char *strval, *fname;
		zpool_prop_t prop;
		const char *propname;
		zprop_type_t proptype;
		zfeature_info_t *feature;

		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPROP_INVAL:
			/*
			 * Not a regular pool property: this is a
			 * "feature@..." property used to enable a feature.
			 * We checked this earlier in spa_prop_validate().
			 */
			ASSERT(zpool_prop_feature(nvpair_name(elem)));

			/* fname = short feature name after the '@' */
			fname = strchr(nvpair_name(elem), '@') + 1;
			VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));

			spa_feature_enable(spa, feature, tx);
			spa_history_log_internal(spa, "set", tx,
			    "%s=enabled", nvpair_name(elem));
			break;

		case ZPOOL_PROP_VERSION:
			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
			/*
			 * The version is synced separately before other
			 * properties and should be correct by now.
			 */
			ASSERT3U(spa_version(spa), >=, intval);
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_READONLY:
		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'readonly' and 'cachefile' are also non-persistent
			 * properties.
			 */
			break;
		case ZPOOL_PROP_COMMENT:
			VERIFY(nvpair_value_string(elem, &strval) == 0);
			if (spa->spa_comment != NULL)
				spa_strfree(spa->spa_comment);
			spa->spa_comment = spa_strdup(strval);
			/*
			 * We need to dirty the configuration on all the vdevs
			 * so that their labels get updated.  It's unnecessary
			 * to do this for pool creation since the vdev's
			 * configuration has already been dirtied.
			 */
			if (tx->tx_txg != TXG_INITIAL)
				vdev_config_dirty(spa->spa_root_vdev);
			spa_history_log_internal(spa, "set", tx,
			    "%s=%s", nvpair_name(elem), strval);
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			if (spa->spa_pool_props_object == 0) {
				/* First property ever set: create the ZAP. */
				spa->spa_pool_props_object =
				    zap_create_link(mos, DMU_OT_POOL_PROPS,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    tx);
			}

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);
				spa_history_log_internal(spa, "set", tx,
				    "%s=%s", nvpair_name(elem), strval);
			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					/* sanity-check index values only */
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
				spa_history_log_internal(spa, "set", tx,
				    "%s=%lld", nvpair_name(elem), intval);
			} else {
				ASSERT(0); /* not allowed */
			}

			/*
			 * Mirror selected persistent values into the in-core
			 * spa_t so readers don't have to consult the MOS.
			 */
			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			case ZPOOL_PROP_AUTOEXPAND:
				spa->spa_autoexpand = intval;
				if (tx->tx_txg != TXG_INITIAL)
					spa_async_request(spa,
					    SPA_ASYNC_AUTOEXPAND);
				break;
			case ZPOOL_PROP_DEDUPDITTO:
				spa->spa_dedup_ditto = intval;
				break;
			default:
				break;
			}
		}

	}

	mutex_exit(&spa->spa_props_lock);
}

/*
 * Perform one-time upgrade on-disk changes.  spa_version() does not
 * reflect the new version this txg, so there must be no changes this
 * txg to anything that the upgrade code depends on after it executes.
 * Therefore this must be called after dsl_pool_sync() does the sync
 * tasks.
 */
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;

	ASSERT(spa->spa_sync_pass == 1);

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
		dsl_pool_upgrade_dir_clones(dp, tx);

		/* Keeping the freedir open
increases spa_minref */ 6282219089Spjd spa->spa_minref += 3; 6283219089Spjd } 6284236884Smm 6285236884Smm if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6286236884Smm spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6287236884Smm spa_feature_create_zap_objects(spa, tx); 6288236884Smm } 6289248571Smm rrw_exit(&dp->dp_config_rwlock, FTAG); 6290219089Spjd} 6291219089Spjd 6292219089Spjd/* 6293168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 6294168404Spjd * part of the process, so we iterate until it converges. 6295168404Spjd */ 6296168404Spjdvoid 6297168404Spjdspa_sync(spa_t *spa, uint64_t txg) 6298168404Spjd{ 6299168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6300168404Spjd objset_t *mos = spa->spa_meta_objset; 6301219089Spjd bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; 6302219089Spjd bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6303168404Spjd vdev_t *rvd = spa->spa_root_vdev; 6304168404Spjd vdev_t *vd; 6305168404Spjd dmu_tx_t *tx; 6306185029Spjd int error; 6307168404Spjd 6308219089Spjd VERIFY(spa_writeable(spa)); 6309219089Spjd 6310168404Spjd /* 6311168404Spjd * Lock out configuration changes. 6312168404Spjd */ 6313185029Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6314168404Spjd 6315168404Spjd spa->spa_syncing_txg = txg; 6316168404Spjd spa->spa_sync_pass = 0; 6317168404Spjd 6318185029Spjd /* 6319185029Spjd * If there are any pending vdev state changes, convert them 6320185029Spjd * into config changes that go out with this transaction group. 6321185029Spjd */ 6322185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6323209962Smm while (list_head(&spa->spa_state_dirty_list) != NULL) { 6324209962Smm /* 6325209962Smm * We need the write lock here because, for aux vdevs, 6326209962Smm * calling vdev_config_dirty() modifies sav_config. 
6327209962Smm * This is ugly and will become unnecessary when we 6328209962Smm * eliminate the aux vdev wart by integrating all vdevs 6329209962Smm * into the root vdev tree. 6330209962Smm */ 6331209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6332209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6333209962Smm while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6334209962Smm vdev_state_clean(vd); 6335209962Smm vdev_config_dirty(vd); 6336209962Smm } 6337209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6338209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6339185029Spjd } 6340185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6341185029Spjd 6342168404Spjd tx = dmu_tx_create_assigned(dp, txg); 6343168404Spjd 6344247265Smm spa->spa_sync_starttime = gethrtime(); 6345247265Smm#ifdef illumos 6346247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6347247265Smm spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6348247265Smm#else /* FreeBSD */ 6349247265Smm#ifdef _KERNEL 6350247265Smm callout_reset(&spa->spa_deadman_cycid, 6351247265Smm hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa); 6352247265Smm#endif 6353247265Smm#endif 6354247265Smm 6355168404Spjd /* 6356185029Spjd * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6357168404Spjd * set spa_deflate if we have no raid-z vdevs. 
6358168404Spjd */ 6359185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6360185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6361168404Spjd int i; 6362168404Spjd 6363168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 6364168404Spjd vd = rvd->vdev_child[i]; 6365168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6366168404Spjd break; 6367168404Spjd } 6368168404Spjd if (i == rvd->vdev_children) { 6369168404Spjd spa->spa_deflate = TRUE; 6370168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 6371168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6372168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6373168404Spjd } 6374168404Spjd } 6375168404Spjd 6376168404Spjd /* 6377219089Spjd * If anything has changed in this txg, or if someone is waiting 6378219089Spjd * for this txg to sync (eg, spa_vdev_remove()), push the 6379219089Spjd * deferred frees from the previous txg. If not, leave them 6380219089Spjd * alone so that we don't generate work on an otherwise idle 6381219089Spjd * system. 6382168404Spjd */ 6383168404Spjd if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 6384168404Spjd !txg_list_empty(&dp->dp_dirty_dirs, txg) || 6385219089Spjd !txg_list_empty(&dp->dp_sync_tasks, txg) || 6386219089Spjd ((dsl_scan_active(dp->dp_scan) || 6387219089Spjd txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { 6388219089Spjd zio_t *zio = zio_root(spa, NULL, NULL, 0); 6389219089Spjd VERIFY3U(bpobj_iterate(defer_bpo, 6390219089Spjd spa_free_sync_cb, zio, tx), ==, 0); 6391240415Smm VERIFY0(zio_wait(zio)); 6392219089Spjd } 6393168404Spjd 6394168404Spjd /* 6395168404Spjd * Iterate to convergence. 
6396168404Spjd */ 6397168404Spjd do { 6398219089Spjd int pass = ++spa->spa_sync_pass; 6399168404Spjd 6400168404Spjd spa_sync_config_object(spa, tx); 6401185029Spjd spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6402185029Spjd ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6403185029Spjd spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6404185029Spjd ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6405168404Spjd spa_errlog_sync(spa, txg); 6406168404Spjd dsl_pool_sync(dp, txg); 6407168404Spjd 6408243503Smm if (pass < zfs_sync_pass_deferred_free) { 6409219089Spjd zio_t *zio = zio_root(spa, NULL, NULL, 0); 6410219089Spjd bplist_iterate(free_bpl, spa_free_sync_cb, 6411219089Spjd zio, tx); 6412219089Spjd VERIFY(zio_wait(zio) == 0); 6413219089Spjd } else { 6414219089Spjd bplist_iterate(free_bpl, bpobj_enqueue_cb, 6415219089Spjd defer_bpo, tx); 6416168404Spjd } 6417168404Spjd 6418219089Spjd ddt_sync(spa, txg); 6419219089Spjd dsl_scan_sync(dp, tx); 6420168404Spjd 6421219089Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6422219089Spjd vdev_sync(vd, txg); 6423168404Spjd 6424219089Spjd if (pass == 1) 6425219089Spjd spa_sync_upgrades(spa, tx); 6426168404Spjd 6427219089Spjd } while (dmu_objset_is_dirty(mos, txg)); 6428219089Spjd 6429168404Spjd /* 6430168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 6431168404Spjd * to commit the transaction group. 6432168404Spjd * 6433185029Spjd * If there are no dirty vdevs, we sync the uberblock to a few 6434185029Spjd * random top-level vdevs that are known to be visible in the 6435185029Spjd * config cache (see spa_vdev_add() for a complete description). 6436185029Spjd * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6437168404Spjd */ 6438185029Spjd for (;;) { 6439185029Spjd /* 6440185029Spjd * We hold SCL_STATE to prevent vdev open/close/etc. 6441185029Spjd * while we're attempting to write the vdev labels. 
6442185029Spjd */ 6443185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6444168404Spjd 6445185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) { 6446185029Spjd vdev_t *svd[SPA_DVAS_PER_BP]; 6447185029Spjd int svdcount = 0; 6448185029Spjd int children = rvd->vdev_children; 6449185029Spjd int c0 = spa_get_random(children); 6450185029Spjd 6451219089Spjd for (int c = 0; c < children; c++) { 6452185029Spjd vd = rvd->vdev_child[(c0 + c) % children]; 6453185029Spjd if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6454185029Spjd continue; 6455185029Spjd svd[svdcount++] = vd; 6456185029Spjd if (svdcount == SPA_DVAS_PER_BP) 6457185029Spjd break; 6458185029Spjd } 6459213198Smm error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 6460213198Smm if (error != 0) 6461213198Smm error = vdev_config_sync(svd, svdcount, txg, 6462213198Smm B_TRUE); 6463185029Spjd } else { 6464185029Spjd error = vdev_config_sync(rvd->vdev_child, 6465213198Smm rvd->vdev_children, txg, B_FALSE); 6466213198Smm if (error != 0) 6467213198Smm error = vdev_config_sync(rvd->vdev_child, 6468213198Smm rvd->vdev_children, txg, B_TRUE); 6469168404Spjd } 6470185029Spjd 6471239620Smm if (error == 0) 6472239620Smm spa->spa_last_synced_guid = rvd->vdev_guid; 6473239620Smm 6474185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6475185029Spjd 6476185029Spjd if (error == 0) 6477185029Spjd break; 6478185029Spjd zio_suspend(spa, NULL); 6479185029Spjd zio_resume_wait(spa); 6480168404Spjd } 6481168404Spjd dmu_tx_commit(tx); 6482168404Spjd 6483247265Smm#ifdef illumos 6484247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6485247265Smm#else /* FreeBSD */ 6486247265Smm#ifdef _KERNEL 6487247265Smm callout_drain(&spa->spa_deadman_cycid); 6488247265Smm#endif 6489247265Smm#endif 6490247265Smm 6491168404Spjd /* 6492168404Spjd * Clear the dirty config list. 
6493168404Spjd */ 6494185029Spjd while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6495168404Spjd vdev_config_clean(vd); 6496168404Spjd 6497168404Spjd /* 6498168404Spjd * Now that the new config has synced transactionally, 6499168404Spjd * let it become visible to the config cache. 6500168404Spjd */ 6501168404Spjd if (spa->spa_config_syncing != NULL) { 6502168404Spjd spa_config_set(spa, spa->spa_config_syncing); 6503168404Spjd spa->spa_config_txg = txg; 6504168404Spjd spa->spa_config_syncing = NULL; 6505168404Spjd } 6506168404Spjd 6507168404Spjd spa->spa_ubsync = spa->spa_uberblock; 6508168404Spjd 6509219089Spjd dsl_pool_sync_done(dp, txg); 6510168404Spjd 6511168404Spjd /* 6512168404Spjd * Update usable space statistics. 6513168404Spjd */ 6514168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6515168404Spjd vdev_sync_done(vd, txg); 6516168404Spjd 6517219089Spjd spa_update_dspace(spa); 6518219089Spjd 6519168404Spjd /* 6520168404Spjd * It had better be the case that we didn't dirty anything 6521168404Spjd * since vdev_config_sync(). 6522168404Spjd */ 6523168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6524168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6525168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6526168404Spjd 6527219089Spjd spa->spa_sync_pass = 0; 6528219089Spjd 6529185029Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 6530168404Spjd 6531219089Spjd spa_handle_ignored_writes(spa); 6532219089Spjd 6533168404Spjd /* 6534168404Spjd * If any async tasks have been requested, kick them off. 6535168404Spjd */ 6536168404Spjd spa_async_dispatch(spa); 6537253990Smav spa_async_dispatch_vd(spa); 6538168404Spjd} 6539168404Spjd 6540168404Spjd/* 6541168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 6542168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 6543168404Spjd * sync. 
6544168404Spjd */ 6545168404Spjdvoid 6546168404Spjdspa_sync_allpools(void) 6547168404Spjd{ 6548168404Spjd spa_t *spa = NULL; 6549168404Spjd mutex_enter(&spa_namespace_lock); 6550168404Spjd while ((spa = spa_next(spa)) != NULL) { 6551219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 6552219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 6553168404Spjd continue; 6554168404Spjd spa_open_ref(spa, FTAG); 6555168404Spjd mutex_exit(&spa_namespace_lock); 6556168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6557168404Spjd mutex_enter(&spa_namespace_lock); 6558168404Spjd spa_close(spa, FTAG); 6559168404Spjd } 6560168404Spjd mutex_exit(&spa_namespace_lock); 6561168404Spjd} 6562168404Spjd 6563168404Spjd/* 6564168404Spjd * ========================================================================== 6565168404Spjd * Miscellaneous routines 6566168404Spjd * ========================================================================== 6567168404Spjd */ 6568168404Spjd 6569168404Spjd/* 6570168404Spjd * Remove all pools in the system. 6571168404Spjd */ 6572168404Spjdvoid 6573168404Spjdspa_evict_all(void) 6574168404Spjd{ 6575168404Spjd spa_t *spa; 6576168404Spjd 6577168404Spjd /* 6578168404Spjd * Remove all cached state. All pools should be closed now, 6579168404Spjd * so every spa in the AVL tree should be unreferenced. 6580168404Spjd */ 6581168404Spjd mutex_enter(&spa_namespace_lock); 6582168404Spjd while ((spa = spa_next(NULL)) != NULL) { 6583168404Spjd /* 6584168404Spjd * Stop async tasks. The async thread may need to detach 6585168404Spjd * a device that's been replaced, which requires grabbing 6586168404Spjd * spa_namespace_lock, so we must drop it here. 
6587168404Spjd */ 6588168404Spjd spa_open_ref(spa, FTAG); 6589168404Spjd mutex_exit(&spa_namespace_lock); 6590168404Spjd spa_async_suspend(spa); 6591168404Spjd mutex_enter(&spa_namespace_lock); 6592168404Spjd spa_close(spa, FTAG); 6593168404Spjd 6594168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6595168404Spjd spa_unload(spa); 6596168404Spjd spa_deactivate(spa); 6597168404Spjd } 6598168404Spjd spa_remove(spa); 6599168404Spjd } 6600168404Spjd mutex_exit(&spa_namespace_lock); 6601168404Spjd} 6602168404Spjd 6603168404Spjdvdev_t * 6604209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6605168404Spjd{ 6606185029Spjd vdev_t *vd; 6607185029Spjd int i; 6608185029Spjd 6609185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6610185029Spjd return (vd); 6611185029Spjd 6612209962Smm if (aux) { 6613185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6614185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 6615185029Spjd if (vd->vdev_guid == guid) 6616185029Spjd return (vd); 6617185029Spjd } 6618209962Smm 6619209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 6620209962Smm vd = spa->spa_spares.sav_vdevs[i]; 6621209962Smm if (vd->vdev_guid == guid) 6622209962Smm return (vd); 6623209962Smm } 6624185029Spjd } 6625185029Spjd 6626185029Spjd return (NULL); 6627168404Spjd} 6628168404Spjd 6629168404Spjdvoid 6630185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 6631168404Spjd{ 6632219089Spjd ASSERT(spa_writeable(spa)); 6633219089Spjd 6634185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6635168404Spjd 6636168404Spjd /* 6637168404Spjd * This should only be called for a non-faulted pool, and since a 6638168404Spjd * future version would result in an unopenable pool, this shouldn't be 6639168404Spjd * possible. 
 */
	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	/* Wait for the new version to reach stable storage. */
	txg_wait_synced(spa_get_dsl(spa), 0);
}

/*
 * Return B_TRUE if the pool has (or is in the process of adding) a spare
 * device with the given guid.  Both the committed spare list and the
 * pending (not-yet-synced) spare nvlists are consulted.
 */
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: reference count of an active spare is 2, as a spare and as a replace
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		/*
		 * refcnt > 2 means the spare is in use beyond its baseline
		 * references (see note above), i.e. actively shared.
		 */
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	/* Pool name attribute. */
	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	/* Pool guid attribute. */
	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	/* Optional vdev guid/path attributes. */
	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	/* Attributes now owned by 'ev'; don't free them below. */
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}