spa.c revision 258631
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24249195Smm * Copyright (c) 2013 by Delphix. All rights reserved. 25249188Smm * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 26247265Smm * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27168404Spjd */ 28168404Spjd 29168404Spjd/* 30251629Sdelphij * SPA: Storage Pool Allocator 31251629Sdelphij * 32168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 33168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 34168404Spjd * pool. 
35168404Spjd */ 36168404Spjd 37168404Spjd#include <sys/zfs_context.h> 38168404Spjd#include <sys/fm/fs/zfs.h> 39168404Spjd#include <sys/spa_impl.h> 40168404Spjd#include <sys/zio.h> 41168404Spjd#include <sys/zio_checksum.h> 42168404Spjd#include <sys/dmu.h> 43168404Spjd#include <sys/dmu_tx.h> 44168404Spjd#include <sys/zap.h> 45168404Spjd#include <sys/zil.h> 46219089Spjd#include <sys/ddt.h> 47168404Spjd#include <sys/vdev_impl.h> 48168404Spjd#include <sys/metaslab.h> 49219089Spjd#include <sys/metaslab_impl.h> 50168404Spjd#include <sys/uberblock_impl.h> 51168404Spjd#include <sys/txg.h> 52168404Spjd#include <sys/avl.h> 53168404Spjd#include <sys/dmu_traverse.h> 54168404Spjd#include <sys/dmu_objset.h> 55168404Spjd#include <sys/unique.h> 56168404Spjd#include <sys/dsl_pool.h> 57168404Spjd#include <sys/dsl_dataset.h> 58168404Spjd#include <sys/dsl_dir.h> 59168404Spjd#include <sys/dsl_prop.h> 60168404Spjd#include <sys/dsl_synctask.h> 61168404Spjd#include <sys/fs/zfs.h> 62185029Spjd#include <sys/arc.h> 63168404Spjd#include <sys/callb.h> 64185029Spjd#include <sys/spa_boot.h> 65219089Spjd#include <sys/zfs_ioctl.h> 66219089Spjd#include <sys/dsl_scan.h> 67248571Smm#include <sys/dmu_send.h> 68248571Smm#include <sys/dsl_destroy.h> 69248571Smm#include <sys/dsl_userhold.h> 70236884Smm#include <sys/zfeature.h> 71219089Spjd#include <sys/zvol.h> 72240868Spjd#include <sys/trim_map.h> 73168404Spjd 74219089Spjd#ifdef _KERNEL 75219089Spjd#include <sys/callb.h> 76219089Spjd#include <sys/cpupart.h> 77219089Spjd#include <sys/zone.h> 78219089Spjd#endif /* _KERNEL */ 79219089Spjd 80185029Spjd#include "zfs_prop.h" 81185029Spjd#include "zfs_comutil.h" 82168404Spjd 83204073Spjd/* Check hostid on import? 
*/
static int check_hostid = 1;

/* Expose check_hostid as the vfs.zfs.check_hostid tunable and sysctl. */
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

/*
 * How the zti_value field of a zio_taskq_info_t is interpreted when the
 * taskqs for a given (I/O type, taskq type) pair are created.
 */
typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_ONLINE_PERCENT,	/* value is % of online CPUs */
	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

/*
 * Initializers for zio_taskq_info_t entries, in field order
 * { zti_mode, zti_value, zti_count }.
 */
#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;	/* how zti_value is interpreted */
	uint_t zti_value;	/* thread count or percentage, per zti_mode */
	uint_t zti_count;	/* number of taskqs to create */
} zio_taskq_info_t;

/* Per-taskq-type name component used when naming the taskqs. */
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
188185029Spjd */ 189185029Spjdstatic void 190185029Spjdspa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 191185029Spjd uint64_t intval, zprop_source_t src) 192185029Spjd{ 193185029Spjd const char *propname = zpool_prop_to_name(prop); 194185029Spjd nvlist_t *propval; 195185029Spjd 196185029Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 197185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 198185029Spjd 199185029Spjd if (strval != NULL) 200185029Spjd VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 201185029Spjd else 202185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 203185029Spjd 204185029Spjd VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 205185029Spjd nvlist_free(propval); 206185029Spjd} 207185029Spjd 208185029Spjd/* 209185029Spjd * Get property values from the spa configuration. 210185029Spjd */ 211185029Spjdstatic void 212185029Spjdspa_prop_get_config(spa_t *spa, nvlist_t **nvp) 213185029Spjd{ 214236155Smm vdev_t *rvd = spa->spa_root_vdev; 215236884Smm dsl_pool_t *pool = spa->spa_dsl_pool; 216209962Smm uint64_t size; 217219089Spjd uint64_t alloc; 218236155Smm uint64_t space; 219185029Spjd uint64_t cap, version; 220185029Spjd zprop_source_t src = ZPROP_SRC_NONE; 221185029Spjd spa_config_dirent_t *dp; 222185029Spjd 223185029Spjd ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 224185029Spjd 225236155Smm if (rvd != NULL) { 226219089Spjd alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 227219089Spjd size = metaslab_class_get_space(spa_normal_class(spa)); 228209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 229209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 230219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 231219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 232219089Spjd size - alloc, src); 233236155Smm 234236155Smm space = 0; 235236155Smm for (int c = 0; c < rvd->vdev_children; 
c++) { 236236155Smm vdev_t *tvd = rvd->vdev_child[c]; 237236155Smm space += tvd->vdev_max_asize - tvd->vdev_asize; 238236155Smm } 239236155Smm spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space, 240236155Smm src); 241236155Smm 242219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 243219089Spjd (spa_mode(spa) == FREAD), src); 244185029Spjd 245219089Spjd cap = (size == 0) ? 0 : (alloc * 100 / size); 246209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 247185029Spjd 248219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 249219089Spjd ddt_get_pool_dedup_ratio(spa), src); 250219089Spjd 251209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 252236155Smm rvd->vdev_state, src); 253209962Smm 254209962Smm version = spa_version(spa); 255209962Smm if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 256209962Smm src = ZPROP_SRC_DEFAULT; 257209962Smm else 258209962Smm src = ZPROP_SRC_LOCAL; 259209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 260209962Smm } 261209962Smm 262236884Smm if (pool != NULL) { 263236884Smm dsl_dir_t *freedir = pool->dp_free_dir; 264236884Smm 265236884Smm /* 266236884Smm * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 267236884Smm * when opening pools before this version freedir will be NULL. 
268236884Smm */ 269236884Smm if (freedir != NULL) { 270236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 271236884Smm freedir->dd_phys->dd_used_bytes, src); 272236884Smm } else { 273236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 274236884Smm NULL, 0, src); 275236884Smm } 276236884Smm } 277236884Smm 278185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 279185029Spjd 280228103Smm if (spa->spa_comment != NULL) { 281228103Smm spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 282228103Smm 0, ZPROP_SRC_LOCAL); 283228103Smm } 284228103Smm 285185029Spjd if (spa->spa_root != NULL) 286185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 287185029Spjd 0, ZPROP_SRC_LOCAL); 288185029Spjd 289185029Spjd if ((dp = list_head(&spa->spa_config_list)) != NULL) { 290185029Spjd if (dp->scd_path == NULL) { 291185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 292185029Spjd "none", 0, ZPROP_SRC_LOCAL); 293185029Spjd } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 294185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 295185029Spjd dp->scd_path, 0, ZPROP_SRC_LOCAL); 296185029Spjd } 297185029Spjd } 298185029Spjd} 299185029Spjd 300185029Spjd/* 301185029Spjd * Get zpool property values. 302185029Spjd */ 303185029Spjdint 304185029Spjdspa_prop_get(spa_t *spa, nvlist_t **nvp) 305185029Spjd{ 306219089Spjd objset_t *mos = spa->spa_meta_objset; 307185029Spjd zap_cursor_t zc; 308185029Spjd zap_attribute_t za; 309185029Spjd int err; 310185029Spjd 311185029Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 312185029Spjd 313185029Spjd mutex_enter(&spa->spa_props_lock); 314185029Spjd 315185029Spjd /* 316185029Spjd * Get properties from the spa config. 317185029Spjd */ 318185029Spjd spa_prop_get_config(spa, nvp); 319185029Spjd 320185029Spjd /* If no pool property object, no more prop to get. 
*/ 321219089Spjd if (mos == NULL || spa->spa_pool_props_object == 0) { 322185029Spjd mutex_exit(&spa->spa_props_lock); 323185029Spjd return (0); 324185029Spjd } 325185029Spjd 326185029Spjd /* 327185029Spjd * Get properties from the MOS pool property object. 328185029Spjd */ 329185029Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 330185029Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 331185029Spjd zap_cursor_advance(&zc)) { 332185029Spjd uint64_t intval = 0; 333185029Spjd char *strval = NULL; 334185029Spjd zprop_source_t src = ZPROP_SRC_DEFAULT; 335185029Spjd zpool_prop_t prop; 336185029Spjd 337185029Spjd if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 338185029Spjd continue; 339185029Spjd 340185029Spjd switch (za.za_integer_length) { 341185029Spjd case 8: 342185029Spjd /* integer property */ 343185029Spjd if (za.za_first_integer != 344185029Spjd zpool_prop_default_numeric(prop)) 345185029Spjd src = ZPROP_SRC_LOCAL; 346185029Spjd 347185029Spjd if (prop == ZPOOL_PROP_BOOTFS) { 348185029Spjd dsl_pool_t *dp; 349185029Spjd dsl_dataset_t *ds = NULL; 350185029Spjd 351185029Spjd dp = spa_get_dsl(spa); 352248571Smm dsl_pool_config_enter(dp, FTAG); 353185029Spjd if (err = dsl_dataset_hold_obj(dp, 354185029Spjd za.za_first_integer, FTAG, &ds)) { 355248571Smm dsl_pool_config_exit(dp, FTAG); 356185029Spjd break; 357185029Spjd } 358185029Spjd 359185029Spjd strval = kmem_alloc( 360185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 361185029Spjd KM_SLEEP); 362185029Spjd dsl_dataset_name(ds, strval); 363185029Spjd dsl_dataset_rele(ds, FTAG); 364248571Smm dsl_pool_config_exit(dp, FTAG); 365185029Spjd } else { 366185029Spjd strval = NULL; 367185029Spjd intval = za.za_first_integer; 368185029Spjd } 369185029Spjd 370185029Spjd spa_prop_add_list(*nvp, prop, strval, intval, src); 371185029Spjd 372185029Spjd if (strval != NULL) 373185029Spjd kmem_free(strval, 374185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 375185029Spjd 376185029Spjd break; 
377185029Spjd 378185029Spjd case 1: 379185029Spjd /* string property */ 380185029Spjd strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 381185029Spjd err = zap_lookup(mos, spa->spa_pool_props_object, 382185029Spjd za.za_name, 1, za.za_num_integers, strval); 383185029Spjd if (err) { 384185029Spjd kmem_free(strval, za.za_num_integers); 385185029Spjd break; 386185029Spjd } 387185029Spjd spa_prop_add_list(*nvp, prop, strval, 0, src); 388185029Spjd kmem_free(strval, za.za_num_integers); 389185029Spjd break; 390185029Spjd 391185029Spjd default: 392185029Spjd break; 393185029Spjd } 394185029Spjd } 395185029Spjd zap_cursor_fini(&zc); 396185029Spjd mutex_exit(&spa->spa_props_lock); 397185029Spjdout: 398185029Spjd if (err && err != ENOENT) { 399185029Spjd nvlist_free(*nvp); 400185029Spjd *nvp = NULL; 401185029Spjd return (err); 402185029Spjd } 403185029Spjd 404185029Spjd return (0); 405185029Spjd} 406185029Spjd 407185029Spjd/* 408185029Spjd * Validate the given pool properties nvlist and modify the list 409185029Spjd * for the property values to be set. 410185029Spjd */ 411185029Spjdstatic int 412185029Spjdspa_prop_validate(spa_t *spa, nvlist_t *props) 413185029Spjd{ 414185029Spjd nvpair_t *elem; 415185029Spjd int error = 0, reset_bootfs = 0; 416247187Smm uint64_t objnum = 0; 417236884Smm boolean_t has_feature = B_FALSE; 418185029Spjd 419185029Spjd elem = NULL; 420185029Spjd while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 421185029Spjd uint64_t intval; 422236884Smm char *strval, *slash, *check, *fname; 423236884Smm const char *propname = nvpair_name(elem); 424236884Smm zpool_prop_t prop = zpool_name_to_prop(propname); 425185029Spjd 426236884Smm switch (prop) { 427236884Smm case ZPROP_INVAL: 428236884Smm if (!zpool_prop_feature(propname)) { 429249195Smm error = SET_ERROR(EINVAL); 430236884Smm break; 431236884Smm } 432185029Spjd 433236884Smm /* 434236884Smm * Sanitize the input. 
435236884Smm */ 436236884Smm if (nvpair_type(elem) != DATA_TYPE_UINT64) { 437249195Smm error = SET_ERROR(EINVAL); 438236884Smm break; 439236884Smm } 440185029Spjd 441236884Smm if (nvpair_value_uint64(elem, &intval) != 0) { 442249195Smm error = SET_ERROR(EINVAL); 443236884Smm break; 444236884Smm } 445236884Smm 446236884Smm if (intval != 0) { 447249195Smm error = SET_ERROR(EINVAL); 448236884Smm break; 449236884Smm } 450236884Smm 451236884Smm fname = strchr(propname, '@') + 1; 452236884Smm if (zfeature_lookup_name(fname, NULL) != 0) { 453249195Smm error = SET_ERROR(EINVAL); 454236884Smm break; 455236884Smm } 456236884Smm 457236884Smm has_feature = B_TRUE; 458236884Smm break; 459236884Smm 460185029Spjd case ZPOOL_PROP_VERSION: 461185029Spjd error = nvpair_value_uint64(elem, &intval); 462185029Spjd if (!error && 463236884Smm (intval < spa_version(spa) || 464236884Smm intval > SPA_VERSION_BEFORE_FEATURES || 465236884Smm has_feature)) 466249195Smm error = SET_ERROR(EINVAL); 467185029Spjd break; 468185029Spjd 469185029Spjd case ZPOOL_PROP_DELEGATION: 470185029Spjd case ZPOOL_PROP_AUTOREPLACE: 471185029Spjd case ZPOOL_PROP_LISTSNAPS: 472219089Spjd case ZPOOL_PROP_AUTOEXPAND: 473185029Spjd error = nvpair_value_uint64(elem, &intval); 474185029Spjd if (!error && intval > 1) 475249195Smm error = SET_ERROR(EINVAL); 476185029Spjd break; 477185029Spjd 478185029Spjd case ZPOOL_PROP_BOOTFS: 479209962Smm /* 480209962Smm * If the pool version is less than SPA_VERSION_BOOTFS, 481209962Smm * or the pool is still being created (version == 0), 482209962Smm * the bootfs property cannot be set. 
483209962Smm */ 484185029Spjd if (spa_version(spa) < SPA_VERSION_BOOTFS) { 485249195Smm error = SET_ERROR(ENOTSUP); 486185029Spjd break; 487185029Spjd } 488185029Spjd 489185029Spjd /* 490185029Spjd * Make sure the vdev config is bootable 491185029Spjd */ 492185029Spjd if (!vdev_is_bootable(spa->spa_root_vdev)) { 493249195Smm error = SET_ERROR(ENOTSUP); 494185029Spjd break; 495185029Spjd } 496185029Spjd 497185029Spjd reset_bootfs = 1; 498185029Spjd 499185029Spjd error = nvpair_value_string(elem, &strval); 500185029Spjd 501185029Spjd if (!error) { 502236884Smm objset_t *os; 503185029Spjd uint64_t compress; 504185029Spjd 505185029Spjd if (strval == NULL || strval[0] == '\0') { 506185029Spjd objnum = zpool_prop_default_numeric( 507185029Spjd ZPOOL_PROP_BOOTFS); 508185029Spjd break; 509185029Spjd } 510185029Spjd 511219089Spjd if (error = dmu_objset_hold(strval, FTAG, &os)) 512185029Spjd break; 513185029Spjd 514219089Spjd /* Must be ZPL and not gzip compressed. */ 515219089Spjd 516219089Spjd if (dmu_objset_type(os) != DMU_OST_ZFS) { 517249195Smm error = SET_ERROR(ENOTSUP); 518248571Smm } else if ((error = 519248571Smm dsl_prop_get_int_ds(dmu_objset_ds(os), 520185029Spjd zfs_prop_to_name(ZFS_PROP_COMPRESSION), 521248571Smm &compress)) == 0 && 522185029Spjd !BOOTFS_COMPRESS_VALID(compress)) { 523249195Smm error = SET_ERROR(ENOTSUP); 524185029Spjd } else { 525185029Spjd objnum = dmu_objset_id(os); 526185029Spjd } 527219089Spjd dmu_objset_rele(os, FTAG); 528185029Spjd } 529185029Spjd break; 530185029Spjd 531185029Spjd case ZPOOL_PROP_FAILUREMODE: 532185029Spjd error = nvpair_value_uint64(elem, &intval); 533185029Spjd if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 534185029Spjd intval > ZIO_FAILURE_MODE_PANIC)) 535249195Smm error = SET_ERROR(EINVAL); 536185029Spjd 537185029Spjd /* 538185029Spjd * This is a special case which only occurs when 539185029Spjd * the pool has completely failed. 
This allows 540185029Spjd * the user to change the in-core failmode property 541185029Spjd * without syncing it out to disk (I/Os might 542185029Spjd * currently be blocked). We do this by returning 543185029Spjd * EIO to the caller (spa_prop_set) to trick it 544185029Spjd * into thinking we encountered a property validation 545185029Spjd * error. 546185029Spjd */ 547185029Spjd if (!error && spa_suspended(spa)) { 548185029Spjd spa->spa_failmode = intval; 549249195Smm error = SET_ERROR(EIO); 550185029Spjd } 551185029Spjd break; 552185029Spjd 553185029Spjd case ZPOOL_PROP_CACHEFILE: 554185029Spjd if ((error = nvpair_value_string(elem, &strval)) != 0) 555185029Spjd break; 556185029Spjd 557185029Spjd if (strval[0] == '\0') 558185029Spjd break; 559185029Spjd 560185029Spjd if (strcmp(strval, "none") == 0) 561185029Spjd break; 562185029Spjd 563185029Spjd if (strval[0] != '/') { 564249195Smm error = SET_ERROR(EINVAL); 565185029Spjd break; 566185029Spjd } 567185029Spjd 568185029Spjd slash = strrchr(strval, '/'); 569185029Spjd ASSERT(slash != NULL); 570185029Spjd 571185029Spjd if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 572185029Spjd strcmp(slash, "/..") == 0) 573249195Smm error = SET_ERROR(EINVAL); 574185029Spjd break; 575219089Spjd 576228103Smm case ZPOOL_PROP_COMMENT: 577228103Smm if ((error = nvpair_value_string(elem, &strval)) != 0) 578228103Smm break; 579228103Smm for (check = strval; *check != '\0'; check++) { 580228103Smm /* 581228103Smm * The kernel doesn't have an easy isprint() 582228103Smm * check. For this kernel check, we merely 583228103Smm * check ASCII apart from DEL. Fix this if 584228103Smm * there is an easy-to-use kernel isprint(). 
585228103Smm */ 586228103Smm if (*check >= 0x7f) { 587249195Smm error = SET_ERROR(EINVAL); 588228103Smm break; 589228103Smm } 590228103Smm check++; 591228103Smm } 592228103Smm if (strlen(strval) > ZPROP_MAX_COMMENT) 593228103Smm error = E2BIG; 594228103Smm break; 595228103Smm 596219089Spjd case ZPOOL_PROP_DEDUPDITTO: 597219089Spjd if (spa_version(spa) < SPA_VERSION_DEDUP) 598249195Smm error = SET_ERROR(ENOTSUP); 599219089Spjd else 600219089Spjd error = nvpair_value_uint64(elem, &intval); 601219089Spjd if (error == 0 && 602219089Spjd intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 603249195Smm error = SET_ERROR(EINVAL); 604219089Spjd break; 605185029Spjd } 606185029Spjd 607185029Spjd if (error) 608185029Spjd break; 609185029Spjd } 610185029Spjd 611185029Spjd if (!error && reset_bootfs) { 612185029Spjd error = nvlist_remove(props, 613185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 614185029Spjd 615185029Spjd if (!error) { 616185029Spjd error = nvlist_add_uint64(props, 617185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 618185029Spjd } 619185029Spjd } 620185029Spjd 621185029Spjd return (error); 622185029Spjd} 623185029Spjd 624209962Smmvoid 625209962Smmspa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 626209962Smm{ 627209962Smm char *cachefile; 628209962Smm spa_config_dirent_t *dp; 629209962Smm 630209962Smm if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 631209962Smm &cachefile) != 0) 632209962Smm return; 633209962Smm 634209962Smm dp = kmem_alloc(sizeof (spa_config_dirent_t), 635209962Smm KM_SLEEP); 636209962Smm 637209962Smm if (cachefile[0] == '\0') 638209962Smm dp->scd_path = spa_strdup(spa_config_path); 639209962Smm else if (strcmp(cachefile, "none") == 0) 640209962Smm dp->scd_path = NULL; 641209962Smm else 642209962Smm dp->scd_path = spa_strdup(cachefile); 643209962Smm 644209962Smm list_insert_head(&spa->spa_config_list, dp); 645209962Smm if (need_sync) 646209962Smm spa_async_request(spa, 
SPA_ASYNC_CONFIG_UPDATE); 647209962Smm} 648209962Smm 649185029Spjdint 650185029Spjdspa_prop_set(spa_t *spa, nvlist_t *nvp) 651185029Spjd{ 652185029Spjd int error; 653236884Smm nvpair_t *elem = NULL; 654209962Smm boolean_t need_sync = B_FALSE; 655185029Spjd 656185029Spjd if ((error = spa_prop_validate(spa, nvp)) != 0) 657185029Spjd return (error); 658185029Spjd 659209962Smm while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 660236884Smm zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 661209962Smm 662219089Spjd if (prop == ZPOOL_PROP_CACHEFILE || 663219089Spjd prop == ZPOOL_PROP_ALTROOT || 664219089Spjd prop == ZPOOL_PROP_READONLY) 665209962Smm continue; 666209962Smm 667236884Smm if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { 668236884Smm uint64_t ver; 669236884Smm 670236884Smm if (prop == ZPOOL_PROP_VERSION) { 671236884Smm VERIFY(nvpair_value_uint64(elem, &ver) == 0); 672236884Smm } else { 673236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 674236884Smm ver = SPA_VERSION_FEATURES; 675236884Smm need_sync = B_TRUE; 676236884Smm } 677236884Smm 678236884Smm /* Save time if the version is already set. */ 679236884Smm if (ver == spa_version(spa)) 680236884Smm continue; 681236884Smm 682236884Smm /* 683236884Smm * In addition to the pool directory object, we might 684236884Smm * create the pool properties object, the features for 685236884Smm * read object, the features for write object, or the 686236884Smm * feature descriptions object. 
687236884Smm */ 688248571Smm error = dsl_sync_task(spa->spa_name, NULL, 689248571Smm spa_sync_version, &ver, 6); 690236884Smm if (error) 691236884Smm return (error); 692236884Smm continue; 693236884Smm } 694236884Smm 695209962Smm need_sync = B_TRUE; 696209962Smm break; 697209962Smm } 698209962Smm 699236884Smm if (need_sync) { 700248571Smm return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 701248571Smm nvp, 6)); 702236884Smm } 703236884Smm 704236884Smm return (0); 705185029Spjd} 706185029Spjd 707185029Spjd/* 708185029Spjd * If the bootfs property value is dsobj, clear it. 709185029Spjd */ 710185029Spjdvoid 711185029Spjdspa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 712185029Spjd{ 713185029Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 714185029Spjd VERIFY(zap_remove(spa->spa_meta_objset, 715185029Spjd spa->spa_pool_props_object, 716185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 717185029Spjd spa->spa_bootfs = 0; 718185029Spjd } 719185029Spjd} 720185029Spjd 721239620Smm/*ARGSUSED*/ 722239620Smmstatic int 723248571Smmspa_change_guid_check(void *arg, dmu_tx_t *tx) 724239620Smm{ 725248571Smm uint64_t *newguid = arg; 726248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 727239620Smm vdev_t *rvd = spa->spa_root_vdev; 728239620Smm uint64_t vdev_state; 729239620Smm 730239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 731239620Smm vdev_state = rvd->vdev_state; 732239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 733239620Smm 734239620Smm if (vdev_state != VDEV_STATE_HEALTHY) 735249195Smm return (SET_ERROR(ENXIO)); 736239620Smm 737239620Smm ASSERT3U(spa_guid(spa), !=, *newguid); 738239620Smm 739239620Smm return (0); 740239620Smm} 741239620Smm 742239620Smmstatic void 743248571Smmspa_change_guid_sync(void *arg, dmu_tx_t *tx) 744239620Smm{ 745248571Smm uint64_t *newguid = arg; 746248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 747239620Smm uint64_t oldguid; 748239620Smm vdev_t *rvd = spa->spa_root_vdev; 
749239620Smm 750239620Smm oldguid = spa_guid(spa); 751239620Smm 752239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 753239620Smm rvd->vdev_guid = *newguid; 754239620Smm rvd->vdev_guid_sum += (*newguid - oldguid); 755239620Smm vdev_config_dirty(rvd); 756239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 757239620Smm 758248571Smm spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 759239620Smm oldguid, *newguid); 760239620Smm} 761239620Smm 762185029Spjd/* 763228103Smm * Change the GUID for the pool. This is done so that we can later 764228103Smm * re-import a pool built from a clone of our own vdevs. We will modify 765228103Smm * the root vdev's guid, our own pool guid, and then mark all of our 766228103Smm * vdevs dirty. Note that we must make sure that all our vdevs are 767228103Smm * online when we do this, or else any vdevs that weren't present 768228103Smm * would be orphaned from our pool. We are also going to issue a 769228103Smm * sysevent to update any watchers. 
770228103Smm */ 771228103Smmint 772228103Smmspa_change_guid(spa_t *spa) 773228103Smm{ 774239620Smm int error; 775239620Smm uint64_t guid; 776228103Smm 777254074Sdelphij mutex_enter(&spa->spa_vdev_top_lock); 778239620Smm mutex_enter(&spa_namespace_lock); 779239620Smm guid = spa_generate_guid(NULL); 780228103Smm 781248571Smm error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 782248571Smm spa_change_guid_sync, &guid, 5); 783228103Smm 784239620Smm if (error == 0) { 785239620Smm spa_config_sync(spa, B_FALSE, B_TRUE); 786239620Smm spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 787239620Smm } 788228103Smm 789239620Smm mutex_exit(&spa_namespace_lock); 790254074Sdelphij mutex_exit(&spa->spa_vdev_top_lock); 791228103Smm 792239620Smm return (error); 793228103Smm} 794228103Smm 795228103Smm/* 796185029Spjd * ========================================================================== 797168404Spjd * SPA state manipulation (open/create/destroy/import/export) 798168404Spjd * ========================================================================== 799168404Spjd */ 800168404Spjd 801168404Spjdstatic int 802168404Spjdspa_error_entry_compare(const void *a, const void *b) 803168404Spjd{ 804168404Spjd spa_error_entry_t *sa = (spa_error_entry_t *)a; 805168404Spjd spa_error_entry_t *sb = (spa_error_entry_t *)b; 806168404Spjd int ret; 807168404Spjd 808168404Spjd ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 809168404Spjd sizeof (zbookmark_t)); 810168404Spjd 811168404Spjd if (ret < 0) 812168404Spjd return (-1); 813168404Spjd else if (ret > 0) 814168404Spjd return (1); 815168404Spjd else 816168404Spjd return (0); 817168404Spjd} 818168404Spjd 819168404Spjd/* 820168404Spjd * Utility function which retrieves copies of the current logs and 821168404Spjd * re-initializes them in the process. 
822168404Spjd */ 823168404Spjdvoid 824168404Spjdspa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 825168404Spjd{ 826168404Spjd ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 827168404Spjd 828168404Spjd bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 829168404Spjd bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 830168404Spjd 831168404Spjd avl_create(&spa->spa_errlist_scrub, 832168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 833168404Spjd offsetof(spa_error_entry_t, se_avl)); 834168404Spjd avl_create(&spa->spa_errlist_last, 835168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 836168404Spjd offsetof(spa_error_entry_t, se_avl)); 837168404Spjd} 838168404Spjd 839258631Savgstatic void 840258631Savgspa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 841168404Spjd{ 842258631Savg const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 843258631Savg enum zti_modes mode = ztip->zti_mode; 844258631Savg uint_t value = ztip->zti_value; 845258631Savg uint_t count = ztip->zti_count; 846258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 847258631Savg char name[32]; 848258630Savg uint_t flags = 0; 849219089Spjd boolean_t batch = B_FALSE; 850168404Spjd 851258631Savg if (mode == ZTI_MODE_NULL) { 852258631Savg tqs->stqs_count = 0; 853258631Savg tqs->stqs_taskq = NULL; 854258631Savg return; 855258631Savg } 856168404Spjd 857258631Savg ASSERT3U(count, >, 0); 858168404Spjd 859258631Savg tqs->stqs_count = count; 860258631Savg tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 861219089Spjd 862258631Savg for (uint_t i = 0; i < count; i++) { 863258631Savg taskq_t *tq; 864219089Spjd 865258631Savg switch (mode) { 866258631Savg case ZTI_MODE_FIXED: 867258631Savg ASSERT3U(value, >=, 1); 868258631Savg value = MAX(value, 1); 869258631Savg break; 870219089Spjd 871258631Savg case ZTI_MODE_BATCH: 872258631Savg batch = B_TRUE; 873258631Savg flags |= TASKQ_THREADS_CPU_PCT; 874258631Savg value = 
zio_taskq_batch_pct; 875258631Savg break; 876258631Savg 877258631Savg case ZTI_MODE_ONLINE_PERCENT: 878258631Savg flags |= TASKQ_THREADS_CPU_PCT; 879258631Savg break; 880258631Savg 881258631Savg default: 882258631Savg panic("unrecognized mode for %s_%s taskq (%u:%u) in " 883258631Savg "spa_activate()", 884258631Savg zio_type_name[t], zio_taskq_types[q], mode, value); 885258631Savg break; 886258631Savg } 887258631Savg 888258631Savg if (count > 1) { 889258631Savg (void) snprintf(name, sizeof (name), "%s_%s_%u", 890258631Savg zio_type_name[t], zio_taskq_types[q], i); 891258631Savg } else { 892258631Savg (void) snprintf(name, sizeof (name), "%s_%s", 893258631Savg zio_type_name[t], zio_taskq_types[q]); 894258631Savg } 895258631Savg 896219089Spjd#ifdef SYSDC 897258631Savg if (zio_taskq_sysdc && spa->spa_proc != &p0) { 898258631Savg if (batch) 899258631Savg flags |= TASKQ_DC_BATCH; 900219089Spjd 901258631Savg tq = taskq_create_sysdc(name, value, 50, INT_MAX, 902258631Savg spa->spa_proc, zio_taskq_basedc, flags); 903258631Savg } else { 904258631Savg#endif 905258631Savg tq = taskq_create_proc(name, value, maxclsyspri, 50, 906258631Savg INT_MAX, spa->spa_proc, flags); 907258631Savg#ifdef SYSDC 908258631Savg } 909258631Savg#endif 910258631Savg 911258631Savg tqs->stqs_taskq[i] = tq; 912219089Spjd } 913219089Spjd} 914219089Spjd 915219089Spjdstatic void 916258631Savgspa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 917258631Savg{ 918258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 919258631Savg 920258631Savg if (tqs->stqs_taskq == NULL) { 921258631Savg ASSERT0(tqs->stqs_count); 922258631Savg return; 923258631Savg } 924258631Savg 925258631Savg for (uint_t i = 0; i < tqs->stqs_count; i++) { 926258631Savg ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 927258631Savg taskq_destroy(tqs->stqs_taskq[i]); 928258631Savg } 929258631Savg 930258631Savg kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 931258631Savg tqs->stqs_taskq = NULL; 932258631Savg} 
933258631Savg 934258631Savg/* 935258631Savg * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 936258631Savg * Note that a type may have multiple discrete taskqs to avoid lock contention 937258631Savg * on the taskq itself. In that case we choose which taskq at random by using 938258631Savg * the low bits of gethrtime(). 939258631Savg */ 940258631Savgvoid 941258631Savgspa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 942258631Savg task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) 943258631Savg{ 944258631Savg spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 945258631Savg taskq_t *tq; 946258631Savg 947258631Savg ASSERT3P(tqs->stqs_taskq, !=, NULL); 948258631Savg ASSERT3U(tqs->stqs_count, !=, 0); 949258631Savg 950258631Savg if (tqs->stqs_count == 1) { 951258631Savg tq = tqs->stqs_taskq[0]; 952258631Savg } else { 953258631Savg tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; 954258631Savg } 955258631Savg 956258631Savg taskq_dispatch_ent(tq, func, arg, flags, ent); 957258631Savg} 958258631Savg 959258631Savgstatic void 960219089Spjdspa_create_zio_taskqs(spa_t *spa) 961219089Spjd{ 962185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 963185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 964258631Savg spa_taskqs_init(spa, t, q); 965219089Spjd } 966219089Spjd } 967219089Spjd} 968209962Smm 969219089Spjd#ifdef _KERNEL 970219089Spjd#ifdef SPA_PROCESS 971219089Spjdstatic void 972219089Spjdspa_thread(void *arg) 973219089Spjd{ 974219089Spjd callb_cpr_t cprinfo; 975209962Smm 976219089Spjd spa_t *spa = arg; 977219089Spjd user_t *pu = PTOU(curproc); 978209962Smm 979219089Spjd CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 980219089Spjd spa->spa_name); 981209962Smm 982219089Spjd ASSERT(curproc != &p0); 983219089Spjd (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 984219089Spjd "zpool-%s", spa->spa_name); 985219089Spjd (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 986211931Smm 
987219089Spjd#ifdef PSRSET_BIND 988219089Spjd /* bind this thread to the requested psrset */ 989219089Spjd if (zio_taskq_psrset_bind != PS_NONE) { 990219089Spjd pool_lock(); 991219089Spjd mutex_enter(&cpu_lock); 992219089Spjd mutex_enter(&pidlock); 993219089Spjd mutex_enter(&curproc->p_lock); 994219089Spjd 995219089Spjd if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 996219089Spjd 0, NULL, NULL) == 0) { 997219089Spjd curthread->t_bind_pset = zio_taskq_psrset_bind; 998219089Spjd } else { 999219089Spjd cmn_err(CE_WARN, 1000219089Spjd "Couldn't bind process for zfs pool \"%s\" to " 1001219089Spjd "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1002219089Spjd } 1003219089Spjd 1004219089Spjd mutex_exit(&curproc->p_lock); 1005219089Spjd mutex_exit(&pidlock); 1006219089Spjd mutex_exit(&cpu_lock); 1007219089Spjd pool_unlock(); 1008219089Spjd } 1009219089Spjd#endif 1010219089Spjd 1011219089Spjd#ifdef SYSDC 1012219089Spjd if (zio_taskq_sysdc) { 1013219089Spjd sysdc_thread_enter(curthread, 100, 0); 1014219089Spjd } 1015219089Spjd#endif 1016219089Spjd 1017219089Spjd spa->spa_proc = curproc; 1018219089Spjd spa->spa_did = curthread->t_did; 1019219089Spjd 1020219089Spjd spa_create_zio_taskqs(spa); 1021219089Spjd 1022219089Spjd mutex_enter(&spa->spa_proc_lock); 1023219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1024219089Spjd 1025219089Spjd spa->spa_proc_state = SPA_PROC_ACTIVE; 1026219089Spjd cv_broadcast(&spa->spa_proc_cv); 1027219089Spjd 1028219089Spjd CALLB_CPR_SAFE_BEGIN(&cprinfo); 1029219089Spjd while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1030219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1031219089Spjd CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1032219089Spjd 1033219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1034219089Spjd spa->spa_proc_state = SPA_PROC_GONE; 1035219089Spjd spa->spa_proc = &p0; 1036219089Spjd cv_broadcast(&spa->spa_proc_cv); 1037219089Spjd CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 
1038219089Spjd 1039219089Spjd mutex_enter(&curproc->p_lock); 1040219089Spjd lwp_exit(); 1041219089Spjd} 1042219089Spjd#endif /* SPA_PROCESS */ 1043219089Spjd#endif 1044219089Spjd 1045219089Spjd/* 1046219089Spjd * Activate an uninitialized pool. 1047219089Spjd */ 1048219089Spjdstatic void 1049219089Spjdspa_activate(spa_t *spa, int mode) 1050219089Spjd{ 1051219089Spjd ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1052219089Spjd 1053219089Spjd spa->spa_state = POOL_STATE_ACTIVE; 1054219089Spjd spa->spa_mode = mode; 1055219089Spjd 1056219089Spjd spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 1057219089Spjd spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 1058219089Spjd 1059219089Spjd /* Try to create a covering process */ 1060219089Spjd mutex_enter(&spa->spa_proc_lock); 1061219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1062219089Spjd ASSERT(spa->spa_proc == &p0); 1063219089Spjd spa->spa_did = 0; 1064219089Spjd 1065219089Spjd#ifdef SPA_PROCESS 1066219089Spjd /* Only create a process if we're going to be around a while. 
*/ 1067219089Spjd if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1068219089Spjd if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1069219089Spjd NULL, 0) == 0) { 1070219089Spjd spa->spa_proc_state = SPA_PROC_CREATED; 1071219089Spjd while (spa->spa_proc_state == SPA_PROC_CREATED) { 1072219089Spjd cv_wait(&spa->spa_proc_cv, 1073219089Spjd &spa->spa_proc_lock); 1074209962Smm } 1075219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1076219089Spjd ASSERT(spa->spa_proc != &p0); 1077219089Spjd ASSERT(spa->spa_did != 0); 1078219089Spjd } else { 1079219089Spjd#ifdef _KERNEL 1080219089Spjd cmn_err(CE_WARN, 1081219089Spjd "Couldn't create process for zfs pool \"%s\"\n", 1082219089Spjd spa->spa_name); 1083219089Spjd#endif 1084185029Spjd } 1085168404Spjd } 1086219089Spjd#endif /* SPA_PROCESS */ 1087219089Spjd mutex_exit(&spa->spa_proc_lock); 1088168404Spjd 1089219089Spjd /* If we didn't create a process, we need to create our taskqs. */ 1090219089Spjd ASSERT(spa->spa_proc == &p0); 1091219089Spjd if (spa->spa_proc == &p0) { 1092219089Spjd spa_create_zio_taskqs(spa); 1093219089Spjd } 1094219089Spjd 1095240868Spjd /* 1096240868Spjd * Start TRIM thread. 
1097240868Spjd */ 1098240868Spjd trim_thread_create(spa); 1099240868Spjd 1100185029Spjd list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1101185029Spjd offsetof(vdev_t, vdev_config_dirty_node)); 1102185029Spjd list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1103185029Spjd offsetof(vdev_t, vdev_state_dirty_node)); 1104168404Spjd 1105168404Spjd txg_list_create(&spa->spa_vdev_txg_list, 1106168404Spjd offsetof(struct vdev, vdev_txg_node)); 1107168404Spjd 1108168404Spjd avl_create(&spa->spa_errlist_scrub, 1109168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 1110168404Spjd offsetof(spa_error_entry_t, se_avl)); 1111168404Spjd avl_create(&spa->spa_errlist_last, 1112168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 1113168404Spjd offsetof(spa_error_entry_t, se_avl)); 1114168404Spjd} 1115168404Spjd 1116168404Spjd/* 1117168404Spjd * Opposite of spa_activate(). 1118168404Spjd */ 1119168404Spjdstatic void 1120168404Spjdspa_deactivate(spa_t *spa) 1121168404Spjd{ 1122168404Spjd ASSERT(spa->spa_sync_on == B_FALSE); 1123168404Spjd ASSERT(spa->spa_dsl_pool == NULL); 1124168404Spjd ASSERT(spa->spa_root_vdev == NULL); 1125209962Smm ASSERT(spa->spa_async_zio_root == NULL); 1126168404Spjd ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1127168404Spjd 1128240868Spjd /* 1129240868Spjd * Stop TRIM thread in case spa_unload() wasn't called directly 1130240868Spjd * before spa_deactivate(). 
1131240868Spjd */ 1132240868Spjd trim_thread_destroy(spa); 1133240868Spjd 1134168404Spjd txg_list_destroy(&spa->spa_vdev_txg_list); 1135168404Spjd 1136185029Spjd list_destroy(&spa->spa_config_dirty_list); 1137185029Spjd list_destroy(&spa->spa_state_dirty_list); 1138168404Spjd 1139185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 1140185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1141258631Savg spa_taskqs_fini(spa, t, q); 1142185029Spjd } 1143168404Spjd } 1144168404Spjd 1145168404Spjd metaslab_class_destroy(spa->spa_normal_class); 1146168404Spjd spa->spa_normal_class = NULL; 1147168404Spjd 1148185029Spjd metaslab_class_destroy(spa->spa_log_class); 1149185029Spjd spa->spa_log_class = NULL; 1150185029Spjd 1151168404Spjd /* 1152168404Spjd * If this was part of an import or the open otherwise failed, we may 1153168404Spjd * still have errors left in the queues. Empty them just in case. 1154168404Spjd */ 1155168404Spjd spa_errlog_drain(spa); 1156168404Spjd 1157168404Spjd avl_destroy(&spa->spa_errlist_scrub); 1158168404Spjd avl_destroy(&spa->spa_errlist_last); 1159168404Spjd 1160168404Spjd spa->spa_state = POOL_STATE_UNINITIALIZED; 1161219089Spjd 1162219089Spjd mutex_enter(&spa->spa_proc_lock); 1163219089Spjd if (spa->spa_proc_state != SPA_PROC_NONE) { 1164219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1165219089Spjd spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1166219089Spjd cv_broadcast(&spa->spa_proc_cv); 1167219089Spjd while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1168219089Spjd ASSERT(spa->spa_proc != &p0); 1169219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1170219089Spjd } 1171219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1172219089Spjd spa->spa_proc_state = SPA_PROC_NONE; 1173219089Spjd } 1174219089Spjd ASSERT(spa->spa_proc == &p0); 1175219089Spjd mutex_exit(&spa->spa_proc_lock); 1176219089Spjd 1177219089Spjd#ifdef SPA_PROCESS 1178219089Spjd /* 1179219089Spjd * We want to make sure spa_thread() has actually exited the 
ZFS 1180219089Spjd * module, so that the module can't be unloaded out from underneath 1181219089Spjd * it. 1182219089Spjd */ 1183219089Spjd if (spa->spa_did != 0) { 1184219089Spjd thread_join(spa->spa_did); 1185219089Spjd spa->spa_did = 0; 1186219089Spjd } 1187219089Spjd#endif /* SPA_PROCESS */ 1188168404Spjd} 1189168404Spjd 1190168404Spjd/* 1191168404Spjd * Verify a pool configuration, and construct the vdev tree appropriately. This 1192168404Spjd * will create all the necessary vdevs in the appropriate layout, with each vdev 1193168404Spjd * in the CLOSED state. This will prep the pool before open/creation/import. 1194168404Spjd * All vdev validation is done by the vdev_alloc() routine. 1195168404Spjd */ 1196168404Spjdstatic int 1197168404Spjdspa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1198168404Spjd uint_t id, int atype) 1199168404Spjd{ 1200168404Spjd nvlist_t **child; 1201219089Spjd uint_t children; 1202168404Spjd int error; 1203168404Spjd 1204168404Spjd if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1205168404Spjd return (error); 1206168404Spjd 1207168404Spjd if ((*vdp)->vdev_ops->vdev_op_leaf) 1208168404Spjd return (0); 1209168404Spjd 1210185029Spjd error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1211185029Spjd &child, &children); 1212185029Spjd 1213185029Spjd if (error == ENOENT) 1214185029Spjd return (0); 1215185029Spjd 1216185029Spjd if (error) { 1217168404Spjd vdev_free(*vdp); 1218168404Spjd *vdp = NULL; 1219249195Smm return (SET_ERROR(EINVAL)); 1220168404Spjd } 1221168404Spjd 1222219089Spjd for (int c = 0; c < children; c++) { 1223168404Spjd vdev_t *vd; 1224168404Spjd if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1225168404Spjd atype)) != 0) { 1226168404Spjd vdev_free(*vdp); 1227168404Spjd *vdp = NULL; 1228168404Spjd return (error); 1229168404Spjd } 1230168404Spjd } 1231168404Spjd 1232168404Spjd ASSERT(*vdp != NULL); 1233168404Spjd 1234168404Spjd return (0); 1235168404Spjd} 
1236168404Spjd 1237168404Spjd/* 1238168404Spjd * Opposite of spa_load(). 1239168404Spjd */ 1240168404Spjdstatic void 1241168404Spjdspa_unload(spa_t *spa) 1242168404Spjd{ 1243168404Spjd int i; 1244168404Spjd 1245185029Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1246185029Spjd 1247168404Spjd /* 1248240868Spjd * Stop TRIM thread. 1249240868Spjd */ 1250240868Spjd trim_thread_destroy(spa); 1251240868Spjd 1252240868Spjd /* 1253168404Spjd * Stop async tasks. 1254168404Spjd */ 1255168404Spjd spa_async_suspend(spa); 1256168404Spjd 1257168404Spjd /* 1258168404Spjd * Stop syncing. 1259168404Spjd */ 1260168404Spjd if (spa->spa_sync_on) { 1261168404Spjd txg_sync_stop(spa->spa_dsl_pool); 1262168404Spjd spa->spa_sync_on = B_FALSE; 1263168404Spjd } 1264168404Spjd 1265168404Spjd /* 1266185029Spjd * Wait for any outstanding async I/O to complete. 1267168404Spjd */ 1268209962Smm if (spa->spa_async_zio_root != NULL) { 1269209962Smm (void) zio_wait(spa->spa_async_zio_root); 1270209962Smm spa->spa_async_zio_root = NULL; 1271209962Smm } 1272168404Spjd 1273219089Spjd bpobj_close(&spa->spa_deferred_bpobj); 1274219089Spjd 1275168404Spjd /* 1276168404Spjd * Close the dsl pool. 1277168404Spjd */ 1278168404Spjd if (spa->spa_dsl_pool) { 1279168404Spjd dsl_pool_close(spa->spa_dsl_pool); 1280168404Spjd spa->spa_dsl_pool = NULL; 1281219089Spjd spa->spa_meta_objset = NULL; 1282168404Spjd } 1283168404Spjd 1284219089Spjd ddt_unload(spa); 1285219089Spjd 1286209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1287209962Smm 1288168404Spjd /* 1289209962Smm * Drop and purge level 2 cache 1290209962Smm */ 1291209962Smm spa_l2cache_drop(spa); 1292209962Smm 1293209962Smm /* 1294168404Spjd * Close all vdevs. 
1295168404Spjd */ 1296168404Spjd if (spa->spa_root_vdev) 1297168404Spjd vdev_free(spa->spa_root_vdev); 1298168404Spjd ASSERT(spa->spa_root_vdev == NULL); 1299168404Spjd 1300185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1301185029Spjd vdev_free(spa->spa_spares.sav_vdevs[i]); 1302185029Spjd if (spa->spa_spares.sav_vdevs) { 1303185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1304185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1305185029Spjd spa->spa_spares.sav_vdevs = NULL; 1306168404Spjd } 1307185029Spjd if (spa->spa_spares.sav_config) { 1308185029Spjd nvlist_free(spa->spa_spares.sav_config); 1309185029Spjd spa->spa_spares.sav_config = NULL; 1310168404Spjd } 1311185029Spjd spa->spa_spares.sav_count = 0; 1312168404Spjd 1313230514Smm for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 1314230514Smm vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1315185029Spjd vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1316230514Smm } 1317185029Spjd if (spa->spa_l2cache.sav_vdevs) { 1318185029Spjd kmem_free(spa->spa_l2cache.sav_vdevs, 1319185029Spjd spa->spa_l2cache.sav_count * sizeof (void *)); 1320185029Spjd spa->spa_l2cache.sav_vdevs = NULL; 1321185029Spjd } 1322185029Spjd if (spa->spa_l2cache.sav_config) { 1323185029Spjd nvlist_free(spa->spa_l2cache.sav_config); 1324185029Spjd spa->spa_l2cache.sav_config = NULL; 1325185029Spjd } 1326185029Spjd spa->spa_l2cache.sav_count = 0; 1327185029Spjd 1328168404Spjd spa->spa_async_suspended = 0; 1329209962Smm 1330228103Smm if (spa->spa_comment != NULL) { 1331228103Smm spa_strfree(spa->spa_comment); 1332228103Smm spa->spa_comment = NULL; 1333228103Smm } 1334228103Smm 1335209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 1336168404Spjd} 1337168404Spjd 1338168404Spjd/* 1339168404Spjd * Load (or re-load) the current list of vdevs describing the active spares for 1340168404Spjd * this pool. When this is called, we have some form of basic information in 1341185029Spjd * 'spa_spares.sav_config'. 
We parse this into vdevs, try to open them, and 1342185029Spjd * then re-generate a more complete list including status information. 1343168404Spjd */ 1344168404Spjdstatic void 1345168404Spjdspa_load_spares(spa_t *spa) 1346168404Spjd{ 1347168404Spjd nvlist_t **spares; 1348168404Spjd uint_t nspares; 1349168404Spjd int i; 1350168404Spjd vdev_t *vd, *tvd; 1351168404Spjd 1352185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1353185029Spjd 1354168404Spjd /* 1355168404Spjd * First, close and free any existing spare vdevs. 1356168404Spjd */ 1357185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1358185029Spjd vd = spa->spa_spares.sav_vdevs[i]; 1359168404Spjd 1360168404Spjd /* Undo the call to spa_activate() below */ 1361185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1362185029Spjd B_FALSE)) != NULL && tvd->vdev_isspare) 1363168404Spjd spa_spare_remove(tvd); 1364168404Spjd vdev_close(vd); 1365168404Spjd vdev_free(vd); 1366168404Spjd } 1367168404Spjd 1368185029Spjd if (spa->spa_spares.sav_vdevs) 1369185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1370185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1371168404Spjd 1372185029Spjd if (spa->spa_spares.sav_config == NULL) 1373168404Spjd nspares = 0; 1374168404Spjd else 1375185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1376168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1377168404Spjd 1378185029Spjd spa->spa_spares.sav_count = (int)nspares; 1379185029Spjd spa->spa_spares.sav_vdevs = NULL; 1380168404Spjd 1381168404Spjd if (nspares == 0) 1382168404Spjd return; 1383168404Spjd 1384168404Spjd /* 1385168404Spjd * Construct the array of vdevs, opening them to get status in the 1386168404Spjd * process. 
For each spare, there is potentially two different vdev_t 1387168404Spjd * structures associated with it: one in the list of spares (used only 1388168404Spjd * for basic validation purposes) and one in the active vdev 1389168404Spjd * configuration (if it's spared in). During this phase we open and 1390168404Spjd * validate each vdev on the spare list. If the vdev also exists in the 1391168404Spjd * active configuration, then we also mark this vdev as an active spare. 1392168404Spjd */ 1393185029Spjd spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1394185029Spjd KM_SLEEP); 1395185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1396168404Spjd VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1397168404Spjd VDEV_ALLOC_SPARE) == 0); 1398168404Spjd ASSERT(vd != NULL); 1399168404Spjd 1400185029Spjd spa->spa_spares.sav_vdevs[i] = vd; 1401168404Spjd 1402185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1403185029Spjd B_FALSE)) != NULL) { 1404168404Spjd if (!tvd->vdev_isspare) 1405168404Spjd spa_spare_add(tvd); 1406168404Spjd 1407168404Spjd /* 1408168404Spjd * We only mark the spare active if we were successfully 1409168404Spjd * able to load the vdev. Otherwise, importing a pool 1410168404Spjd * with a bad active spare would result in strange 1411168404Spjd * behavior, because multiple pool would think the spare 1412168404Spjd * is actively in use. 1413168404Spjd * 1414168404Spjd * There is a vulnerability here to an equally bizarre 1415168404Spjd * circumstance, where a dead active spare is later 1416168404Spjd * brought back to life (onlined or otherwise). Given 1417168404Spjd * the rarity of this scenario, and the extra complexity 1418168404Spjd * it adds, we ignore the possibility. 
1419168404Spjd */ 1420168404Spjd if (!vdev_is_dead(tvd)) 1421168404Spjd spa_spare_activate(tvd); 1422168404Spjd } 1423168404Spjd 1424185029Spjd vd->vdev_top = vd; 1425209962Smm vd->vdev_aux = &spa->spa_spares; 1426185029Spjd 1427168404Spjd if (vdev_open(vd) != 0) 1428168404Spjd continue; 1429168404Spjd 1430185029Spjd if (vdev_validate_aux(vd) == 0) 1431185029Spjd spa_spare_add(vd); 1432168404Spjd } 1433168404Spjd 1434168404Spjd /* 1435168404Spjd * Recompute the stashed list of spares, with status information 1436168404Spjd * this time. 1437168404Spjd */ 1438185029Spjd VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1439168404Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1440168404Spjd 1441185029Spjd spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1442185029Spjd KM_SLEEP); 1443185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1444185029Spjd spares[i] = vdev_config_generate(spa, 1445219089Spjd spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1446185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1447185029Spjd ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1448185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1449168404Spjd nvlist_free(spares[i]); 1450185029Spjd kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1451168404Spjd} 1452168404Spjd 1453185029Spjd/* 1454185029Spjd * Load (or re-load) the current list of vdevs describing the active l2cache for 1455185029Spjd * this pool. When this is called, we have some form of basic information in 1456185029Spjd * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1457185029Spjd * then re-generate a more complete list including status information. 1458185029Spjd * Devices which are already active have their details maintained, and are 1459185029Spjd * not re-opened. 
1460185029Spjd */ 1461185029Spjdstatic void 1462185029Spjdspa_load_l2cache(spa_t *spa) 1463185029Spjd{ 1464185029Spjd nvlist_t **l2cache; 1465185029Spjd uint_t nl2cache; 1466185029Spjd int i, j, oldnvdevs; 1467219089Spjd uint64_t guid; 1468185029Spjd vdev_t *vd, **oldvdevs, **newvdevs; 1469185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 1470185029Spjd 1471185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1472185029Spjd 1473185029Spjd if (sav->sav_config != NULL) { 1474185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1475185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1476185029Spjd newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1477185029Spjd } else { 1478185029Spjd nl2cache = 0; 1479247187Smm newvdevs = NULL; 1480185029Spjd } 1481185029Spjd 1482185029Spjd oldvdevs = sav->sav_vdevs; 1483185029Spjd oldnvdevs = sav->sav_count; 1484185029Spjd sav->sav_vdevs = NULL; 1485185029Spjd sav->sav_count = 0; 1486185029Spjd 1487185029Spjd /* 1488185029Spjd * Process new nvlist of vdevs. 1489185029Spjd */ 1490185029Spjd for (i = 0; i < nl2cache; i++) { 1491185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1492185029Spjd &guid) == 0); 1493185029Spjd 1494185029Spjd newvdevs[i] = NULL; 1495185029Spjd for (j = 0; j < oldnvdevs; j++) { 1496185029Spjd vd = oldvdevs[j]; 1497185029Spjd if (vd != NULL && guid == vd->vdev_guid) { 1498185029Spjd /* 1499185029Spjd * Retain previous vdev for add/remove ops. 
1500185029Spjd */ 1501185029Spjd newvdevs[i] = vd; 1502185029Spjd oldvdevs[j] = NULL; 1503185029Spjd break; 1504185029Spjd } 1505185029Spjd } 1506185029Spjd 1507185029Spjd if (newvdevs[i] == NULL) { 1508185029Spjd /* 1509185029Spjd * Create new vdev 1510185029Spjd */ 1511185029Spjd VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1512185029Spjd VDEV_ALLOC_L2CACHE) == 0); 1513185029Spjd ASSERT(vd != NULL); 1514185029Spjd newvdevs[i] = vd; 1515185029Spjd 1516185029Spjd /* 1517185029Spjd * Commit this vdev as an l2cache device, 1518185029Spjd * even if it fails to open. 1519185029Spjd */ 1520185029Spjd spa_l2cache_add(vd); 1521185029Spjd 1522185029Spjd vd->vdev_top = vd; 1523185029Spjd vd->vdev_aux = sav; 1524185029Spjd 1525185029Spjd spa_l2cache_activate(vd); 1526185029Spjd 1527185029Spjd if (vdev_open(vd) != 0) 1528185029Spjd continue; 1529185029Spjd 1530185029Spjd (void) vdev_validate_aux(vd); 1531185029Spjd 1532219089Spjd if (!vdev_is_dead(vd)) 1533219089Spjd l2arc_add_vdev(spa, vd); 1534185029Spjd } 1535185029Spjd } 1536185029Spjd 1537185029Spjd /* 1538185029Spjd * Purge vdevs that were dropped 1539185029Spjd */ 1540185029Spjd for (i = 0; i < oldnvdevs; i++) { 1541185029Spjd uint64_t pool; 1542185029Spjd 1543185029Spjd vd = oldvdevs[i]; 1544185029Spjd if (vd != NULL) { 1545230514Smm ASSERT(vd->vdev_isl2cache); 1546230514Smm 1547209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1548209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1549185029Spjd l2arc_remove_vdev(vd); 1550230514Smm vdev_clear_stats(vd); 1551230514Smm vdev_free(vd); 1552185029Spjd } 1553185029Spjd } 1554185029Spjd 1555185029Spjd if (oldvdevs) 1556185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1557185029Spjd 1558185029Spjd if (sav->sav_config == NULL) 1559185029Spjd goto out; 1560185029Spjd 1561185029Spjd sav->sav_vdevs = newvdevs; 1562185029Spjd sav->sav_count = (int)nl2cache; 1563185029Spjd 1564185029Spjd /* 1565185029Spjd * Recompute the stashed list of l2cache 
devices, with status 1566185029Spjd * information this time. 1567185029Spjd */ 1568185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1569185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1570185029Spjd 1571185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1572185029Spjd for (i = 0; i < sav->sav_count; i++) 1573185029Spjd l2cache[i] = vdev_config_generate(spa, 1574219089Spjd sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1575185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1576185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1577185029Spjdout: 1578185029Spjd for (i = 0; i < sav->sav_count; i++) 1579185029Spjd nvlist_free(l2cache[i]); 1580185029Spjd if (sav->sav_count) 1581185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1582185029Spjd} 1583185029Spjd 1584168404Spjdstatic int 1585168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1586168404Spjd{ 1587168404Spjd dmu_buf_t *db; 1588168404Spjd char *packed = NULL; 1589168404Spjd size_t nvsize = 0; 1590168404Spjd int error; 1591168404Spjd *value = NULL; 1592168404Spjd 1593168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1594168404Spjd nvsize = *(uint64_t *)db->db_data; 1595168404Spjd dmu_buf_rele(db, FTAG); 1596168404Spjd 1597168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1598209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1599209962Smm DMU_READ_PREFETCH); 1600168404Spjd if (error == 0) 1601168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1602168404Spjd kmem_free(packed, nvsize); 1603168404Spjd 1604168404Spjd return (error); 1605168404Spjd} 1606168404Spjd 1607168404Spjd/* 1608185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1609185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 
1610185029Spjd */ 1611185029Spjdstatic void 1612185029Spjdspa_check_removed(vdev_t *vd) 1613185029Spjd{ 1614219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1615185029Spjd spa_check_removed(vd->vdev_child[c]); 1616185029Spjd 1617249188Smm if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 1618249188Smm !vd->vdev_ishole) { 1619185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1620185029Spjd spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1621185029Spjd } 1622185029Spjd} 1623185029Spjd 1624185029Spjd/* 1625219089Spjd * Validate the current config against the MOS config 1626213197Smm */ 1627219089Spjdstatic boolean_t 1628219089Spjdspa_config_valid(spa_t *spa, nvlist_t *config) 1629213197Smm{ 1630219089Spjd vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1631219089Spjd nvlist_t *nv; 1632213197Smm 1633219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1634213197Smm 1635219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1636219089Spjd VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1637219089Spjd 1638219089Spjd ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1639219089Spjd 1640219089Spjd /* 1641219089Spjd * If we're doing a normal import, then build up any additional 1642219089Spjd * diagnostic information about missing devices in this config. 1643219089Spjd * We'll pass this up to the user for further processing. 
1644219089Spjd */ 1645219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1646219089Spjd nvlist_t **child, *nv; 1647219089Spjd uint64_t idx = 0; 1648219089Spjd 1649219089Spjd child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1650219089Spjd KM_SLEEP); 1651219089Spjd VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1652219089Spjd 1653219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1654219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1655219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1656219089Spjd 1657219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1658219089Spjd mtvd->vdev_ops != &vdev_missing_ops && 1659219089Spjd mtvd->vdev_islog) 1660219089Spjd child[idx++] = vdev_config_generate(spa, mtvd, 1661219089Spjd B_FALSE, 0); 1662219089Spjd } 1663219089Spjd 1664219089Spjd if (idx) { 1665219089Spjd VERIFY(nvlist_add_nvlist_array(nv, 1666219089Spjd ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1667219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1668219089Spjd ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1669219089Spjd 1670219089Spjd for (int i = 0; i < idx; i++) 1671219089Spjd nvlist_free(child[i]); 1672219089Spjd } 1673219089Spjd nvlist_free(nv); 1674219089Spjd kmem_free(child, rvd->vdev_children * sizeof (char **)); 1675219089Spjd } 1676219089Spjd 1677219089Spjd /* 1678219089Spjd * Compare the root vdev tree with the information we have 1679219089Spjd * from the MOS config (mrvd). Check each top-level vdev 1680219089Spjd * with the corresponding MOS config top-level (mtvd). 1681219089Spjd */ 1682219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1683213197Smm vdev_t *tvd = rvd->vdev_child[c]; 1684219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1685213197Smm 1686219089Spjd /* 1687219089Spjd * Resolve any "missing" vdevs in the current configuration. 1688219089Spjd * If we find that the MOS config has more accurate information 1689219089Spjd * about the top-level vdev then use that vdev instead. 
1690219089Spjd */ 1691219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1692219089Spjd mtvd->vdev_ops != &vdev_missing_ops) { 1693219089Spjd 1694219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1695219089Spjd continue; 1696219089Spjd 1697219089Spjd /* 1698219089Spjd * Device specific actions. 1699219089Spjd */ 1700219089Spjd if (mtvd->vdev_islog) { 1701219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 1702219089Spjd } else { 1703219089Spjd /* 1704219089Spjd * XXX - once we have 'readonly' pool 1705219089Spjd * support we should be able to handle 1706219089Spjd * missing data devices by transitioning 1707219089Spjd * the pool to readonly. 1708219089Spjd */ 1709219089Spjd continue; 1710219089Spjd } 1711219089Spjd 1712219089Spjd /* 1713219089Spjd * Swap the missing vdev with the data we were 1714219089Spjd * able to obtain from the MOS config. 1715219089Spjd */ 1716219089Spjd vdev_remove_child(rvd, tvd); 1717219089Spjd vdev_remove_child(mrvd, mtvd); 1718219089Spjd 1719219089Spjd vdev_add_child(rvd, mtvd); 1720219089Spjd vdev_add_child(mrvd, tvd); 1721219089Spjd 1722219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1723219089Spjd vdev_load(mtvd); 1724219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1725219089Spjd 1726219089Spjd vdev_reopen(rvd); 1727219089Spjd } else if (mtvd->vdev_islog) { 1728219089Spjd /* 1729219089Spjd * Load the slog device's state from the MOS config 1730219089Spjd * since it's possible that the label does not 1731219089Spjd * contain the most up-to-date information. 1732219089Spjd */ 1733219089Spjd vdev_load_log_state(tvd, mtvd); 1734219089Spjd vdev_reopen(tvd); 1735219089Spjd } 1736213197Smm } 1737219089Spjd vdev_free(mrvd); 1738219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1739219089Spjd 1740219089Spjd /* 1741219089Spjd * Ensure we were able to validate the config. 
1742219089Spjd */ 1743219089Spjd return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1744213197Smm} 1745213197Smm 1746213197Smm/* 1747185029Spjd * Check for missing log devices 1748185029Spjd */ 1749248571Smmstatic boolean_t 1750185029Spjdspa_check_logs(spa_t *spa) 1751185029Spjd{ 1752248571Smm boolean_t rv = B_FALSE; 1753248571Smm 1754185029Spjd switch (spa->spa_log_state) { 1755185029Spjd case SPA_LOG_MISSING: 1756185029Spjd /* need to recheck in case slog has been restored */ 1757185029Spjd case SPA_LOG_UNKNOWN: 1758248571Smm rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain, 1759248571Smm NULL, DS_FIND_CHILDREN) != 0); 1760248571Smm if (rv) 1761219089Spjd spa_set_log_state(spa, SPA_LOG_MISSING); 1762185029Spjd break; 1763185029Spjd } 1764248571Smm return (rv); 1765185029Spjd} 1766185029Spjd 1767219089Spjdstatic boolean_t 1768219089Spjdspa_passivate_log(spa_t *spa) 1769219089Spjd{ 1770219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1771219089Spjd boolean_t slog_found = B_FALSE; 1772219089Spjd 1773219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1774219089Spjd 1775219089Spjd if (!spa_has_slogs(spa)) 1776219089Spjd return (B_FALSE); 1777219089Spjd 1778219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1779219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1780219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1781219089Spjd 1782219089Spjd if (tvd->vdev_islog) { 1783219089Spjd metaslab_group_passivate(mg); 1784219089Spjd slog_found = B_TRUE; 1785219089Spjd } 1786219089Spjd } 1787219089Spjd 1788219089Spjd return (slog_found); 1789219089Spjd} 1790219089Spjd 1791219089Spjdstatic void 1792219089Spjdspa_activate_log(spa_t *spa) 1793219089Spjd{ 1794219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1795219089Spjd 1796219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1797219089Spjd 1798219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1799219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1800219089Spjd metaslab_group_t *mg = 
tvd->vdev_mg; 1801219089Spjd 1802219089Spjd if (tvd->vdev_islog) 1803219089Spjd metaslab_group_activate(mg); 1804219089Spjd } 1805219089Spjd} 1806219089Spjd 1807219089Spjdint 1808219089Spjdspa_offline_log(spa_t *spa) 1809219089Spjd{ 1810248571Smm int error; 1811219089Spjd 1812248571Smm error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1813248571Smm NULL, DS_FIND_CHILDREN); 1814248571Smm if (error == 0) { 1815219089Spjd /* 1816219089Spjd * We successfully offlined the log device, sync out the 1817219089Spjd * current txg so that the "stubby" block can be removed 1818219089Spjd * by zil_sync(). 1819219089Spjd */ 1820219089Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1821219089Spjd } 1822219089Spjd return (error); 1823219089Spjd} 1824219089Spjd 1825219089Spjdstatic void 1826219089Spjdspa_aux_check_removed(spa_aux_vdev_t *sav) 1827219089Spjd{ 1828219089Spjd int i; 1829219089Spjd 1830219089Spjd for (i = 0; i < sav->sav_count; i++) 1831219089Spjd spa_check_removed(sav->sav_vdevs[i]); 1832219089Spjd} 1833219089Spjd 1834219089Spjdvoid 1835219089Spjdspa_claim_notify(zio_t *zio) 1836219089Spjd{ 1837219089Spjd spa_t *spa = zio->io_spa; 1838219089Spjd 1839219089Spjd if (zio->io_error) 1840219089Spjd return; 1841219089Spjd 1842219089Spjd mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1843219089Spjd if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1844219089Spjd spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1845219089Spjd mutex_exit(&spa->spa_props_lock); 1846219089Spjd} 1847219089Spjd 1848219089Spjdtypedef struct spa_load_error { 1849219089Spjd uint64_t sle_meta_count; 1850219089Spjd uint64_t sle_data_count; 1851219089Spjd} spa_load_error_t; 1852219089Spjd 1853219089Spjdstatic void 1854219089Spjdspa_load_verify_done(zio_t *zio) 1855219089Spjd{ 1856219089Spjd blkptr_t *bp = zio->io_bp; 1857219089Spjd spa_load_error_t *sle = zio->io_private; 1858219089Spjd dmu_object_type_t type = BP_GET_TYPE(bp); 1859219089Spjd int error = zio->io_error; 1860219089Spjd 
1861219089Spjd if (error) { 1862236884Smm if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 1863219089Spjd type != DMU_OT_INTENT_LOG) 1864219089Spjd atomic_add_64(&sle->sle_meta_count, 1); 1865219089Spjd else 1866219089Spjd atomic_add_64(&sle->sle_data_count, 1); 1867219089Spjd } 1868219089Spjd zio_data_buf_free(zio->io_data, zio->io_size); 1869219089Spjd} 1870219089Spjd 1871219089Spjd/*ARGSUSED*/ 1872219089Spjdstatic int 1873219089Spjdspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1874246666Smm const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1875219089Spjd{ 1876219089Spjd if (bp != NULL) { 1877219089Spjd zio_t *rio = arg; 1878219089Spjd size_t size = BP_GET_PSIZE(bp); 1879219089Spjd void *data = zio_data_buf_alloc(size); 1880219089Spjd 1881219089Spjd zio_nowait(zio_read(rio, spa, bp, data, size, 1882219089Spjd spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1883219089Spjd ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1884219089Spjd ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1885219089Spjd } 1886219089Spjd return (0); 1887219089Spjd} 1888219089Spjd 1889219089Spjdstatic int 1890219089Spjdspa_load_verify(spa_t *spa) 1891219089Spjd{ 1892219089Spjd zio_t *rio; 1893219089Spjd spa_load_error_t sle = { 0 }; 1894219089Spjd zpool_rewind_policy_t policy; 1895219089Spjd boolean_t verify_ok = B_FALSE; 1896219089Spjd int error; 1897219089Spjd 1898219089Spjd zpool_get_rewind_policy(spa->spa_config, &policy); 1899219089Spjd 1900219089Spjd if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1901219089Spjd return (0); 1902219089Spjd 1903219089Spjd rio = zio_root(spa, NULL, &sle, 1904219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1905219089Spjd 1906219089Spjd error = traverse_pool(spa, spa->spa_verify_min_txg, 1907219089Spjd TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1908219089Spjd 1909219089Spjd (void) zio_wait(rio); 1910219089Spjd 1911219089Spjd spa->spa_load_meta_errors = sle.sle_meta_count; 1912219089Spjd 
spa->spa_load_data_errors = sle.sle_data_count; 1913219089Spjd 1914219089Spjd if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1915219089Spjd sle.sle_data_count <= policy.zrp_maxdata) { 1916219089Spjd int64_t loss = 0; 1917219089Spjd 1918219089Spjd verify_ok = B_TRUE; 1919219089Spjd spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1920219089Spjd spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1921219089Spjd 1922219089Spjd loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1923219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1924219089Spjd ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1925219089Spjd VERIFY(nvlist_add_int64(spa->spa_load_info, 1926219089Spjd ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1927219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1928219089Spjd ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1929219089Spjd } else { 1930219089Spjd spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1931219089Spjd } 1932219089Spjd 1933219089Spjd if (error) { 1934219089Spjd if (error != ENXIO && error != EIO) 1935249195Smm error = SET_ERROR(EIO); 1936219089Spjd return (error); 1937219089Spjd } 1938219089Spjd 1939219089Spjd return (verify_ok ? 0 : EIO); 1940219089Spjd} 1941219089Spjd 1942185029Spjd/* 1943219089Spjd * Find a value in the pool props object. 1944168404Spjd */ 1945219089Spjdstatic void 1946219089Spjdspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1947219089Spjd{ 1948219089Spjd (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1949219089Spjd zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1950219089Spjd} 1951219089Spjd 1952219089Spjd/* 1953219089Spjd * Find a value in the pool directory object. 
1954219089Spjd */ 1955168404Spjdstatic int 1956219089Spjdspa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1957168404Spjd{ 1958219089Spjd return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1959219089Spjd name, sizeof (uint64_t), 1, val)); 1960219089Spjd} 1961168404Spjd 1962219089Spjdstatic int 1963219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1964219089Spjd{ 1965219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1966219089Spjd return (err); 1967219089Spjd} 1968219089Spjd 1969219089Spjd/* 1970219089Spjd * Fix up config after a partly-completed split. This is done with the 1971219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1972219089Spjd * pool have that entry in their config, but only the splitting one contains 1973219089Spjd * a list of all the guids of the vdevs that are being split off. 1974219089Spjd * 1975219089Spjd * This function determines what to do with that list: either rejoin 1976219089Spjd * all the disks to the pool, or complete the splitting process. To attempt 1977219089Spjd * the rejoin, each disk that is offlined is marked online again, and 1978219089Spjd * we do a reopen() call. If the vdev label for every disk that was 1979219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1980219089Spjd * then we call vdev_split() on each disk, and complete the split. 1981219089Spjd * 1982219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 1983219089Spjd * the original pool. 
1984219089Spjd */ 1985219089Spjdstatic void 1986219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 1987219089Spjd{ 1988219089Spjd uint_t extracted; 1989219089Spjd uint64_t *glist; 1990219089Spjd uint_t i, gcount; 1991219089Spjd nvlist_t *nvl; 1992219089Spjd vdev_t **vd; 1993219089Spjd boolean_t attempt_reopen; 1994219089Spjd 1995219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 1996219089Spjd return; 1997219089Spjd 1998219089Spjd /* check that the config is complete */ 1999219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2000219089Spjd &glist, &gcount) != 0) 2001219089Spjd return; 2002219089Spjd 2003219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2004219089Spjd 2005219089Spjd /* attempt to online all the vdevs & validate */ 2006219089Spjd attempt_reopen = B_TRUE; 2007219089Spjd for (i = 0; i < gcount; i++) { 2008219089Spjd if (glist[i] == 0) /* vdev is hole */ 2009219089Spjd continue; 2010219089Spjd 2011219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2012219089Spjd if (vd[i] == NULL) { 2013219089Spjd /* 2014219089Spjd * Don't bother attempting to reopen the disks; 2015219089Spjd * just do the split. 
2016219089Spjd */ 2017219089Spjd attempt_reopen = B_FALSE; 2018219089Spjd } else { 2019219089Spjd /* attempt to re-online it */ 2020219089Spjd vd[i]->vdev_offline = B_FALSE; 2021219089Spjd } 2022219089Spjd } 2023219089Spjd 2024219089Spjd if (attempt_reopen) { 2025219089Spjd vdev_reopen(spa->spa_root_vdev); 2026219089Spjd 2027219089Spjd /* check each device to see what state it's in */ 2028219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 2029219089Spjd if (vd[i] != NULL && 2030219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2031219089Spjd break; 2032219089Spjd ++extracted; 2033219089Spjd } 2034219089Spjd } 2035219089Spjd 2036209962Smm /* 2037219089Spjd * If every disk has been moved to the new pool, or if we never 2038219089Spjd * even attempted to look at them, then we split them off for 2039219089Spjd * good. 2040209962Smm */ 2041219089Spjd if (!attempt_reopen || gcount == extracted) { 2042219089Spjd for (i = 0; i < gcount; i++) 2043219089Spjd if (vd[i] != NULL) 2044219089Spjd vdev_split(vd[i]); 2045219089Spjd vdev_reopen(spa->spa_root_vdev); 2046219089Spjd } 2047209962Smm 2048219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 2049219089Spjd} 2050185029Spjd 2051219089Spjdstatic int 2052219089Spjdspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2053219089Spjd boolean_t mosconfig) 2054219089Spjd{ 2055219089Spjd nvlist_t *config = spa->spa_config; 2056219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 2057228103Smm char *comment; 2058219089Spjd int error; 2059219089Spjd uint64_t pool_guid; 2060219089Spjd nvlist_t *nvl; 2061168404Spjd 2062219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2063249195Smm return (SET_ERROR(EINVAL)); 2064168404Spjd 2065228103Smm ASSERT(spa->spa_comment == NULL); 2066228103Smm if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2067228103Smm spa->spa_comment = spa_strdup(comment); 2068228103Smm 2069168404Spjd /* 2070168404Spjd * Versioning wasn't 
explicitly added to the label until later, so if 2071168404Spjd * it's not present treat it as the initial version. 2072168404Spjd */ 2073219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2074219089Spjd &spa->spa_ubsync.ub_version) != 0) 2075219089Spjd spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2076168404Spjd 2077168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2078168404Spjd &spa->spa_config_txg); 2079168404Spjd 2080168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2081168404Spjd spa_guid_exists(pool_guid, 0)) { 2082249195Smm error = SET_ERROR(EEXIST); 2083219089Spjd } else { 2084228103Smm spa->spa_config_guid = pool_guid; 2085219089Spjd 2086219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2087219089Spjd &nvl) == 0) { 2088219089Spjd VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2089219089Spjd KM_SLEEP) == 0); 2090219089Spjd } 2091219089Spjd 2092236884Smm nvlist_free(spa->spa_load_info); 2093236884Smm spa->spa_load_info = fnvlist_alloc(); 2094236884Smm 2095219089Spjd gethrestime(&spa->spa_loaded_ts); 2096219089Spjd error = spa_load_impl(spa, pool_guid, config, state, type, 2097219089Spjd mosconfig, &ereport); 2098168404Spjd } 2099168404Spjd 2100219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 2101219089Spjd if (error) { 2102219089Spjd if (error != EEXIST) { 2103219089Spjd spa->spa_loaded_ts.tv_sec = 0; 2104219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 2105219089Spjd } 2106219089Spjd if (error != EBADF) { 2107219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2108219089Spjd } 2109219089Spjd } 2110219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2111219089Spjd spa->spa_ena = 0; 2112168404Spjd 2113219089Spjd return (error); 2114219089Spjd} 2115219089Spjd 2116219089Spjd/* 2117219089Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 2118219089Spjd * source of configuration information. 
2119219089Spjd */ 2120219089Spjdstatic int 2121219089Spjdspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2122219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2123219089Spjd char **ereport) 2124219089Spjd{ 2125219089Spjd int error = 0; 2126219089Spjd nvlist_t *nvroot = NULL; 2127236884Smm nvlist_t *label; 2128219089Spjd vdev_t *rvd; 2129219089Spjd uberblock_t *ub = &spa->spa_uberblock; 2130219089Spjd uint64_t children, config_cache_txg = spa->spa_config_txg; 2131219089Spjd int orig_mode = spa->spa_mode; 2132219089Spjd int parse; 2133219089Spjd uint64_t obj; 2134236884Smm boolean_t missing_feat_write = B_FALSE; 2135219089Spjd 2136168404Spjd /* 2137219089Spjd * If this is an untrusted config, access the pool in read-only mode. 2138219089Spjd * This prevents things like resilvering recently removed devices. 2139219089Spjd */ 2140219089Spjd if (!mosconfig) 2141219089Spjd spa->spa_mode = FREAD; 2142219089Spjd 2143219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2144219089Spjd 2145219089Spjd spa->spa_load_state = state; 2146219089Spjd 2147219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2148249195Smm return (SET_ERROR(EINVAL)); 2149219089Spjd 2150219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 2151219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2152219089Spjd 2153219089Spjd /* 2154209962Smm * Create "The Godfather" zio to hold all async IOs 2155209962Smm */ 2156209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2157209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2158209962Smm 2159209962Smm /* 2160168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2161168404Spjd * value that will be returned by spa_version() since parsing the 2162168404Spjd * configuration requires knowing the version number. 
2163168404Spjd */ 2164185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2165219089Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2166185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2167168404Spjd 2168168404Spjd if (error != 0) 2169219089Spjd return (error); 2170168404Spjd 2171168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2172168404Spjd 2173219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2174219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2175219089Spjd } 2176219089Spjd 2177168404Spjd /* 2178168404Spjd * Try to open all vdevs, loading each label in the process. 2179168404Spjd */ 2180185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2181168926Spjd error = vdev_open(rvd); 2182185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2183168926Spjd if (error != 0) 2184219089Spjd return (error); 2185168404Spjd 2186168404Spjd /* 2187209962Smm * We need to validate the vdev labels against the configuration that 2188209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 2189209962Smm * mosconfig is true then we're validating the vdev labels based on 2190219089Spjd * that config. Otherwise, we're validating against the cached config 2191209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 2192209962Smm * later we will recursively call spa_load() and validate against 2193209962Smm * the vdev config. 2194219089Spjd * 2195219089Spjd * If we're assembling a new pool that's been split off from an 2196219089Spjd * existing pool, the labels haven't yet been updated so we skip 2197219089Spjd * validation for now. 
2198168404Spjd */ 2199219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2200219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2201230514Smm error = vdev_validate(rvd, mosconfig); 2202219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2203168404Spjd 2204219089Spjd if (error != 0) 2205219089Spjd return (error); 2206219089Spjd 2207219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2208249195Smm return (SET_ERROR(ENXIO)); 2209168404Spjd } 2210168404Spjd 2211168404Spjd /* 2212168404Spjd * Find the best uberblock. 2213168404Spjd */ 2214236884Smm vdev_uberblock_load(rvd, ub, &label); 2215168404Spjd 2216168404Spjd /* 2217168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2218168404Spjd */ 2219236884Smm if (ub->ub_txg == 0) { 2220236884Smm nvlist_free(label); 2221219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2222236884Smm } 2223168404Spjd 2224168404Spjd /* 2225236884Smm * If the pool has an unsupported version we can't open it. 2226168404Spjd */ 2227236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2228236884Smm nvlist_free(label); 2229219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2230236884Smm } 2231168404Spjd 2232236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2233236884Smm nvlist_t *features; 2234236884Smm 2235236884Smm /* 2236236884Smm * If we weren't able to find what's necessary for reading the 2237236884Smm * MOS in the label, return failure. 2238236884Smm */ 2239236884Smm if (label == NULL || nvlist_lookup_nvlist(label, 2240236884Smm ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2241236884Smm nvlist_free(label); 2242236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2243236884Smm ENXIO)); 2244236884Smm } 2245236884Smm 2246236884Smm /* 2247236884Smm * Update our in-core representation with the definitive values 2248236884Smm * from the label. 
2249236884Smm */ 2250236884Smm nvlist_free(spa->spa_label_features); 2251236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2252236884Smm } 2253236884Smm 2254236884Smm nvlist_free(label); 2255236884Smm 2256168404Spjd /* 2257236884Smm * Look through entries in the label nvlist's features_for_read. If 2258236884Smm * there is a feature listed there which we don't understand then we 2259236884Smm * cannot open a pool. 2260236884Smm */ 2261236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2262236884Smm nvlist_t *unsup_feat; 2263236884Smm 2264236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2265236884Smm 0); 2266236884Smm 2267236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2268236884Smm NULL); nvp != NULL; 2269236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2270236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2271236884Smm VERIFY(nvlist_add_string(unsup_feat, 2272236884Smm nvpair_name(nvp), "") == 0); 2273236884Smm } 2274236884Smm } 2275236884Smm 2276236884Smm if (!nvlist_empty(unsup_feat)) { 2277236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2278236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2279236884Smm nvlist_free(unsup_feat); 2280236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2281236884Smm ENOTSUP)); 2282236884Smm } 2283236884Smm 2284236884Smm nvlist_free(unsup_feat); 2285236884Smm } 2286236884Smm 2287236884Smm /* 2288168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 2289219089Spjd * incomplete configuration. We first check to see if the pool 2290219089Spjd * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2291219089Spjd * If it is, defer the vdev_guid_sum check till later so we 2292219089Spjd * can handle missing vdevs. 
2293168404Spjd */ 2294219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2295219089Spjd &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2296219089Spjd rvd->vdev_guid_sum != ub->ub_guid_sum) 2297219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2298219089Spjd 2299219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2300219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2301219089Spjd spa_try_repair(spa, config); 2302219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2303219089Spjd nvlist_free(spa->spa_config_splitting); 2304219089Spjd spa->spa_config_splitting = NULL; 2305168404Spjd } 2306168404Spjd 2307168404Spjd /* 2308168404Spjd * Initialize internal SPA structures. 2309168404Spjd */ 2310168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 2311168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2312219089Spjd spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2313219089Spjd TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2314219089Spjd spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2315219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2316219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2317219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2318219089Spjd 2319236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2320219089Spjd if (error) 2321219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2322168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2323168404Spjd 2324219089Spjd if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2325219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2326168404Spjd 2327236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2328236884Smm boolean_t missing_feat_read = B_FALSE; 2329238926Smm nvlist_t *unsup_feat, *enabled_feat; 2330236884Smm 2331236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2332236884Smm &spa->spa_feat_for_read_obj) != 0) { 2333236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2334236884Smm } 2335236884Smm 2336236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2337236884Smm &spa->spa_feat_for_write_obj) != 0) { 2338236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2339236884Smm } 2340236884Smm 2341236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2342236884Smm &spa->spa_feat_desc_obj) != 0) { 2343236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2344236884Smm } 2345236884Smm 2346238926Smm enabled_feat = fnvlist_alloc(); 2347238926Smm unsup_feat = fnvlist_alloc(); 2348236884Smm 2349236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2350236884Smm spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, 2351238926Smm unsup_feat, enabled_feat)) 2352236884Smm missing_feat_read = B_TRUE; 2353236884Smm 2354236884Smm if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2355236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2356236884Smm spa->spa_feat_for_write_obj, 
spa->spa_feat_desc_obj, 2357238926Smm unsup_feat, enabled_feat)) { 2358236884Smm missing_feat_write = B_TRUE; 2359238926Smm } 2360236884Smm } 2361236884Smm 2362238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2363238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2364238926Smm 2365236884Smm if (!nvlist_empty(unsup_feat)) { 2366238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2367238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2368236884Smm } 2369236884Smm 2370238926Smm fnvlist_free(enabled_feat); 2371238926Smm fnvlist_free(unsup_feat); 2372236884Smm 2373236884Smm if (!missing_feat_read) { 2374236884Smm fnvlist_add_boolean(spa->spa_load_info, 2375236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2376236884Smm } 2377236884Smm 2378236884Smm /* 2379236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2380236884Smm * twofold: to determine whether the pool is available for 2381236884Smm * import in read-write mode and (if it is not) whether the 2382236884Smm * pool is available for import in read-only mode. If the pool 2383236884Smm * is available for import in read-write mode, it is displayed 2384236884Smm * as available in userland; if it is not available for import 2385236884Smm * in read-only mode, it is displayed as unavailable in 2386236884Smm * userland. If the pool is available for import in read-only 2387236884Smm * mode but not read-write mode, it is displayed as unavailable 2388236884Smm * in userland with a special note that the pool is actually 2389236884Smm * available for open in read-only mode. 2390236884Smm * 2391236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2392236884Smm * missing a feature for write, we must first determine whether 2393236884Smm * the pool can be opened read-only before returning to 2394236884Smm * userland in order to know whether to display the 2395236884Smm * abovementioned note. 
2396236884Smm */ 2397236884Smm if (missing_feat_read || (missing_feat_write && 2398236884Smm spa_writeable(spa))) { 2399236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2400236884Smm ENOTSUP)); 2401236884Smm } 2402236884Smm } 2403236884Smm 2404236884Smm spa->spa_is_initializing = B_TRUE; 2405236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2406236884Smm spa->spa_is_initializing = B_FALSE; 2407236884Smm if (error != 0) 2408236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2409236884Smm 2410168404Spjd if (!mosconfig) { 2411168498Spjd uint64_t hostid; 2412219089Spjd nvlist_t *policy = NULL, *nvconfig; 2413168404Spjd 2414219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2415219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2416168404Spjd 2417219089Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2418185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2419168498Spjd char *hostname; 2420168498Spjd unsigned long myhostid = 0; 2421168498Spjd 2422219089Spjd VERIFY(nvlist_lookup_string(nvconfig, 2423168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2424168498Spjd 2425219089Spjd#ifdef _KERNEL 2426219089Spjd myhostid = zone_get_hostid(NULL); 2427219089Spjd#else /* _KERNEL */ 2428219089Spjd /* 2429219089Spjd * We're emulating the system's hostid in userland, so 2430219089Spjd * we can't use zone_get_hostid(). 2431219089Spjd */ 2432168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2433219089Spjd#endif /* _KERNEL */ 2434204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 2435219089Spjd hostid != myhostid) { 2436219089Spjd nvlist_free(nvconfig); 2437168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 2438168498Spjd "loaded as it was last accessed by " 2439185029Spjd "another system (host: %s hostid: 0x%lx). 
" 2440236146Smm "See: http://illumos.org/msg/ZFS-8000-EY", 2441185029Spjd spa_name(spa), hostname, 2442168498Spjd (unsigned long)hostid); 2443249195Smm return (SET_ERROR(EBADF)); 2444168498Spjd } 2445168498Spjd } 2446219089Spjd if (nvlist_lookup_nvlist(spa->spa_config, 2447219089Spjd ZPOOL_REWIND_POLICY, &policy) == 0) 2448219089Spjd VERIFY(nvlist_add_nvlist(nvconfig, 2449219089Spjd ZPOOL_REWIND_POLICY, policy) == 0); 2450168498Spjd 2451219089Spjd spa_config_set(spa, nvconfig); 2452168404Spjd spa_unload(spa); 2453168404Spjd spa_deactivate(spa); 2454209962Smm spa_activate(spa, orig_mode); 2455168404Spjd 2456219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2457168404Spjd } 2458168404Spjd 2459219089Spjd if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2460219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2461219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2462219089Spjd if (error != 0) 2463219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2464168404Spjd 2465168404Spjd /* 2466168404Spjd * Load the bit that tells us to use the new accounting function 2467168404Spjd * (raid-z deflation). If we have an older pool, this will not 2468168404Spjd * be present. 2469168404Spjd */ 2470219089Spjd error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2471219089Spjd if (error != 0 && error != ENOENT) 2472219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2473168404Spjd 2474219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2475219089Spjd &spa->spa_creation_version); 2476219089Spjd if (error != 0 && error != ENOENT) 2477219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2478219089Spjd 2479168404Spjd /* 2480168404Spjd * Load the persistent error log. If we have an older pool, this will 2481168404Spjd * not be present. 
2482168404Spjd */ 2483219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2484219089Spjd if (error != 0 && error != ENOENT) 2485219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2486168404Spjd 2487219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2488219089Spjd &spa->spa_errlog_scrub); 2489219089Spjd if (error != 0 && error != ENOENT) 2490219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2491168404Spjd 2492168404Spjd /* 2493168404Spjd * Load the history object. If we have an older pool, this 2494168404Spjd * will not be present. 2495168404Spjd */ 2496219089Spjd error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2497219089Spjd if (error != 0 && error != ENOENT) 2498219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2499168404Spjd 2500168404Spjd /* 2501219089Spjd * If we're assembling the pool from the split-off vdevs of 2502219089Spjd * an existing pool, we don't want to attach the spares & cache 2503219089Spjd * devices. 2504219089Spjd */ 2505219089Spjd 2506219089Spjd /* 2507168404Spjd * Load any hot spares for this pool. 
2508168404Spjd */ 2509219089Spjd error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2510219089Spjd if (error != 0 && error != ENOENT) 2511219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2512219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2513185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2514185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 2515219089Spjd &spa->spa_spares.sav_config) != 0) 2516219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2517168404Spjd 2518185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2519168404Spjd spa_load_spares(spa); 2520185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2521219089Spjd } else if (error == 0) { 2522219089Spjd spa->spa_spares.sav_sync = B_TRUE; 2523168404Spjd } 2524168404Spjd 2525185029Spjd /* 2526185029Spjd * Load any level 2 ARC devices for this pool. 2527185029Spjd */ 2528219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2529185029Spjd &spa->spa_l2cache.sav_object); 2530219089Spjd if (error != 0 && error != ENOENT) 2531219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2532219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2533185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2534185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2535219089Spjd &spa->spa_l2cache.sav_config) != 0) 2536219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2537185029Spjd 2538185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2539185029Spjd spa_load_l2cache(spa); 2540185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2541219089Spjd } else if (error == 0) { 2542219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2543185029Spjd } 2544185029Spjd 2545219089Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2546213197Smm 2547219089Spjd error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2548219089Spjd if (error && error != ENOENT) 
2549219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2550185029Spjd 2551219089Spjd if (error == 0) { 2552219089Spjd uint64_t autoreplace; 2553185029Spjd 2554219089Spjd spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2555219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2556219089Spjd spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2557219089Spjd spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2558219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2559219089Spjd spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2560219089Spjd &spa->spa_dedup_ditto); 2561185029Spjd 2562219089Spjd spa->spa_autoreplace = (autoreplace != 0); 2563168404Spjd } 2564168404Spjd 2565168404Spjd /* 2566185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 2567185029Spjd * the ZFS DE that it should not issue any faults for unopenable 2568185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 2569185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 2570185029Spjd * over. 2571185029Spjd */ 2572219089Spjd if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2573185029Spjd spa_check_removed(spa->spa_root_vdev); 2574219089Spjd /* 2575219089Spjd * For the import case, this is done in spa_import(), because 2576219089Spjd * at this point we're using the spare definitions from 2577219089Spjd * the MOS config, not necessarily from the userland config. 2578219089Spjd */ 2579219089Spjd if (state != SPA_LOAD_IMPORT) { 2580219089Spjd spa_aux_check_removed(&spa->spa_spares); 2581219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 2582219089Spjd } 2583219089Spjd } 2584185029Spjd 2585185029Spjd /* 2586168404Spjd * Load the vdev state for all toplevel vdevs. 2587168404Spjd */ 2588168404Spjd vdev_load(rvd); 2589168404Spjd 2590168404Spjd /* 2591168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 
2592168404Spjd */ 2593185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2594168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2595185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2596168404Spjd 2597168404Spjd /* 2598219089Spjd * Load the DDTs (dedup tables). 2599168404Spjd */ 2600219089Spjd error = ddt_load(spa); 2601219089Spjd if (error != 0) 2602219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2603219089Spjd 2604219089Spjd spa_update_dspace(spa); 2605219089Spjd 2606219089Spjd /* 2607219089Spjd * Validate the config, using the MOS config to fill in any 2608219089Spjd * information which might be missing. If we fail to validate 2609219089Spjd * the config then declare the pool unfit for use. If we're 2610219089Spjd * assembling a pool from a split, the log is not transferred 2611219089Spjd * over. 2612219089Spjd */ 2613219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2614219089Spjd nvlist_t *nvconfig; 2615219089Spjd 2616219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2617219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2618219089Spjd 2619219089Spjd if (!spa_config_valid(spa, nvconfig)) { 2620219089Spjd nvlist_free(nvconfig); 2621219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2622219089Spjd ENXIO)); 2623219089Spjd } 2624219089Spjd nvlist_free(nvconfig); 2625219089Spjd 2626219089Spjd /* 2627236884Smm * Now that we've validated the config, check the state of the 2628219089Spjd * root vdev. If it can't be opened, it indicates one or 2629219089Spjd * more toplevel vdevs are faulted. 
2630219089Spjd */ 2631219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2632249195Smm return (SET_ERROR(ENXIO)); 2633219089Spjd 2634219089Spjd if (spa_check_logs(spa)) { 2635219089Spjd *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2636219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2637219089Spjd } 2638168404Spjd } 2639168404Spjd 2640236884Smm if (missing_feat_write) { 2641236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 2642236884Smm 2643236884Smm /* 2644236884Smm * At this point, we know that we can open the pool in 2645236884Smm * read-only mode but not read-write mode. We now have enough 2646236884Smm * information and can return to userland. 2647236884Smm */ 2648236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2649236884Smm } 2650236884Smm 2651219089Spjd /* 2652219089Spjd * We've successfully opened the pool, verify that we're ready 2653219089Spjd * to start pushing transactions. 2654219089Spjd */ 2655219089Spjd if (state != SPA_LOAD_TRYIMPORT) { 2656219089Spjd if (error = spa_load_verify(spa)) 2657219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2658219089Spjd error)); 2659219089Spjd } 2660219089Spjd 2661219089Spjd if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2662219089Spjd spa->spa_load_max_txg == UINT64_MAX)) { 2663168404Spjd dmu_tx_t *tx; 2664168404Spjd int need_update = B_FALSE; 2665168404Spjd 2666209962Smm ASSERT(state != SPA_LOAD_TRYIMPORT); 2667209962Smm 2668168404Spjd /* 2669168404Spjd * Claim log blocks that haven't been committed yet. 2670168404Spjd * This must all happen in a single txg. 2671219089Spjd * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2672219089Spjd * invoked from zil_claim_log_block()'s i/o done callback. 2673219089Spjd * Price of rollback is that we abandon the log. 
2674168404Spjd */ 2675219089Spjd spa->spa_claiming = B_TRUE; 2676219089Spjd 2677168404Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2678168404Spjd spa_first_txg(spa)); 2679185029Spjd (void) dmu_objset_find(spa_name(spa), 2680168404Spjd zil_claim, tx, DS_FIND_CHILDREN); 2681168404Spjd dmu_tx_commit(tx); 2682168404Spjd 2683219089Spjd spa->spa_claiming = B_FALSE; 2684219089Spjd 2685219089Spjd spa_set_log_state(spa, SPA_LOG_GOOD); 2686168404Spjd spa->spa_sync_on = B_TRUE; 2687168404Spjd txg_sync_start(spa->spa_dsl_pool); 2688168404Spjd 2689168404Spjd /* 2690219089Spjd * Wait for all claims to sync. We sync up to the highest 2691219089Spjd * claimed log block birth time so that claimed log blocks 2692219089Spjd * don't appear to be from the future. spa_claim_max_txg 2693219089Spjd * will have been set for us by either zil_check_log_chain() 2694219089Spjd * (invoked from spa_check_logs()) or zil_claim() above. 2695168404Spjd */ 2696219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2697168404Spjd 2698168404Spjd /* 2699168404Spjd * If the config cache is stale, or we have uninitialized 2700168404Spjd * metaslabs (see spa_vdev_add()), then update the config. 2701209962Smm * 2702219089Spjd * If this is a verbatim import, trust the current 2703209962Smm * in-core spa_config and update the disk labels. 2704168404Spjd */ 2705168404Spjd if (config_cache_txg != spa->spa_config_txg || 2706219089Spjd state == SPA_LOAD_IMPORT || 2707219089Spjd state == SPA_LOAD_RECOVER || 2708219089Spjd (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2709168404Spjd need_update = B_TRUE; 2710168404Spjd 2711209962Smm for (int c = 0; c < rvd->vdev_children; c++) 2712168404Spjd if (rvd->vdev_child[c]->vdev_ms_array == 0) 2713168404Spjd need_update = B_TRUE; 2714168404Spjd 2715168404Spjd /* 2716168404Spjd * Update the config cache asychronously in case we're the 2717168404Spjd * root pool, in which case the config cache isn't writable yet. 
2718168404Spjd */ 2719168404Spjd if (need_update) 2720168404Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2721208683Spjd 2722208683Spjd /* 2723208683Spjd * Check all DTLs to see if anything needs resilvering. 2724208683Spjd */ 2725219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2726219089Spjd vdev_resilver_needed(rvd, NULL, NULL)) 2727208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 2728219089Spjd 2729219089Spjd /* 2730248571Smm * Log the fact that we booted up (so that we can detect if 2731248571Smm * we rebooted in the middle of an operation). 2732248571Smm */ 2733248571Smm spa_history_log_version(spa, "open"); 2734248571Smm 2735248571Smm /* 2736219089Spjd * Delete any inconsistent datasets. 2737219089Spjd */ 2738219089Spjd (void) dmu_objset_find(spa_name(spa), 2739219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2740219089Spjd 2741219089Spjd /* 2742219089Spjd * Clean up any stale temporary dataset userrefs. 2743219089Spjd */ 2744219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2745168404Spjd } 2746168404Spjd 2747219089Spjd return (0); 2748219089Spjd} 2749168404Spjd 2750219089Spjdstatic int 2751219089Spjdspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2752219089Spjd{ 2753219089Spjd int mode = spa->spa_mode; 2754219089Spjd 2755219089Spjd spa_unload(spa); 2756219089Spjd spa_deactivate(spa); 2757219089Spjd 2758219089Spjd spa->spa_load_max_txg--; 2759219089Spjd 2760219089Spjd spa_activate(spa, mode); 2761219089Spjd spa_async_suspend(spa); 2762219089Spjd 2763219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2764168404Spjd} 2765168404Spjd 2766236884Smm/* 2767236884Smm * If spa_load() fails this function will try loading prior txg's. If 2768236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2769236884Smm * will be rewound to that txg. 
If 'state' is not SPA_LOAD_RECOVER this 2770236884Smm * function will not rewind the pool and will return the same error as 2771236884Smm * spa_load(). 2772236884Smm */ 2773219089Spjdstatic int 2774219089Spjdspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2775219089Spjd uint64_t max_request, int rewind_flags) 2776219089Spjd{ 2777236884Smm nvlist_t *loadinfo = NULL; 2778219089Spjd nvlist_t *config = NULL; 2779219089Spjd int load_error, rewind_error; 2780219089Spjd uint64_t safe_rewind_txg; 2781219089Spjd uint64_t min_txg; 2782219089Spjd 2783219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2784219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 2785219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2786219089Spjd } else { 2787219089Spjd spa->spa_load_max_txg = max_request; 2788219089Spjd } 2789219089Spjd 2790219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2791219089Spjd mosconfig); 2792219089Spjd if (load_error == 0) 2793219089Spjd return (0); 2794219089Spjd 2795219089Spjd if (spa->spa_root_vdev != NULL) 2796219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2797219089Spjd 2798219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2799219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2800219089Spjd 2801219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 2802219089Spjd nvlist_free(config); 2803219089Spjd return (load_error); 2804219089Spjd } 2805219089Spjd 2806236884Smm if (state == SPA_LOAD_RECOVER) { 2807236884Smm /* Price of rolling back is discarding txgs, including log */ 2808219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2809236884Smm } else { 2810236884Smm /* 2811236884Smm * If we aren't rolling back save the load info from our first 2812236884Smm * import attempt so that we can restore it after attempting 2813236884Smm * to rewind. 
2814236884Smm */ 2815236884Smm loadinfo = spa->spa_load_info; 2816236884Smm spa->spa_load_info = fnvlist_alloc(); 2817236884Smm } 2818219089Spjd 2819219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2820219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2821219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 2822219089Spjd TXG_INITIAL : safe_rewind_txg; 2823219089Spjd 2824219089Spjd /* 2825219089Spjd * Continue as long as we're finding errors, we're still within 2826219089Spjd * the acceptable rewind range, and we're still finding uberblocks 2827219089Spjd */ 2828219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2829219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2830219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 2831219089Spjd spa->spa_extreme_rewind = B_TRUE; 2832219089Spjd rewind_error = spa_load_retry(spa, state, mosconfig); 2833219089Spjd } 2834219089Spjd 2835219089Spjd spa->spa_extreme_rewind = B_FALSE; 2836219089Spjd spa->spa_load_max_txg = UINT64_MAX; 2837219089Spjd 2838219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2839219089Spjd spa_config_set(spa, config); 2840219089Spjd 2841236884Smm if (state == SPA_LOAD_RECOVER) { 2842236884Smm ASSERT3P(loadinfo, ==, NULL); 2843236884Smm return (rewind_error); 2844236884Smm } else { 2845236884Smm /* Store the rewind info as part of the initial load info */ 2846236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2847236884Smm spa->spa_load_info); 2848236884Smm 2849236884Smm /* Restore the initial load info */ 2850236884Smm fnvlist_free(spa->spa_load_info); 2851236884Smm spa->spa_load_info = loadinfo; 2852236884Smm 2853236884Smm return (load_error); 2854236884Smm } 2855219089Spjd} 2856219089Spjd 2857168404Spjd/* 2858168404Spjd * Pool Open/Import 2859168404Spjd * 2860168404Spjd * The import case is identical to an open except that the configuration is sent 2861168404Spjd * down from userland, 
instead of grabbed from the configuration cache. For the 2862168404Spjd * case of an open, the pool configuration will exist in the 2863185029Spjd * POOL_STATE_UNINITIALIZED state. 2864168404Spjd * 2865168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 2866168404Spjd * the same time open the pool, without having to keep around the spa_t in some 2867168404Spjd * ambiguous state. 2868168404Spjd */ 2869168404Spjdstatic int 2870219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2871219089Spjd nvlist_t **config) 2872168404Spjd{ 2873168404Spjd spa_t *spa; 2874219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 2875168404Spjd int error; 2876168404Spjd int locked = B_FALSE; 2877219089Spjd int firstopen = B_FALSE; 2878168404Spjd 2879168404Spjd *spapp = NULL; 2880168404Spjd 2881168404Spjd /* 2882168404Spjd * As disgusting as this is, we need to support recursive calls to this 2883168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 2884168404Spjd * up calling spa_open() again. The real fix is to figure out how to 2885168404Spjd * avoid dsl_dir_open() calling this in the first place. 2886168404Spjd */ 2887168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 2888168404Spjd mutex_enter(&spa_namespace_lock); 2889168404Spjd locked = B_TRUE; 2890168404Spjd } 2891168404Spjd 2892168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 2893168404Spjd if (locked) 2894168404Spjd mutex_exit(&spa_namespace_lock); 2895249195Smm return (SET_ERROR(ENOENT)); 2896168404Spjd } 2897219089Spjd 2898168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2899219089Spjd zpool_rewind_policy_t policy; 2900168404Spjd 2901219089Spjd firstopen = B_TRUE; 2902219089Spjd 2903219089Spjd zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 2904219089Spjd &policy); 2905219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 2906219089Spjd state = SPA_LOAD_RECOVER; 2907219089Spjd 2908209962Smm spa_activate(spa, spa_mode_global); 2909168404Spjd 2910219089Spjd if (state != SPA_LOAD_RECOVER) 2911219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2912168404Spjd 2913219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2914219089Spjd policy.zrp_request); 2915219089Spjd 2916168404Spjd if (error == EBADF) { 2917168404Spjd /* 2918168404Spjd * If vdev_validate() returns failure (indicated by 2919168404Spjd * EBADF), it indicates that one of the vdevs indicates 2920168404Spjd * that the pool has been exported or destroyed. If 2921168404Spjd * this is the case, the config cache is out of sync and 2922168404Spjd * we should remove the pool from the namespace. 2923168404Spjd */ 2924168404Spjd spa_unload(spa); 2925168404Spjd spa_deactivate(spa); 2926185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE); 2927168404Spjd spa_remove(spa); 2928168404Spjd if (locked) 2929168404Spjd mutex_exit(&spa_namespace_lock); 2930249195Smm return (SET_ERROR(ENOENT)); 2931168404Spjd } 2932168404Spjd 2933168404Spjd if (error) { 2934168404Spjd /* 2935168404Spjd * We can't open the pool, but we still have useful 2936168404Spjd * information: the state of each vdev after the 2937168404Spjd * attempted vdev_open(). Return this to the user. 
2938168404Spjd */ 2939219089Spjd if (config != NULL && spa->spa_config) { 2940219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 2941219089Spjd KM_SLEEP) == 0); 2942219089Spjd VERIFY(nvlist_add_nvlist(*config, 2943219089Spjd ZPOOL_CONFIG_LOAD_INFO, 2944219089Spjd spa->spa_load_info) == 0); 2945219089Spjd } 2946168404Spjd spa_unload(spa); 2947168404Spjd spa_deactivate(spa); 2948219089Spjd spa->spa_last_open_failed = error; 2949168404Spjd if (locked) 2950168404Spjd mutex_exit(&spa_namespace_lock); 2951168404Spjd *spapp = NULL; 2952168404Spjd return (error); 2953168404Spjd } 2954168404Spjd } 2955168404Spjd 2956168404Spjd spa_open_ref(spa, tag); 2957185029Spjd 2958219089Spjd if (config != NULL) 2959219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2960219089Spjd 2961219089Spjd /* 2962219089Spjd * If we've recovered the pool, pass back any information we 2963219089Spjd * gathered while doing the load. 2964219089Spjd */ 2965219089Spjd if (state == SPA_LOAD_RECOVER) { 2966219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2967219089Spjd spa->spa_load_info) == 0); 2968219089Spjd } 2969219089Spjd 2970219089Spjd if (locked) { 2971219089Spjd spa->spa_last_open_failed = 0; 2972219089Spjd spa->spa_last_ubsync_txg = 0; 2973219089Spjd spa->spa_load_txg = 0; 2974168404Spjd mutex_exit(&spa_namespace_lock); 2975219089Spjd#ifdef __FreeBSD__ 2976219089Spjd#ifdef _KERNEL 2977219089Spjd if (firstopen) 2978249047Savg zvol_create_minors(spa->spa_name); 2979219089Spjd#endif 2980219089Spjd#endif 2981219089Spjd } 2982168404Spjd 2983168404Spjd *spapp = spa; 2984168404Spjd 2985168404Spjd return (0); 2986168404Spjd} 2987168404Spjd 2988168404Spjdint 2989219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2990219089Spjd nvlist_t **config) 2991219089Spjd{ 2992219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 2993219089Spjd} 2994219089Spjd 2995219089Spjdint 2996168404Spjdspa_open(const char *name, spa_t 
**spapp, void *tag) 2997168404Spjd{ 2998219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 2999168404Spjd} 3000168404Spjd 3001168404Spjd/* 3002168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 3003168404Spjd * preventing it from being exported or destroyed. 3004168404Spjd */ 3005168404Spjdspa_t * 3006168404Spjdspa_inject_addref(char *name) 3007168404Spjd{ 3008168404Spjd spa_t *spa; 3009168404Spjd 3010168404Spjd mutex_enter(&spa_namespace_lock); 3011168404Spjd if ((spa = spa_lookup(name)) == NULL) { 3012168404Spjd mutex_exit(&spa_namespace_lock); 3013168404Spjd return (NULL); 3014168404Spjd } 3015168404Spjd spa->spa_inject_ref++; 3016168404Spjd mutex_exit(&spa_namespace_lock); 3017168404Spjd 3018168404Spjd return (spa); 3019168404Spjd} 3020168404Spjd 3021168404Spjdvoid 3022168404Spjdspa_inject_delref(spa_t *spa) 3023168404Spjd{ 3024168404Spjd mutex_enter(&spa_namespace_lock); 3025168404Spjd spa->spa_inject_ref--; 3026168404Spjd mutex_exit(&spa_namespace_lock); 3027168404Spjd} 3028168404Spjd 3029185029Spjd/* 3030185029Spjd * Add spares device information to the nvlist. 
3031185029Spjd */ 3032168404Spjdstatic void 3033168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 3034168404Spjd{ 3035168404Spjd nvlist_t **spares; 3036168404Spjd uint_t i, nspares; 3037168404Spjd nvlist_t *nvroot; 3038168404Spjd uint64_t guid; 3039168404Spjd vdev_stat_t *vs; 3040168404Spjd uint_t vsc; 3041168404Spjd uint64_t pool; 3042168404Spjd 3043209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3044209962Smm 3045185029Spjd if (spa->spa_spares.sav_count == 0) 3046168404Spjd return; 3047168404Spjd 3048168404Spjd VERIFY(nvlist_lookup_nvlist(config, 3049168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3050185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3051168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3052168404Spjd if (nspares != 0) { 3053168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3054168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3055168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3056168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3057168404Spjd 3058168404Spjd /* 3059168404Spjd * Go through and find any spares which have since been 3060168404Spjd * repurposed as an active spare. If this is the case, update 3061168404Spjd * their status appropriately. 3062168404Spjd */ 3063168404Spjd for (i = 0; i < nspares; i++) { 3064168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 3065168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3066185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 3067185029Spjd pool != 0ULL) { 3068168404Spjd VERIFY(nvlist_lookup_uint64_array( 3069219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 3070168404Spjd (uint64_t **)&vs, &vsc) == 0); 3071168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 3072168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 3073168404Spjd } 3074168404Spjd } 3075168404Spjd } 3076168404Spjd} 3077168404Spjd 3078185029Spjd/* 3079185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 
3080185029Spjd */ 3081185029Spjdstatic void 3082185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 3083185029Spjd{ 3084185029Spjd nvlist_t **l2cache; 3085185029Spjd uint_t i, j, nl2cache; 3086185029Spjd nvlist_t *nvroot; 3087185029Spjd uint64_t guid; 3088185029Spjd vdev_t *vd; 3089185029Spjd vdev_stat_t *vs; 3090185029Spjd uint_t vsc; 3091185029Spjd 3092209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3093209962Smm 3094185029Spjd if (spa->spa_l2cache.sav_count == 0) 3095185029Spjd return; 3096185029Spjd 3097185029Spjd VERIFY(nvlist_lookup_nvlist(config, 3098185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3099185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3100185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3101185029Spjd if (nl2cache != 0) { 3102185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3103185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3104185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3105185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3106185029Spjd 3107185029Spjd /* 3108185029Spjd * Update level 2 cache device stats. 
3109185029Spjd */ 3110185029Spjd 3111185029Spjd for (i = 0; i < nl2cache; i++) { 3112185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 3113185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3114185029Spjd 3115185029Spjd vd = NULL; 3116185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3117185029Spjd if (guid == 3118185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3119185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 3120185029Spjd break; 3121185029Spjd } 3122185029Spjd } 3123185029Spjd ASSERT(vd != NULL); 3124185029Spjd 3125185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3126219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3127219089Spjd == 0); 3128185029Spjd vdev_get_stats(vd, vs); 3129185029Spjd } 3130185029Spjd } 3131185029Spjd} 3132185029Spjd 3133236884Smmstatic void 3134236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 3135236884Smm{ 3136236884Smm nvlist_t *features; 3137236884Smm zap_cursor_t zc; 3138236884Smm zap_attribute_t za; 3139236884Smm 3140236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3141236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3142236884Smm 3143253993Smav /* We may be unable to read features if pool is suspended. 
*/ 3144253993Smav if (spa_suspended(spa)) 3145253993Smav goto out; 3146253993Smav 3147236884Smm if (spa->spa_feat_for_read_obj != 0) { 3148236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3149236884Smm spa->spa_feat_for_read_obj); 3150236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3151236884Smm zap_cursor_advance(&zc)) { 3152236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3153236884Smm za.za_num_integers == 1); 3154236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3155236884Smm za.za_first_integer)); 3156236884Smm } 3157236884Smm zap_cursor_fini(&zc); 3158236884Smm } 3159236884Smm 3160236884Smm if (spa->spa_feat_for_write_obj != 0) { 3161236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3162236884Smm spa->spa_feat_for_write_obj); 3163236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3164236884Smm zap_cursor_advance(&zc)) { 3165236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3166236884Smm za.za_num_integers == 1); 3167236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3168236884Smm za.za_first_integer)); 3169236884Smm } 3170236884Smm zap_cursor_fini(&zc); 3171236884Smm } 3172236884Smm 3173253993Smavout: 3174236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3175236884Smm features) == 0); 3176236884Smm nvlist_free(features); 3177236884Smm} 3178236884Smm 3179168404Spjdint 3180236884Smmspa_get_stats(const char *name, nvlist_t **config, 3181236884Smm char *altroot, size_t buflen) 3182168404Spjd{ 3183168404Spjd int error; 3184168404Spjd spa_t *spa; 3185168404Spjd 3186168404Spjd *config = NULL; 3187219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 3188168404Spjd 3189209962Smm if (spa != NULL) { 3190209962Smm /* 3191209962Smm * This still leaves a window of inconsistency where the spares 3192209962Smm * or l2cache devices could change and the config would be 3193209962Smm * self-inconsistent. 
3194209962Smm */ 3195209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3196168404Spjd 3197209962Smm if (*config != NULL) { 3198219089Spjd uint64_t loadtimes[2]; 3199219089Spjd 3200219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3201219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3202219089Spjd VERIFY(nvlist_add_uint64_array(*config, 3203219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3204219089Spjd 3205185029Spjd VERIFY(nvlist_add_uint64(*config, 3206209962Smm ZPOOL_CONFIG_ERRCOUNT, 3207209962Smm spa_get_errlog_size(spa)) == 0); 3208185029Spjd 3209209962Smm if (spa_suspended(spa)) 3210209962Smm VERIFY(nvlist_add_uint64(*config, 3211209962Smm ZPOOL_CONFIG_SUSPENDED, 3212209962Smm spa->spa_failmode) == 0); 3213209962Smm 3214209962Smm spa_add_spares(spa, *config); 3215209962Smm spa_add_l2cache(spa, *config); 3216236884Smm spa_add_feature_stats(spa, *config); 3217209962Smm } 3218168404Spjd } 3219168404Spjd 3220168404Spjd /* 3221168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 3222168404Spjd * and call spa_lookup() directly. 3223168404Spjd */ 3224168404Spjd if (altroot) { 3225168404Spjd if (spa == NULL) { 3226168404Spjd mutex_enter(&spa_namespace_lock); 3227168404Spjd spa = spa_lookup(name); 3228168404Spjd if (spa) 3229168404Spjd spa_altroot(spa, altroot, buflen); 3230168404Spjd else 3231168404Spjd altroot[0] = '\0'; 3232168404Spjd spa = NULL; 3233168404Spjd mutex_exit(&spa_namespace_lock); 3234168404Spjd } else { 3235168404Spjd spa_altroot(spa, altroot, buflen); 3236168404Spjd } 3237168404Spjd } 3238168404Spjd 3239209962Smm if (spa != NULL) { 3240209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 3241168404Spjd spa_close(spa, FTAG); 3242209962Smm } 3243168404Spjd 3244168404Spjd return (error); 3245168404Spjd} 3246168404Spjd 3247168404Spjd/* 3248185029Spjd * Validate that the auxiliary device array is well formed. We must have an 3249185029Spjd * array of nvlists, each which describes a valid leaf vdev. 
If this is an 3250185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3251185029Spjd * specified, as long as they are well-formed. 3252168404Spjd */ 3253168404Spjdstatic int 3254185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3255185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 3256185029Spjd vdev_labeltype_t label) 3257168404Spjd{ 3258185029Spjd nvlist_t **dev; 3259185029Spjd uint_t i, ndev; 3260168404Spjd vdev_t *vd; 3261168404Spjd int error; 3262168404Spjd 3263185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3264185029Spjd 3265168404Spjd /* 3266185029Spjd * It's acceptable to have no devs specified. 3267168404Spjd */ 3268185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3269168404Spjd return (0); 3270168404Spjd 3271185029Spjd if (ndev == 0) 3272249195Smm return (SET_ERROR(EINVAL)); 3273168404Spjd 3274168404Spjd /* 3275185029Spjd * Make sure the pool is formatted with a version that supports this 3276185029Spjd * device type. 3277168404Spjd */ 3278185029Spjd if (spa_version(spa) < version) 3279249195Smm return (SET_ERROR(ENOTSUP)); 3280168404Spjd 3281168404Spjd /* 3282185029Spjd * Set the pending device list so we correctly handle device in-use 3283168404Spjd * checking. 3284168404Spjd */ 3285185029Spjd sav->sav_pending = dev; 3286185029Spjd sav->sav_npending = ndev; 3287168404Spjd 3288185029Spjd for (i = 0; i < ndev; i++) { 3289185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3290168404Spjd mode)) != 0) 3291168404Spjd goto out; 3292168404Spjd 3293168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 3294168404Spjd vdev_free(vd); 3295249195Smm error = SET_ERROR(EINVAL); 3296168404Spjd goto out; 3297168404Spjd } 3298168404Spjd 3299185029Spjd /* 3300185029Spjd * The L2ARC currently only supports disk devices in 3301185029Spjd * kernel context. For user-level testing, we allow it. 
3302185029Spjd */ 3303185029Spjd#ifdef _KERNEL 3304185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3305185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3306249195Smm error = SET_ERROR(ENOTBLK); 3307230514Smm vdev_free(vd); 3308185029Spjd goto out; 3309185029Spjd } 3310185029Spjd#endif 3311168404Spjd vd->vdev_top = vd; 3312168404Spjd 3313168404Spjd if ((error = vdev_open(vd)) == 0 && 3314185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 3315185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3316168404Spjd vd->vdev_guid) == 0); 3317168404Spjd } 3318168404Spjd 3319168404Spjd vdev_free(vd); 3320168404Spjd 3321185029Spjd if (error && 3322185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3323168404Spjd goto out; 3324168404Spjd else 3325168404Spjd error = 0; 3326168404Spjd } 3327168404Spjd 3328168404Spjdout: 3329185029Spjd sav->sav_pending = NULL; 3330185029Spjd sav->sav_npending = 0; 3331168404Spjd return (error); 3332168404Spjd} 3333168404Spjd 3334185029Spjdstatic int 3335185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3336185029Spjd{ 3337185029Spjd int error; 3338185029Spjd 3339185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3340185029Spjd 3341185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3342185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3343185029Spjd VDEV_LABEL_SPARE)) != 0) { 3344185029Spjd return (error); 3345185029Spjd } 3346185029Spjd 3347185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3348185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3349185029Spjd VDEV_LABEL_L2CACHE)); 3350185029Spjd} 3351185029Spjd 3352185029Spjdstatic void 3353185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3354185029Spjd const char *config) 3355185029Spjd{ 3356185029Spjd int i; 3357185029Spjd 3358185029Spjd if (sav->sav_config != 
NULL) { 3359185029Spjd nvlist_t **olddevs; 3360185029Spjd uint_t oldndevs; 3361185029Spjd nvlist_t **newdevs; 3362185029Spjd 3363185029Spjd /* 3364185029Spjd * Generate new dev list by concatentating with the 3365185029Spjd * current dev list. 3366185029Spjd */ 3367185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3368185029Spjd &olddevs, &oldndevs) == 0); 3369185029Spjd 3370185029Spjd newdevs = kmem_alloc(sizeof (void *) * 3371185029Spjd (ndevs + oldndevs), KM_SLEEP); 3372185029Spjd for (i = 0; i < oldndevs; i++) 3373185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3374185029Spjd KM_SLEEP) == 0); 3375185029Spjd for (i = 0; i < ndevs; i++) 3376185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3377185029Spjd KM_SLEEP) == 0); 3378185029Spjd 3379185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 3380185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 3381185029Spjd 3382185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3383185029Spjd config, newdevs, ndevs + oldndevs) == 0); 3384185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 3385185029Spjd nvlist_free(newdevs[i]); 3386185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3387185029Spjd } else { 3388185029Spjd /* 3389185029Spjd * Generate a new dev list. 
3390185029Spjd */ 3391185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3392185029Spjd KM_SLEEP) == 0); 3393185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3394185029Spjd devs, ndevs) == 0); 3395185029Spjd } 3396185029Spjd} 3397185029Spjd 3398168404Spjd/* 3399185029Spjd * Stop and drop level 2 ARC devices 3400185029Spjd */ 3401185029Spjdvoid 3402185029Spjdspa_l2cache_drop(spa_t *spa) 3403185029Spjd{ 3404185029Spjd vdev_t *vd; 3405185029Spjd int i; 3406185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 3407185029Spjd 3408185029Spjd for (i = 0; i < sav->sav_count; i++) { 3409185029Spjd uint64_t pool; 3410185029Spjd 3411185029Spjd vd = sav->sav_vdevs[i]; 3412185029Spjd ASSERT(vd != NULL); 3413185029Spjd 3414209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3415209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 3416185029Spjd l2arc_remove_vdev(vd); 3417185029Spjd } 3418185029Spjd} 3419185029Spjd 3420185029Spjd/* 3421168404Spjd * Pool Creation 3422168404Spjd */ 3423168404Spjdint 3424185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3425248571Smm nvlist_t *zplprops) 3426168404Spjd{ 3427168404Spjd spa_t *spa; 3428185029Spjd char *altroot = NULL; 3429168404Spjd vdev_t *rvd; 3430168404Spjd dsl_pool_t *dp; 3431168404Spjd dmu_tx_t *tx; 3432219089Spjd int error = 0; 3433168404Spjd uint64_t txg = TXG_INITIAL; 3434185029Spjd nvlist_t **spares, **l2cache; 3435185029Spjd uint_t nspares, nl2cache; 3436219089Spjd uint64_t version, obj; 3437236884Smm boolean_t has_features; 3438168404Spjd 3439168404Spjd /* 3440168404Spjd * If this pool already exists, return failure. 3441168404Spjd */ 3442168404Spjd mutex_enter(&spa_namespace_lock); 3443168404Spjd if (spa_lookup(pool) != NULL) { 3444168404Spjd mutex_exit(&spa_namespace_lock); 3445249195Smm return (SET_ERROR(EEXIST)); 3446168404Spjd } 3447168404Spjd 3448168404Spjd /* 3449168404Spjd * Allocate a new spa_t structure. 
3450168404Spjd */ 3451185029Spjd (void) nvlist_lookup_string(props, 3452185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3453219089Spjd spa = spa_add(pool, NULL, altroot); 3454209962Smm spa_activate(spa, spa_mode_global); 3455168404Spjd 3456185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 3457185029Spjd spa_deactivate(spa); 3458185029Spjd spa_remove(spa); 3459185029Spjd mutex_exit(&spa_namespace_lock); 3460185029Spjd return (error); 3461185029Spjd } 3462185029Spjd 3463236884Smm has_features = B_FALSE; 3464236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3465236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3466236884Smm if (zpool_prop_feature(nvpair_name(elem))) 3467236884Smm has_features = B_TRUE; 3468236884Smm } 3469236884Smm 3470236884Smm if (has_features || nvlist_lookup_uint64(props, 3471236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3472185029Spjd version = SPA_VERSION; 3473236884Smm } 3474236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3475219089Spjd 3476219089Spjd spa->spa_first_txg = txg; 3477219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 3478185029Spjd spa->spa_uberblock.ub_version = version; 3479168404Spjd spa->spa_ubsync = spa->spa_uberblock; 3480168404Spjd 3481168404Spjd /* 3482209962Smm * Create "The Godfather" zio to hold all async IOs 3483209962Smm */ 3484209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 3485209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 3486209962Smm 3487209962Smm /* 3488168404Spjd * Create the root vdev. 
3489168404Spjd */ 3490185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3491168404Spjd 3492168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3493168404Spjd 3494168404Spjd ASSERT(error != 0 || rvd != NULL); 3495168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3496168404Spjd 3497185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 3498249195Smm error = SET_ERROR(EINVAL); 3499168404Spjd 3500168404Spjd if (error == 0 && 3501168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3502185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 3503168404Spjd VDEV_ALLOC_ADD)) == 0) { 3504219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 3505254591Sgibbs vdev_ashift_optimize(rvd->vdev_child[c]); 3506219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 3507219089Spjd vdev_expand(rvd->vdev_child[c], txg); 3508219089Spjd } 3509168404Spjd } 3510168404Spjd 3511185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3512168404Spjd 3513168404Spjd if (error != 0) { 3514168404Spjd spa_unload(spa); 3515168404Spjd spa_deactivate(spa); 3516168404Spjd spa_remove(spa); 3517168404Spjd mutex_exit(&spa_namespace_lock); 3518168404Spjd return (error); 3519168404Spjd } 3520168404Spjd 3521168404Spjd /* 3522168404Spjd * Get the list of spares, if specified. 
3523168404Spjd */ 3524168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3525168404Spjd &spares, &nspares) == 0) { 3526185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3527168404Spjd KM_SLEEP) == 0); 3528185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3529168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3530185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3531168404Spjd spa_load_spares(spa); 3532185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3533185029Spjd spa->spa_spares.sav_sync = B_TRUE; 3534168404Spjd } 3535168404Spjd 3536185029Spjd /* 3537185029Spjd * Get the list of level 2 cache devices, if specified. 3538185029Spjd */ 3539185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3540185029Spjd &l2cache, &nl2cache) == 0) { 3541185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3542185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 3543185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3544185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3545185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3546185029Spjd spa_load_l2cache(spa); 3547185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3548185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3549185029Spjd } 3550185029Spjd 3551236884Smm spa->spa_is_initializing = B_TRUE; 3552185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3553168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 3554236884Smm spa->spa_is_initializing = B_FALSE; 3555168404Spjd 3556219089Spjd /* 3557219089Spjd * Create DDTs (dedup tables). 3558219089Spjd */ 3559219089Spjd ddt_create(spa); 3560219089Spjd 3561219089Spjd spa_update_dspace(spa); 3562219089Spjd 3563168404Spjd tx = dmu_tx_create_assigned(dp, txg); 3564168404Spjd 3565168404Spjd /* 3566168404Spjd * Create the pool config object. 
3567168404Spjd */ 3568168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3569185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3570168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3571168404Spjd 3572168404Spjd if (zap_add(spa->spa_meta_objset, 3573168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3574168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3575168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 3576168404Spjd } 3577168404Spjd 3578236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 3579236884Smm spa_feature_create_zap_objects(spa, tx); 3580236884Smm 3581219089Spjd if (zap_add(spa->spa_meta_objset, 3582219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3583219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 3584219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 3585219089Spjd } 3586219089Spjd 3587185029Spjd /* Newly created pools with the right version are always deflated. */ 3588185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3589185029Spjd spa->spa_deflate = TRUE; 3590185029Spjd if (zap_add(spa->spa_meta_objset, 3591185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3592185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3593185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 3594185029Spjd } 3595168404Spjd } 3596168404Spjd 3597168404Spjd /* 3598219089Spjd * Create the deferred-free bpobj. Turn off compression 3599168404Spjd * because sync-to-convergence takes longer if the blocksize 3600168404Spjd * keeps changing. 
3601168404Spjd */ 3602219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3603219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj, 3604168404Spjd ZIO_COMPRESS_OFF, tx); 3605168404Spjd if (zap_add(spa->spa_meta_objset, 3606219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3607219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) { 3608219089Spjd cmn_err(CE_PANIC, "failed to add bpobj"); 3609168404Spjd } 3610219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3611219089Spjd spa->spa_meta_objset, obj)); 3612168404Spjd 3613168404Spjd /* 3614168404Spjd * Create the pool's history object. 3615168404Spjd */ 3616185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 3617185029Spjd spa_history_create_obj(spa, tx); 3618168404Spjd 3619185029Spjd /* 3620185029Spjd * Set pool properties. 3621185029Spjd */ 3622185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3623185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3624185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3625219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3626219089Spjd 3627209962Smm if (props != NULL) { 3628209962Smm spa_configfile_set(spa, props, B_FALSE); 3629248571Smm spa_sync_props(props, tx); 3630209962Smm } 3631185029Spjd 3632168404Spjd dmu_tx_commit(tx); 3633168404Spjd 3634168404Spjd spa->spa_sync_on = B_TRUE; 3635168404Spjd txg_sync_start(spa->spa_dsl_pool); 3636168404Spjd 3637168404Spjd /* 3638168404Spjd * We explicitly wait for the first transaction to complete so that our 3639168404Spjd * bean counters are appropriately updated. 
3640168404Spjd */ 3641168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 3642168404Spjd 3643185029Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 3644168404Spjd 3645248571Smm spa_history_log_version(spa, "create"); 3646185029Spjd 3647208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 3648208442Smm 3649168404Spjd mutex_exit(&spa_namespace_lock); 3650168404Spjd 3651168404Spjd return (0); 3652168404Spjd} 3653168404Spjd 3654241286Savg#ifdef _KERNEL 3655219089Spjd#if defined(sun) 3656185029Spjd/* 3657219089Spjd * Get the root pool information from the root disk, then import the root pool 3658219089Spjd * during the system boot up time. 3659185029Spjd */ 3660219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3661219089Spjd 3662219089Spjdstatic nvlist_t * 3663219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3664185029Spjd{ 3665219089Spjd nvlist_t *config; 3666185029Spjd nvlist_t *nvtop, *nvroot; 3667185029Spjd uint64_t pgid; 3668185029Spjd 3669219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3670219089Spjd return (NULL); 3671219089Spjd 3672168404Spjd /* 3673185029Spjd * Add this top-level vdev to the child array. 3674168404Spjd */ 3675219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3676219089Spjd &nvtop) == 0); 3677219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3678219089Spjd &pgid) == 0); 3679219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3680168404Spjd 3681185029Spjd /* 3682185029Spjd * Put this pool's top-level vdevs into a root vdev. 
3683185029Spjd */ 3684185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3685219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3686219089Spjd VDEV_TYPE_ROOT) == 0); 3687185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3688185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3689185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3690185029Spjd &nvtop, 1) == 0); 3691168404Spjd 3692168404Spjd /* 3693185029Spjd * Replace the existing vdev_tree with the new root vdev in 3694185029Spjd * this pool's configuration (remove the old, add the new). 3695168404Spjd */ 3696185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3697185029Spjd nvlist_free(nvroot); 3698219089Spjd return (config); 3699185029Spjd} 3700168404Spjd 3701185029Spjd/* 3702219089Spjd * Walk the vdev tree and see if we can find a device with "better" 3703219089Spjd * configuration. A configuration is "better" if the label on that 3704219089Spjd * device has a more recent txg. 3705185029Spjd */ 3706219089Spjdstatic void 3707219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3708185029Spjd{ 3709219089Spjd for (int c = 0; c < vd->vdev_children; c++) 3710219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3711185029Spjd 3712219089Spjd if (vd->vdev_ops->vdev_op_leaf) { 3713219089Spjd nvlist_t *label; 3714219089Spjd uint64_t label_txg; 3715185029Spjd 3716219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3717219089Spjd &label) != 0) 3718219089Spjd return; 3719185029Spjd 3720219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3721219089Spjd &label_txg) == 0); 3722168404Spjd 3723219089Spjd /* 3724219089Spjd * Do we have a better boot device? 
3725219089Spjd */ 3726219089Spjd if (label_txg > *txg) { 3727219089Spjd *txg = label_txg; 3728219089Spjd *avd = vd; 3729185029Spjd } 3730219089Spjd nvlist_free(label); 3731185029Spjd } 3732185029Spjd} 3733185029Spjd 3734185029Spjd/* 3735185029Spjd * Import a root pool. 3736185029Spjd * 3737185029Spjd * For x86. devpath_list will consist of devid and/or physpath name of 3738185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3739185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 3740185029Spjd * 3741185029Spjd * For Sparc, devpath_list consists the physpath name of the booting device 3742185029Spjd * no matter the rootpool is a single device pool or a mirrored pool. 3743185029Spjd * e.g. 3744185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 3745185029Spjd */ 3746185029Spjdint 3747185029Spjdspa_import_rootpool(char *devpath, char *devid) 3748185029Spjd{ 3749219089Spjd spa_t *spa; 3750219089Spjd vdev_t *rvd, *bvd, *avd = NULL; 3751219089Spjd nvlist_t *config, *nvtop; 3752219089Spjd uint64_t guid, txg; 3753185029Spjd char *pname; 3754185029Spjd int error; 3755185029Spjd 3756185029Spjd /* 3757219089Spjd * Read the label from the boot device and generate a configuration. 
3758185029Spjd */ 3759219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3760219089Spjd#if defined(_OBP) && defined(_KERNEL) 3761219089Spjd if (config == NULL) { 3762219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) { 3763219089Spjd /* iscsi boot */ 3764219089Spjd get_iscsi_bootpath_phy(devpath); 3765219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3766219089Spjd } 3767219089Spjd } 3768219089Spjd#endif 3769219089Spjd if (config == NULL) { 3770236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3771219089Spjd devpath); 3772249195Smm return (SET_ERROR(EIO)); 3773219089Spjd } 3774185029Spjd 3775219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3776219089Spjd &pname) == 0); 3777219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3778185029Spjd 3779209962Smm mutex_enter(&spa_namespace_lock); 3780209962Smm if ((spa = spa_lookup(pname)) != NULL) { 3781209962Smm /* 3782209962Smm * Remove the existing root pool from the namespace so that we 3783209962Smm * can replace it with the correct config we just read in. 3784209962Smm */ 3785209962Smm spa_remove(spa); 3786209962Smm } 3787185029Spjd 3788219089Spjd spa = spa_add(pname, config, NULL); 3789209962Smm spa->spa_is_root = B_TRUE; 3790219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3791209962Smm 3792219089Spjd /* 3793219089Spjd * Build up a vdev tree based on the boot device's label config. 
3794219089Spjd */ 3795219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3796219089Spjd &nvtop) == 0); 3797219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3798219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3799219089Spjd VDEV_ALLOC_ROOTPOOL); 3800219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3801219089Spjd if (error) { 3802209962Smm mutex_exit(&spa_namespace_lock); 3803219089Spjd nvlist_free(config); 3804219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3805219089Spjd pname); 3806219089Spjd return (error); 3807209962Smm } 3808209962Smm 3809219089Spjd /* 3810219089Spjd * Get the boot vdev. 3811219089Spjd */ 3812219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3813219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3814219089Spjd (u_longlong_t)guid); 3815249195Smm error = SET_ERROR(ENOENT); 3816219089Spjd goto out; 3817219089Spjd } 3818209962Smm 3819219089Spjd /* 3820219089Spjd * Determine if there is a better boot device. 3821219089Spjd */ 3822219089Spjd avd = bvd; 3823219089Spjd spa_alt_rootvdev(rvd, &avd, &txg); 3824219089Spjd if (avd != bvd) { 3825219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3826219089Spjd "try booting from '%s'", avd->vdev_path); 3827249195Smm error = SET_ERROR(EINVAL); 3828219089Spjd goto out; 3829219089Spjd } 3830209962Smm 3831219089Spjd /* 3832219089Spjd * If the boot device is part of a spare vdev then ensure that 3833219089Spjd * we're booting off the active spare. 3834219089Spjd */ 3835219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3836219089Spjd !bvd->vdev_isspare) { 3837219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3838219089Spjd "try booting from '%s'", 3839219089Spjd bvd->vdev_parent-> 3840219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3841249195Smm error = SET_ERROR(EINVAL); 3842219089Spjd goto out; 3843219089Spjd } 3844209962Smm 3845219089Spjd error = 0; 3846219089Spjdout: 3847219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3848219089Spjd vdev_free(rvd); 3849219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3850209962Smm mutex_exit(&spa_namespace_lock); 3851209962Smm 3852219089Spjd nvlist_free(config); 3853219089Spjd return (error); 3854185029Spjd} 3855185029Spjd 3856241286Savg#else 3857241286Savg 3858243502Savgextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 3859243502Savg uint64_t *count); 3860241286Savg 3861241286Savgstatic nvlist_t * 3862241286Savgspa_generate_rootconf(const char *name) 3863241286Savg{ 3864243502Savg nvlist_t **configs, **tops; 3865241286Savg nvlist_t *config; 3866243502Savg nvlist_t *best_cfg, *nvtop, *nvroot; 3867243502Savg uint64_t *holes; 3868243502Savg uint64_t best_txg; 3869243213Savg uint64_t nchildren; 3870241286Savg uint64_t pgid; 3871243502Savg uint64_t count; 3872243502Savg uint64_t i; 3873243502Savg uint_t nholes; 3874241286Savg 3875243502Savg if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 3876241286Savg return (NULL); 3877241286Savg 3878243502Savg ASSERT3U(count, !=, 0); 3879243502Savg best_txg = 0; 3880243502Savg for (i = 0; i < count; i++) { 3881243502Savg uint64_t txg; 3882243502Savg 3883243502Savg VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 3884243502Savg &txg) == 0); 3885243502Savg if (txg > best_txg) { 3886243502Savg best_txg = txg; 3887243502Savg best_cfg = configs[i]; 3888243502Savg } 3889243502Savg } 3890243502Savg 3891241286Savg /* 3892243213Savg * Multi-vdev root pool configuration discovery is not supported yet. 
3893243213Savg */ 3894245945Savg nchildren = 1; 3895245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 3896243502Savg holes = NULL; 3897243502Savg nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 3898243502Savg &holes, &nholes); 3899243502Savg 3900244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 3901243502Savg for (i = 0; i < nchildren; i++) { 3902243502Savg if (i >= count) 3903243502Savg break; 3904243502Savg if (configs[i] == NULL) 3905243502Savg continue; 3906243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 3907243502Savg &nvtop) == 0); 3908243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP); 3909243213Savg } 3910243502Savg for (i = 0; holes != NULL && i < nholes; i++) { 3911243502Savg if (i >= nchildren) 3912243502Savg continue; 3913243502Savg if (tops[holes[i]] != NULL) 3914243502Savg continue; 3915243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 3916243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 3917243502Savg VDEV_TYPE_HOLE) == 0); 3918243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 3919243502Savg holes[i]) == 0); 3920243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 3921243502Savg 0) == 0); 3922243502Savg } 3923243502Savg for (i = 0; i < nchildren; i++) { 3924243502Savg if (tops[i] != NULL) 3925243502Savg continue; 3926243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 3927243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 3928243502Savg VDEV_TYPE_MISSING) == 0); 3929243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 3930243502Savg i) == 0); 3931243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 3932243502Savg 0) == 0); 3933243502Savg } 3934243213Savg 3935243213Savg /* 3936243502Savg * Create pool config based on the best vdev config. 
3937241286Savg */ 3938243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 3939241286Savg 3940241286Savg /* 3941241286Savg * Put this pool's top-level vdevs into a root vdev. 3942241286Savg */ 3943243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3944243502Savg &pgid) == 0); 3945241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3946241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3947241286Savg VDEV_TYPE_ROOT) == 0); 3948241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3949241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3950241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3951243502Savg tops, nchildren) == 0); 3952241286Savg 3953241286Savg /* 3954241286Savg * Replace the existing vdev_tree with the new root vdev in 3955241286Savg * this pool's configuration (remove the old, add the new). 3956241286Savg */ 3957241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3958243502Savg 3959243502Savg /* 3960243502Savg * Drop vdev config elements that should not be present at pool level. 
3961243502Savg */ 3962243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 3963243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 3964243502Savg 3965243502Savg for (i = 0; i < count; i++) 3966243502Savg nvlist_free(configs[i]); 3967243502Savg kmem_free(configs, count * sizeof(void *)); 3968243502Savg for (i = 0; i < nchildren; i++) 3969243502Savg nvlist_free(tops[i]); 3970243502Savg kmem_free(tops, nchildren * sizeof(void *)); 3971241286Savg nvlist_free(nvroot); 3972241286Savg return (config); 3973241286Savg} 3974241286Savg 3975241286Savgint 3976241286Savgspa_import_rootpool(const char *name) 3977241286Savg{ 3978241286Savg spa_t *spa; 3979241286Savg vdev_t *rvd, *bvd, *avd = NULL; 3980241286Savg nvlist_t *config, *nvtop; 3981241286Savg uint64_t txg; 3982241286Savg char *pname; 3983241286Savg int error; 3984241286Savg 3985241286Savg /* 3986241286Savg * Read the label from the boot device and generate a configuration. 3987241286Savg */ 3988241286Savg config = spa_generate_rootconf(name); 3989243213Savg 3990243213Savg mutex_enter(&spa_namespace_lock); 3991243213Savg if (config != NULL) { 3992243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3993243213Savg &pname) == 0 && strcmp(name, pname) == 0); 3994243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 3995243213Savg == 0); 3996243213Savg 3997243213Savg if ((spa = spa_lookup(pname)) != NULL) { 3998243213Savg /* 3999243213Savg * Remove the existing root pool from the namespace so 4000243213Savg * that we can replace it with the correct config 4001243213Savg * we just read in. 4002243213Savg */ 4003243213Savg spa_remove(spa); 4004243213Savg } 4005243213Savg spa = spa_add(pname, config, NULL); 4006243501Savg 4007243501Savg /* 4008243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 4009243501Savg * via spa_version(). 
4010243501Savg */ 4011243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4012243501Savg &spa->spa_ubsync.ub_version) != 0) 4013243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4014243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 4015241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 4016241286Savg name); 4017241286Savg return (EIO); 4018243213Savg } else { 4019243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 4020241286Savg } 4021241286Savg spa->spa_is_root = B_TRUE; 4022241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4023241286Savg 4024241286Savg /* 4025241286Savg * Build up a vdev tree based on the boot device's label config. 4026241286Savg */ 4027241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4028241286Savg &nvtop) == 0); 4029241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4030241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4031241286Savg VDEV_ALLOC_ROOTPOOL); 4032241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4033241286Savg if (error) { 4034241286Savg mutex_exit(&spa_namespace_lock); 4035241286Savg nvlist_free(config); 4036241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4037241286Savg pname); 4038241286Savg return (error); 4039241286Savg } 4040241286Savg 4041241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4042241286Savg vdev_free(rvd); 4043241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4044241286Savg mutex_exit(&spa_namespace_lock); 4045241286Savg 4046243213Savg nvlist_free(config); 4047243213Savg return (0); 4048241286Savg} 4049241286Savg 4050241286Savg#endif /* sun */ 4051219089Spjd#endif 4052219089Spjd 4053209962Smm/* 4054209962Smm * Import a non-root pool into the system. 
4055209962Smm */ 4056185029Spjdint 4057219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4058185029Spjd{ 4059209962Smm spa_t *spa; 4060209962Smm char *altroot = NULL; 4061219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 4062219089Spjd zpool_rewind_policy_t policy; 4063219089Spjd uint64_t mode = spa_mode_global; 4064219089Spjd uint64_t readonly = B_FALSE; 4065209962Smm int error; 4066209962Smm nvlist_t *nvroot; 4067209962Smm nvlist_t **spares, **l2cache; 4068209962Smm uint_t nspares, nl2cache; 4069209962Smm 4070209962Smm /* 4071209962Smm * If a pool with this name exists, return failure. 4072209962Smm */ 4073209962Smm mutex_enter(&spa_namespace_lock); 4074219089Spjd if (spa_lookup(pool) != NULL) { 4075209962Smm mutex_exit(&spa_namespace_lock); 4076249195Smm return (SET_ERROR(EEXIST)); 4077209962Smm } 4078209962Smm 4079209962Smm /* 4080209962Smm * Create and initialize the spa structure. 4081209962Smm */ 4082209962Smm (void) nvlist_lookup_string(props, 4083209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4084219089Spjd (void) nvlist_lookup_uint64(props, 4085219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4086219089Spjd if (readonly) 4087219089Spjd mode = FREAD; 4088219089Spjd spa = spa_add(pool, config, altroot); 4089219089Spjd spa->spa_import_flags = flags; 4090209962Smm 4091209962Smm /* 4092219089Spjd * Verbatim import - Take a pool and insert it into the namespace 4093219089Spjd * as if it had been loaded at boot. 
4094219089Spjd */ 4095219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4096219089Spjd if (props != NULL) 4097219089Spjd spa_configfile_set(spa, props, B_FALSE); 4098219089Spjd 4099219089Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 4100219089Spjd 4101219089Spjd mutex_exit(&spa_namespace_lock); 4102248571Smm spa_history_log_version(spa, "import"); 4103219089Spjd 4104219089Spjd return (0); 4105219089Spjd } 4106219089Spjd 4107219089Spjd spa_activate(spa, mode); 4108219089Spjd 4109219089Spjd /* 4110209962Smm * Don't start async tasks until we know everything is healthy. 4111209962Smm */ 4112209962Smm spa_async_suspend(spa); 4113209962Smm 4114219089Spjd zpool_get_rewind_policy(config, &policy); 4115219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 4116219089Spjd state = SPA_LOAD_RECOVER; 4117219089Spjd 4118209962Smm /* 4119209962Smm * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4120209962Smm * because the user-supplied config is actually the one to trust when 4121209962Smm * doing an import. 4122209962Smm */ 4123219089Spjd if (state != SPA_LOAD_RECOVER) 4124219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4125209962Smm 4126219089Spjd error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4127219089Spjd policy.zrp_request); 4128219089Spjd 4129219089Spjd /* 4130219089Spjd * Propagate anything learned while loading the pool and pass it 4131219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 4132219089Spjd */ 4133219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4134219089Spjd spa->spa_load_info) == 0); 4135219089Spjd 4136209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4137209962Smm /* 4138209962Smm * Toss any existing sparelist, as it doesn't have any validity 4139209962Smm * anymore, and conflicts with spa_has_spare(). 
4140209962Smm */ 4141209962Smm if (spa->spa_spares.sav_config) { 4142209962Smm nvlist_free(spa->spa_spares.sav_config); 4143209962Smm spa->spa_spares.sav_config = NULL; 4144209962Smm spa_load_spares(spa); 4145209962Smm } 4146209962Smm if (spa->spa_l2cache.sav_config) { 4147209962Smm nvlist_free(spa->spa_l2cache.sav_config); 4148209962Smm spa->spa_l2cache.sav_config = NULL; 4149209962Smm spa_load_l2cache(spa); 4150209962Smm } 4151209962Smm 4152209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4153209962Smm &nvroot) == 0); 4154209962Smm if (error == 0) 4155209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4156209962Smm VDEV_ALLOC_SPARE); 4157209962Smm if (error == 0) 4158209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4159209962Smm VDEV_ALLOC_L2CACHE); 4160209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4161209962Smm 4162209962Smm if (props != NULL) 4163209962Smm spa_configfile_set(spa, props, B_FALSE); 4164209962Smm 4165209962Smm if (error != 0 || (props && spa_writeable(spa) && 4166209962Smm (error = spa_prop_set(spa, props)))) { 4167209962Smm spa_unload(spa); 4168209962Smm spa_deactivate(spa); 4169209962Smm spa_remove(spa); 4170209962Smm mutex_exit(&spa_namespace_lock); 4171209962Smm return (error); 4172209962Smm } 4173209962Smm 4174209962Smm spa_async_resume(spa); 4175209962Smm 4176209962Smm /* 4177209962Smm * Override any spares and level 2 cache devices as specified by 4178209962Smm * the user, as these may have correct device names/devids, etc. 
4179209962Smm */ 4180209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4181209962Smm &spares, &nspares) == 0) { 4182209962Smm if (spa->spa_spares.sav_config) 4183209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4184209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4185209962Smm else 4186209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4187209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4188209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4189209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4190209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4191209962Smm spa_load_spares(spa); 4192209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4193209962Smm spa->spa_spares.sav_sync = B_TRUE; 4194209962Smm } 4195209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4196209962Smm &l2cache, &nl2cache) == 0) { 4197209962Smm if (spa->spa_l2cache.sav_config) 4198209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4199209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4200209962Smm else 4201209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4202209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4203209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4204209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4205209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4206209962Smm spa_load_l2cache(spa); 4207209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4208209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 4209209962Smm } 4210209962Smm 4211219089Spjd /* 4212219089Spjd * Check for any removed devices. 4213219089Spjd */ 4214219089Spjd if (spa->spa_autoreplace) { 4215219089Spjd spa_aux_check_removed(&spa->spa_spares); 4216219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 4217219089Spjd } 4218219089Spjd 4219209962Smm if (spa_writeable(spa)) { 4220209962Smm /* 4221209962Smm * Update the config cache to include the newly-imported pool. 
4222209962Smm */ 4223209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4224209962Smm } 4225209962Smm 4226219089Spjd /* 4227219089Spjd * It's possible that the pool was expanded while it was exported. 4228219089Spjd * We kick off an async task to handle this for us. 4229219089Spjd */ 4230219089Spjd spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4231219089Spjd 4232209962Smm mutex_exit(&spa_namespace_lock); 4233248571Smm spa_history_log_version(spa, "import"); 4234209962Smm 4235219089Spjd#ifdef __FreeBSD__ 4236219089Spjd#ifdef _KERNEL 4237219089Spjd zvol_create_minors(pool); 4238219089Spjd#endif 4239219089Spjd#endif 4240209962Smm return (0); 4241185029Spjd} 4242185029Spjd 4243168404Spjdnvlist_t * 4244168404Spjdspa_tryimport(nvlist_t *tryconfig) 4245168404Spjd{ 4246168404Spjd nvlist_t *config = NULL; 4247168404Spjd char *poolname; 4248168404Spjd spa_t *spa; 4249168404Spjd uint64_t state; 4250208443Smm int error; 4251168404Spjd 4252168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4253168404Spjd return (NULL); 4254168404Spjd 4255168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4256168404Spjd return (NULL); 4257168404Spjd 4258168404Spjd /* 4259168404Spjd * Create and initialize the spa structure. 4260168404Spjd */ 4261168404Spjd mutex_enter(&spa_namespace_lock); 4262219089Spjd spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4263209962Smm spa_activate(spa, FREAD); 4264168404Spjd 4265168404Spjd /* 4266168404Spjd * Pass off the heavy lifting to spa_load(). 4267168404Spjd * Pass TRUE for mosconfig because the user-supplied config 4268168404Spjd * is actually the one to trust when doing an import. 4269168404Spjd */ 4270219089Spjd error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4271168404Spjd 4272168404Spjd /* 4273168404Spjd * If 'tryconfig' was at least parsable, return the current config. 
4274168404Spjd */ 4275168404Spjd if (spa->spa_root_vdev != NULL) { 4276168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4277168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4278168404Spjd poolname) == 0); 4279168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4280168404Spjd state) == 0); 4281168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4282168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 4283236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4284236884Smm spa->spa_load_info) == 0); 4285168404Spjd 4286168404Spjd /* 4287185029Spjd * If the bootfs property exists on this pool then we 4288185029Spjd * copy it out so that external consumers can tell which 4289185029Spjd * pools are bootable. 4290168404Spjd */ 4291208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 4292185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4293185029Spjd 4294185029Spjd /* 4295185029Spjd * We have to play games with the name since the 4296185029Spjd * pool was opened as TRYIMPORT_NAME. 4297185029Spjd */ 4298185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 4299185029Spjd spa->spa_bootfs, tmpname) == 0) { 4300185029Spjd char *cp; 4301185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4302185029Spjd 4303185029Spjd cp = strchr(tmpname, '/'); 4304185029Spjd if (cp == NULL) { 4305185029Spjd (void) strlcpy(dsname, tmpname, 4306185029Spjd MAXPATHLEN); 4307185029Spjd } else { 4308185029Spjd (void) snprintf(dsname, MAXPATHLEN, 4309185029Spjd "%s/%s", poolname, ++cp); 4310185029Spjd } 4311185029Spjd VERIFY(nvlist_add_string(config, 4312185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4313185029Spjd kmem_free(dsname, MAXPATHLEN); 4314185029Spjd } 4315185029Spjd kmem_free(tmpname, MAXPATHLEN); 4316185029Spjd } 4317185029Spjd 4318185029Spjd /* 4319185029Spjd * Add the list of hot spares and level 2 cache devices. 
4320185029Spjd */ 4321209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4322168404Spjd spa_add_spares(spa, config); 4323185029Spjd spa_add_l2cache(spa, config); 4324209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 4325168404Spjd } 4326168404Spjd 4327168404Spjd spa_unload(spa); 4328168404Spjd spa_deactivate(spa); 4329168404Spjd spa_remove(spa); 4330168404Spjd mutex_exit(&spa_namespace_lock); 4331168404Spjd 4332168404Spjd return (config); 4333168404Spjd} 4334168404Spjd 4335168404Spjd/* 4336168404Spjd * Pool export/destroy 4337168404Spjd * 4338168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 4339168404Spjd * is no more pending I/O and any references to the pool are gone. Then, we 4340168404Spjd * update the pool state and sync all the labels to disk, removing the 4341207670Smm * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4342207670Smm * we don't sync the labels or remove the configuration cache. 4343168404Spjd */ 4344168404Spjdstatic int 4345185029Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4346207670Smm boolean_t force, boolean_t hardforce) 4347168404Spjd{ 4348168404Spjd spa_t *spa; 4349168404Spjd 4350168404Spjd if (oldconfig) 4351168404Spjd *oldconfig = NULL; 4352168404Spjd 4353209962Smm if (!(spa_mode_global & FWRITE)) 4354249195Smm return (SET_ERROR(EROFS)); 4355168404Spjd 4356168404Spjd mutex_enter(&spa_namespace_lock); 4357168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 4358168404Spjd mutex_exit(&spa_namespace_lock); 4359249195Smm return (SET_ERROR(ENOENT)); 4360168404Spjd } 4361168404Spjd 4362168404Spjd /* 4363168404Spjd * Put a hold on the pool, drop the namespace lock, stop async tasks, 4364168404Spjd * reacquire the namespace lock, and see if we can export. 
4365168404Spjd */ 4366168404Spjd spa_open_ref(spa, FTAG); 4367168404Spjd mutex_exit(&spa_namespace_lock); 4368168404Spjd spa_async_suspend(spa); 4369168404Spjd mutex_enter(&spa_namespace_lock); 4370168404Spjd spa_close(spa, FTAG); 4371168404Spjd 4372168404Spjd /* 4373168404Spjd * The pool will be in core if it's openable, 4374168404Spjd * in which case we can modify its state. 4375168404Spjd */ 4376168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4377168404Spjd /* 4378168404Spjd * Objsets may be open only because they're dirty, so we 4379168404Spjd * have to force it to sync before checking spa_refcnt. 4380168404Spjd */ 4381168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 4382168404Spjd 4383168404Spjd /* 4384168404Spjd * A pool cannot be exported or destroyed if there are active 4385168404Spjd * references. If we are resetting a pool, allow references by 4386168404Spjd * fault injection handlers. 4387168404Spjd */ 4388168404Spjd if (!spa_refcount_zero(spa) || 4389168404Spjd (spa->spa_inject_ref != 0 && 4390168404Spjd new_state != POOL_STATE_UNINITIALIZED)) { 4391168404Spjd spa_async_resume(spa); 4392168404Spjd mutex_exit(&spa_namespace_lock); 4393249195Smm return (SET_ERROR(EBUSY)); 4394168404Spjd } 4395168404Spjd 4396185029Spjd /* 4397185029Spjd * A pool cannot be exported if it has an active shared spare. 4398185029Spjd * This is to prevent other pools stealing the active spare 4399185029Spjd * from an exported pool. At user's own will, such pool can 4400185029Spjd * be forcedly exported. 4401185029Spjd */ 4402185029Spjd if (!force && new_state == POOL_STATE_EXPORTED && 4403185029Spjd spa_has_active_shared_spare(spa)) { 4404185029Spjd spa_async_resume(spa); 4405185029Spjd mutex_exit(&spa_namespace_lock); 4406249195Smm return (SET_ERROR(EXDEV)); 4407185029Spjd } 4408168404Spjd 4409168404Spjd /* 4410168404Spjd * We want this to be reflected on every label, 4411168404Spjd * so mark them all dirty. 
spa_unload() will do the 4412168404Spjd * final sync that pushes these changes out. 4413168404Spjd */ 4414207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4415185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4416168404Spjd spa->spa_state = new_state; 4417219089Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 4418219089Spjd TXG_DEFER_SIZE + 1; 4419168404Spjd vdev_config_dirty(spa->spa_root_vdev); 4420185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4421168404Spjd } 4422168404Spjd } 4423168404Spjd 4424185029Spjd spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4425185029Spjd 4426168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4427168404Spjd spa_unload(spa); 4428168404Spjd spa_deactivate(spa); 4429168404Spjd } 4430168404Spjd 4431168404Spjd if (oldconfig && spa->spa_config) 4432168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4433168404Spjd 4434168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 4435207670Smm if (!hardforce) 4436207670Smm spa_config_sync(spa, B_TRUE, B_TRUE); 4437168404Spjd spa_remove(spa); 4438168404Spjd } 4439168404Spjd mutex_exit(&spa_namespace_lock); 4440168404Spjd 4441168404Spjd return (0); 4442168404Spjd} 4443168404Spjd 4444168404Spjd/* 4445168404Spjd * Destroy a storage pool. 4446168404Spjd */ 4447168404Spjdint 4448168404Spjdspa_destroy(char *pool) 4449168404Spjd{ 4450207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4451207670Smm B_FALSE, B_FALSE)); 4452168404Spjd} 4453168404Spjd 4454168404Spjd/* 4455168404Spjd * Export a storage pool. 
4456168404Spjd */ 4457168404Spjdint 4458207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4459207670Smm boolean_t hardforce) 4460168404Spjd{ 4461207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4462207670Smm force, hardforce)); 4463168404Spjd} 4464168404Spjd 4465168404Spjd/* 4466168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 4467168404Spjd * from the namespace in any way. 4468168404Spjd */ 4469168404Spjdint 4470168404Spjdspa_reset(char *pool) 4471168404Spjd{ 4472185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4473207670Smm B_FALSE, B_FALSE)); 4474168404Spjd} 4475168404Spjd 4476168404Spjd/* 4477168404Spjd * ========================================================================== 4478168404Spjd * Device manipulation 4479168404Spjd * ========================================================================== 4480168404Spjd */ 4481168404Spjd 4482168404Spjd/* 4483185029Spjd * Add a device to a storage pool. 
4484168404Spjd */ 4485168404Spjdint 4486168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4487168404Spjd{ 4488219089Spjd uint64_t txg, id; 4489209962Smm int error; 4490168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4491168404Spjd vdev_t *vd, *tvd; 4492185029Spjd nvlist_t **spares, **l2cache; 4493185029Spjd uint_t nspares, nl2cache; 4494168404Spjd 4495219089Spjd ASSERT(spa_writeable(spa)); 4496219089Spjd 4497168404Spjd txg = spa_vdev_enter(spa); 4498168404Spjd 4499168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4500168404Spjd VDEV_ALLOC_ADD)) != 0) 4501168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 4502168404Spjd 4503185029Spjd spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4504168404Spjd 4505185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4506185029Spjd &nspares) != 0) 4507168404Spjd nspares = 0; 4508168404Spjd 4509185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4510185029Spjd &nl2cache) != 0) 4511185029Spjd nl2cache = 0; 4512185029Spjd 4513185029Spjd if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4514168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4515168404Spjd 4516185029Spjd if (vd->vdev_children != 0 && 4517185029Spjd (error = vdev_create(vd, txg, B_FALSE)) != 0) 4518185029Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4519168404Spjd 4520168404Spjd /* 4521185029Spjd * We must validate the spares and l2cache devices after checking the 4522185029Spjd * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4523168404Spjd */ 4524185029Spjd if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4525168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4526168404Spjd 4527168404Spjd /* 4528168404Spjd * Transfer each new top-level vdev from vd to rvd. 
4529168404Spjd */ 4530209962Smm for (int c = 0; c < vd->vdev_children; c++) { 4531219089Spjd 4532219089Spjd /* 4533219089Spjd * Set the vdev id to the first hole, if one exists. 4534219089Spjd */ 4535219089Spjd for (id = 0; id < rvd->vdev_children; id++) { 4536219089Spjd if (rvd->vdev_child[id]->vdev_ishole) { 4537219089Spjd vdev_free(rvd->vdev_child[id]); 4538219089Spjd break; 4539219089Spjd } 4540219089Spjd } 4541168404Spjd tvd = vd->vdev_child[c]; 4542168404Spjd vdev_remove_child(vd, tvd); 4543219089Spjd tvd->vdev_id = id; 4544168404Spjd vdev_add_child(rvd, tvd); 4545168404Spjd vdev_config_dirty(tvd); 4546168404Spjd } 4547168404Spjd 4548168404Spjd if (nspares != 0) { 4549185029Spjd spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4550185029Spjd ZPOOL_CONFIG_SPARES); 4551168404Spjd spa_load_spares(spa); 4552185029Spjd spa->spa_spares.sav_sync = B_TRUE; 4553168404Spjd } 4554168404Spjd 4555185029Spjd if (nl2cache != 0) { 4556185029Spjd spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4557185029Spjd ZPOOL_CONFIG_L2CACHE); 4558185029Spjd spa_load_l2cache(spa); 4559185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 4560185029Spjd } 4561185029Spjd 4562168404Spjd /* 4563168404Spjd * We have to be careful when adding new vdevs to an existing pool. 4564168404Spjd * If other threads start allocating from these vdevs before we 4565168404Spjd * sync the config cache, and we lose power, then upon reboot we may 4566168404Spjd * fail to open the pool because there are DVAs that the config cache 4567168404Spjd * can't translate. Therefore, we first add the vdevs without 4568168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4569168404Spjd * and then let spa_config_update() initialize the new metaslabs. 
4570168404Spjd * 4571168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 4572168404Spjd * if we lose power at any point in this sequence, the remaining 4573168404Spjd * steps will be completed the next time we load the pool. 4574168404Spjd */ 4575168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 4576168404Spjd 4577168404Spjd mutex_enter(&spa_namespace_lock); 4578168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4579168404Spjd mutex_exit(&spa_namespace_lock); 4580168404Spjd 4581168404Spjd return (0); 4582168404Spjd} 4583168404Spjd 4584168404Spjd/* 4585168404Spjd * Attach a device to a mirror. The arguments are the path to any device 4586168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 4587168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 4588168404Spjd * 4589168404Spjd * If 'replacing' is specified, the new device is intended to replace the 4590168404Spjd * existing device; in this case the two devices are made into their own 4591185029Spjd * mirror using the 'replacing' vdev, which is functionally identical to 4592168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 4593168404Spjd * extra rules: you can't attach to it after it's been created, and upon 4594168404Spjd * completion of resilvering, the first disk (the one being replaced) 4595168404Spjd * is automatically detached. 
4596168404Spjd */ 4597168404Spjdint 4598168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4599168404Spjd{ 4600219089Spjd uint64_t txg, dtl_max_txg; 4601168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4602168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4603168404Spjd vdev_ops_t *pvops; 4604185029Spjd char *oldvdpath, *newvdpath; 4605185029Spjd int newvd_isspare; 4606185029Spjd int error; 4607168404Spjd 4608219089Spjd ASSERT(spa_writeable(spa)); 4609219089Spjd 4610168404Spjd txg = spa_vdev_enter(spa); 4611168404Spjd 4612185029Spjd oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4613168404Spjd 4614168404Spjd if (oldvd == NULL) 4615168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4616168404Spjd 4617168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 4618168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4619168404Spjd 4620168404Spjd pvd = oldvd->vdev_parent; 4621168404Spjd 4622168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4623230514Smm VDEV_ALLOC_ATTACH)) != 0) 4624185029Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4625185029Spjd 4626185029Spjd if (newrootvd->vdev_children != 1) 4627168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4628168404Spjd 4629168404Spjd newvd = newrootvd->vdev_child[0]; 4630168404Spjd 4631168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 4632168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4633168404Spjd 4634168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4635168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 4636168404Spjd 4637185029Spjd /* 4638185029Spjd * Spares can't replace logs 4639185029Spjd */ 4640185029Spjd if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4641185029Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4642185029Spjd 4643168404Spjd if (!replacing) { 4644168404Spjd /* 4645168404Spjd * For attach, the only allowable parent is a mirror or the root 
4646168404Spjd * vdev. 4647168404Spjd */ 4648168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 4649168404Spjd pvd->vdev_ops != &vdev_root_ops) 4650168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4651168404Spjd 4652168404Spjd pvops = &vdev_mirror_ops; 4653168404Spjd } else { 4654168404Spjd /* 4655168404Spjd * Active hot spares can only be replaced by inactive hot 4656168404Spjd * spares. 4657168404Spjd */ 4658168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4659219089Spjd oldvd->vdev_isspare && 4660168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 4661168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4662168404Spjd 4663168404Spjd /* 4664168404Spjd * If the source is a hot spare, and the parent isn't already a 4665168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 4666168404Spjd * want to create a replacing vdev. The user is not allowed to 4667168404Spjd * attach to a spared vdev child unless the 'isspare' state is 4668168404Spjd * the same (spare replaces spare, non-spare replaces 4669168404Spjd * non-spare). 4670168404Spjd */ 4671219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && 4672219089Spjd spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4673168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4674219089Spjd } else if (pvd->vdev_ops == &vdev_spare_ops && 4675219089Spjd newvd->vdev_isspare != oldvd->vdev_isspare) { 4676168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4677219089Spjd } 4678219089Spjd 4679219089Spjd if (newvd->vdev_isspare) 4680168404Spjd pvops = &vdev_spare_ops; 4681168404Spjd else 4682168404Spjd pvops = &vdev_replacing_ops; 4683168404Spjd } 4684168404Spjd 4685168404Spjd /* 4686219089Spjd * Make sure the new device is big enough. 
4687168404Spjd */ 4688219089Spjd if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4689168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4690168404Spjd 4691168404Spjd /* 4692168404Spjd * The new device cannot have a higher alignment requirement 4693168404Spjd * than the top-level vdev. 4694168404Spjd */ 4695168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4696168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4697168404Spjd 4698168404Spjd /* 4699168404Spjd * If this is an in-place replacement, update oldvd's path and devid 4700168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 4701168404Spjd */ 4702168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4703168404Spjd spa_strfree(oldvd->vdev_path); 4704168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4705168404Spjd KM_SLEEP); 4706168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 4707168404Spjd newvd->vdev_path, "old"); 4708168404Spjd if (oldvd->vdev_devid != NULL) { 4709168404Spjd spa_strfree(oldvd->vdev_devid); 4710168404Spjd oldvd->vdev_devid = NULL; 4711168404Spjd } 4712168404Spjd } 4713168404Spjd 4714219089Spjd /* mark the device being resilvered */ 4715254112Sdelphij newvd->vdev_resilver_txg = txg; 4716219089Spjd 4717168404Spjd /* 4718168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 4719168404Spjd * mirror/replacing/spare vdev above oldvd. 4720168404Spjd */ 4721168404Spjd if (pvd->vdev_ops != pvops) 4722168404Spjd pvd = vdev_add_parent(oldvd, pvops); 4723168404Spjd 4724168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 4725168404Spjd ASSERT(pvd->vdev_ops == pvops); 4726168404Spjd ASSERT(oldvd->vdev_parent == pvd); 4727168404Spjd 4728168404Spjd /* 4729168404Spjd * Extract the new device from its root and add it to pvd. 
4730168404Spjd */ 4731168404Spjd vdev_remove_child(newrootvd, newvd); 4732168404Spjd newvd->vdev_id = pvd->vdev_children; 4733219089Spjd newvd->vdev_crtxg = oldvd->vdev_crtxg; 4734168404Spjd vdev_add_child(pvd, newvd); 4735168404Spjd 4736168404Spjd tvd = newvd->vdev_top; 4737168404Spjd ASSERT(pvd->vdev_top == tvd); 4738168404Spjd ASSERT(tvd->vdev_parent == rvd); 4739168404Spjd 4740168404Spjd vdev_config_dirty(tvd); 4741168404Spjd 4742168404Spjd /* 4743219089Spjd * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4744219089Spjd * for any dmu_sync-ed blocks. It will propagate upward when 4745219089Spjd * spa_vdev_exit() calls vdev_dtl_reassess(). 4746168404Spjd */ 4747219089Spjd dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4748168404Spjd 4749219089Spjd vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4750219089Spjd dtl_max_txg - TXG_INITIAL); 4751168404Spjd 4752209962Smm if (newvd->vdev_isspare) { 4753168404Spjd spa_spare_activate(newvd); 4754209962Smm spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4755209962Smm } 4756209962Smm 4757185029Spjd oldvdpath = spa_strdup(oldvd->vdev_path); 4758185029Spjd newvdpath = spa_strdup(newvd->vdev_path); 4759185029Spjd newvd_isspare = newvd->vdev_isspare; 4760168404Spjd 4761168404Spjd /* 4762168404Spjd * Mark newvd's DTL dirty in this txg. 4763168404Spjd */ 4764168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 4765168404Spjd 4766219089Spjd /* 4767219089Spjd * Restart the resilver 4768219089Spjd */ 4769219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4770168404Spjd 4771219089Spjd /* 4772219089Spjd * Commit the config 4773219089Spjd */ 4774219089Spjd (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4775185029Spjd 4776248571Smm spa_history_log_internal(spa, "vdev attach", NULL, 4777219089Spjd "%s vdev=%s %s vdev=%s", 4778219089Spjd replacing && newvd_isspare ? "spare in" : 4779219089Spjd replacing ? "replace" : "attach", newvdpath, 4780219089Spjd replacing ? 
"for" : "to", oldvdpath); 4781219089Spjd 4782185029Spjd spa_strfree(oldvdpath); 4783185029Spjd spa_strfree(newvdpath); 4784185029Spjd 4785219089Spjd if (spa->spa_bootfs) 4786219089Spjd spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4787168404Spjd 4788168404Spjd return (0); 4789168404Spjd} 4790168404Spjd 4791168404Spjd/* 4792168404Spjd * Detach a device from a mirror or replacing vdev. 4793251631Sdelphij * 4794168404Spjd * If 'replace_done' is specified, only detach if the parent 4795168404Spjd * is a replacing vdev. 4796168404Spjd */ 4797168404Spjdint 4798209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4799168404Spjd{ 4800168404Spjd uint64_t txg; 4801209962Smm int error; 4802168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4803168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 4804168404Spjd boolean_t unspare = B_FALSE; 4805247187Smm uint64_t unspare_guid = 0; 4806219089Spjd char *vdpath; 4807168404Spjd 4808219089Spjd ASSERT(spa_writeable(spa)); 4809219089Spjd 4810168404Spjd txg = spa_vdev_enter(spa); 4811168404Spjd 4812185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4813168404Spjd 4814168404Spjd if (vd == NULL) 4815168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4816168404Spjd 4817168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 4818168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4819168404Spjd 4820168404Spjd pvd = vd->vdev_parent; 4821168404Spjd 4822168404Spjd /* 4823209962Smm * If the parent/child relationship is not as expected, don't do it. 4824209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4825209962Smm * vdev that's replacing B with C. The user's intent in replacing 4826209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 4827209962Smm * the replace by detaching C, the expected behavior is to end up 4828209962Smm * M(A,B). But suppose that right after deciding to detach C, 4829209962Smm * the replacement of B completes. 
We would have M(A,C), and then 4830209962Smm * ask to detach C, which would leave us with just A -- not what 4831209962Smm * the user wanted. To prevent this, we make sure that the 4832209962Smm * parent/child relationship hasn't changed -- in this example, 4833209962Smm * that C's parent is still the replacing vdev R. 4834209962Smm */ 4835209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 4836209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4837209962Smm 4838209962Smm /* 4839219089Spjd * Only 'replacing' or 'spare' vdevs can be replaced. 4840168404Spjd */ 4841219089Spjd if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4842219089Spjd pvd->vdev_ops != &vdev_spare_ops) 4843219089Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4844168404Spjd 4845168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4846185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 4847168404Spjd 4848168404Spjd /* 4849168404Spjd * Only mirror, replacing, and spare vdevs support detach. 4850168404Spjd */ 4851168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 4852168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 4853168404Spjd pvd->vdev_ops != &vdev_spare_ops) 4854168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4855168404Spjd 4856168404Spjd /* 4857209962Smm * If this device has the only valid copy of some data, 4858209962Smm * we cannot safely detach it. 4859168404Spjd */ 4860209962Smm if (vdev_dtl_required(vd)) 4861168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4862168404Spjd 4863209962Smm ASSERT(pvd->vdev_children >= 2); 4864168404Spjd 4865168404Spjd /* 4866185029Spjd * If we are detaching the second disk from a replacing vdev, then 4867185029Spjd * check to see if we changed the original vdev's path to have "/old" 4868185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 
4869168404Spjd */ 4870219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4871219089Spjd vd->vdev_path != NULL) { 4872219089Spjd size_t len = strlen(vd->vdev_path); 4873219089Spjd 4874219089Spjd for (int c = 0; c < pvd->vdev_children; c++) { 4875219089Spjd cvd = pvd->vdev_child[c]; 4876219089Spjd 4877219089Spjd if (cvd == vd || cvd->vdev_path == NULL) 4878219089Spjd continue; 4879219089Spjd 4880219089Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4881219089Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 4882219089Spjd spa_strfree(cvd->vdev_path); 4883219089Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 4884219089Spjd break; 4885219089Spjd } 4886185029Spjd } 4887185029Spjd } 4888168404Spjd 4889168404Spjd /* 4890168404Spjd * If we are detaching the original disk from a spare, then it implies 4891168404Spjd * that the spare should become a real disk, and be removed from the 4892168404Spjd * active spare list for the pool. 4893168404Spjd */ 4894168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4895219089Spjd vd->vdev_id == 0 && 4896219089Spjd pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4897168404Spjd unspare = B_TRUE; 4898168404Spjd 4899168404Spjd /* 4900168404Spjd * Erase the disk labels so the disk can be used for other things. 4901168404Spjd * This must be done after all other error cases are handled, 4902168404Spjd * but before we disembowel vd (so we can still do I/O to it). 4903168404Spjd * But if we can't do it, don't treat the error as fatal -- 4904168404Spjd * it may be that the unwritability of the disk is the reason 4905168404Spjd * it's being detached! 4906168404Spjd */ 4907168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4908168404Spjd 4909168404Spjd /* 4910168404Spjd * Remove vd from its parent and compact the parent's children. 
4911168404Spjd */ 4912168404Spjd vdev_remove_child(pvd, vd); 4913168404Spjd vdev_compact_children(pvd); 4914168404Spjd 4915168404Spjd /* 4916168404Spjd * Remember one of the remaining children so we can get tvd below. 4917168404Spjd */ 4918219089Spjd cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4919168404Spjd 4920168404Spjd /* 4921168404Spjd * If we need to remove the remaining child from the list of hot spares, 4922209962Smm * do it now, marking the vdev as no longer a spare in the process. 4923209962Smm * We must do this before vdev_remove_parent(), because that can 4924209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 4925209962Smm * reason, we must remove the spare now, in the same txg as the detach; 4926209962Smm * otherwise someone could attach a new sibling, change the GUID, and 4927209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4928168404Spjd */ 4929168404Spjd if (unspare) { 4930168404Spjd ASSERT(cvd->vdev_isspare); 4931168404Spjd spa_spare_remove(cvd); 4932168404Spjd unspare_guid = cvd->vdev_guid; 4933209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4934219089Spjd cvd->vdev_unspare = B_TRUE; 4935168404Spjd } 4936168404Spjd 4937168404Spjd /* 4938168404Spjd * If the parent mirror/replacing vdev only has one child, 4939168404Spjd * the parent is no longer needed. Remove it from the tree. 4940168404Spjd */ 4941219089Spjd if (pvd->vdev_children == 1) { 4942219089Spjd if (pvd->vdev_ops == &vdev_spare_ops) 4943219089Spjd cvd->vdev_unspare = B_FALSE; 4944168404Spjd vdev_remove_parent(cvd); 4945219089Spjd } 4946168404Spjd 4947219089Spjd 4948168404Spjd /* 4949168404Spjd * We don't set tvd until now because the parent we just removed 4950168404Spjd * may have been the previous top-level vdev. 4951168404Spjd */ 4952168404Spjd tvd = cvd->vdev_top; 4953168404Spjd ASSERT(tvd->vdev_parent == rvd); 4954168404Spjd 4955168404Spjd /* 4956168404Spjd * Reevaluate the parent vdev state. 
4957168404Spjd */ 4958185029Spjd vdev_propagate_state(cvd); 4959168404Spjd 4960168404Spjd /* 4961219089Spjd * If the 'autoexpand' property is set on the pool then automatically 4962219089Spjd * try to expand the size of the pool. For example if the device we 4963219089Spjd * just detached was smaller than the others, it may be possible to 4964219089Spjd * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4965219089Spjd * first so that we can obtain the updated sizes of the leaf vdevs. 4966168404Spjd */ 4967219089Spjd if (spa->spa_autoexpand) { 4968219089Spjd vdev_reopen(tvd); 4969219089Spjd vdev_expand(tvd, txg); 4970219089Spjd } 4971168404Spjd 4972168404Spjd vdev_config_dirty(tvd); 4973168404Spjd 4974168404Spjd /* 4975168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4976168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 4977168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 4978168404Spjd * prevent vd from being accessed after it's freed. 4979168404Spjd */ 4980219089Spjd vdpath = spa_strdup(vd->vdev_path); 4981209962Smm for (int t = 0; t < TXG_SIZE; t++) 4982168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4983168404Spjd vd->vdev_detached = B_TRUE; 4984168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 4985168404Spjd 4986185029Spjd spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4987185029Spjd 4988219089Spjd /* hang on to the spa before we release the lock */ 4989219089Spjd spa_open_ref(spa, FTAG); 4990219089Spjd 4991168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 4992168404Spjd 4993248571Smm spa_history_log_internal(spa, "detach", NULL, 4994219089Spjd "vdev=%s", vdpath); 4995219089Spjd spa_strfree(vdpath); 4996219089Spjd 4997168404Spjd /* 4998168404Spjd * If this was the removal of the original device in a hot spare vdev, 4999168404Spjd * then we want to go through and remove the device from the hot spare 5000168404Spjd * list of every other pool. 
5001168404Spjd */ 5002168404Spjd if (unspare) { 5003219089Spjd spa_t *altspa = NULL; 5004219089Spjd 5005168404Spjd mutex_enter(&spa_namespace_lock); 5006219089Spjd while ((altspa = spa_next(altspa)) != NULL) { 5007219089Spjd if (altspa->spa_state != POOL_STATE_ACTIVE || 5008219089Spjd altspa == spa) 5009168404Spjd continue; 5010219089Spjd 5011219089Spjd spa_open_ref(altspa, FTAG); 5012185029Spjd mutex_exit(&spa_namespace_lock); 5013219089Spjd (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5014185029Spjd mutex_enter(&spa_namespace_lock); 5015219089Spjd spa_close(altspa, FTAG); 5016168404Spjd } 5017168404Spjd mutex_exit(&spa_namespace_lock); 5018219089Spjd 5019219089Spjd /* search the rest of the vdevs for spares to remove */ 5020219089Spjd spa_vdev_resilver_done(spa); 5021168404Spjd } 5022168404Spjd 5023219089Spjd /* all done with the spa; OK to release */ 5024219089Spjd mutex_enter(&spa_namespace_lock); 5025219089Spjd spa_close(spa, FTAG); 5026219089Spjd mutex_exit(&spa_namespace_lock); 5027219089Spjd 5028168404Spjd return (error); 5029168404Spjd} 5030168404Spjd 5031219089Spjd/* 5032219089Spjd * Split a set of devices from their mirrors, and create a new pool from them. 
5033219089Spjd */ 5034219089Spjdint 5035219089Spjdspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5036219089Spjd nvlist_t *props, boolean_t exp) 5037219089Spjd{ 5038219089Spjd int error = 0; 5039219089Spjd uint64_t txg, *glist; 5040219089Spjd spa_t *newspa; 5041219089Spjd uint_t c, children, lastlog; 5042219089Spjd nvlist_t **child, *nvl, *tmp; 5043219089Spjd dmu_tx_t *tx; 5044219089Spjd char *altroot = NULL; 5045219089Spjd vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5046219089Spjd boolean_t activate_slog; 5047219089Spjd 5048219089Spjd ASSERT(spa_writeable(spa)); 5049219089Spjd 5050219089Spjd txg = spa_vdev_enter(spa); 5051219089Spjd 5052219089Spjd /* clear the log and flush everything up to now */ 5053219089Spjd activate_slog = spa_passivate_log(spa); 5054219089Spjd (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5055219089Spjd error = spa_offline_log(spa); 5056219089Spjd txg = spa_vdev_config_enter(spa); 5057219089Spjd 5058219089Spjd if (activate_slog) 5059219089Spjd spa_activate_log(spa); 5060219089Spjd 5061219089Spjd if (error != 0) 5062219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5063219089Spjd 5064219089Spjd /* check new spa name before going any further */ 5065219089Spjd if (spa_lookup(newname) != NULL) 5066219089Spjd return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5067219089Spjd 5068219089Spjd /* 5069219089Spjd * scan through all the children to ensure they're all mirrors 5070219089Spjd */ 5071219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5072219089Spjd nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5073219089Spjd &children) != 0) 5074219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5075219089Spjd 5076219089Spjd /* first, check to ensure we've got the right child count */ 5077219089Spjd rvd = spa->spa_root_vdev; 5078219089Spjd lastlog = 0; 5079219089Spjd for (c = 0; c < rvd->vdev_children; c++) { 5080219089Spjd vdev_t *vd = rvd->vdev_child[c]; 
5081219089Spjd 5082219089Spjd /* don't count the holes & logs as children */ 5083219089Spjd if (vd->vdev_islog || vd->vdev_ishole) { 5084219089Spjd if (lastlog == 0) 5085219089Spjd lastlog = c; 5086219089Spjd continue; 5087219089Spjd } 5088219089Spjd 5089219089Spjd lastlog = 0; 5090219089Spjd } 5091219089Spjd if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 5092219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5093219089Spjd 5094219089Spjd /* next, ensure no spare or cache devices are part of the split */ 5095219089Spjd if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5096219089Spjd nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5097219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5098219089Spjd 5099219089Spjd vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5100219089Spjd glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5101219089Spjd 5102219089Spjd /* then, loop over each vdev and validate it */ 5103219089Spjd for (c = 0; c < children; c++) { 5104219089Spjd uint64_t is_hole = 0; 5105219089Spjd 5106219089Spjd (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5107219089Spjd &is_hole); 5108219089Spjd 5109219089Spjd if (is_hole != 0) { 5110219089Spjd if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5111219089Spjd spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5112219089Spjd continue; 5113219089Spjd } else { 5114249195Smm error = SET_ERROR(EINVAL); 5115219089Spjd break; 5116219089Spjd } 5117219089Spjd } 5118219089Spjd 5119219089Spjd /* which disk is going to be split? 
*/ 5120219089Spjd if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5121219089Spjd &glist[c]) != 0) { 5122249195Smm error = SET_ERROR(EINVAL); 5123219089Spjd break; 5124219089Spjd } 5125219089Spjd 5126219089Spjd /* look it up in the spa */ 5127219089Spjd vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5128219089Spjd if (vml[c] == NULL) { 5129249195Smm error = SET_ERROR(ENODEV); 5130219089Spjd break; 5131219089Spjd } 5132219089Spjd 5133219089Spjd /* make sure there's nothing stopping the split */ 5134219089Spjd if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5135219089Spjd vml[c]->vdev_islog || 5136219089Spjd vml[c]->vdev_ishole || 5137219089Spjd vml[c]->vdev_isspare || 5138219089Spjd vml[c]->vdev_isl2cache || 5139219089Spjd !vdev_writeable(vml[c]) || 5140219089Spjd vml[c]->vdev_children != 0 || 5141219089Spjd vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5142219089Spjd c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5143249195Smm error = SET_ERROR(EINVAL); 5144219089Spjd break; 5145219089Spjd } 5146219089Spjd 5147219089Spjd if (vdev_dtl_required(vml[c])) { 5148249195Smm error = SET_ERROR(EBUSY); 5149219089Spjd break; 5150219089Spjd } 5151219089Spjd 5152219089Spjd /* we need certain info from the top level */ 5153219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5154219089Spjd vml[c]->vdev_top->vdev_ms_array) == 0); 5155219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5156219089Spjd vml[c]->vdev_top->vdev_ms_shift) == 0); 5157219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5158219089Spjd vml[c]->vdev_top->vdev_asize) == 0); 5159219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5160219089Spjd vml[c]->vdev_top->vdev_ashift) == 0); 5161219089Spjd } 5162219089Spjd 5163219089Spjd if (error != 0) { 5164219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5165219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5166219089Spjd return (spa_vdev_exit(spa, NULL, 
txg, error)); 5167219089Spjd } 5168219089Spjd 5169219089Spjd /* stop writers from using the disks */ 5170219089Spjd for (c = 0; c < children; c++) { 5171219089Spjd if (vml[c] != NULL) 5172219089Spjd vml[c]->vdev_offline = B_TRUE; 5173219089Spjd } 5174219089Spjd vdev_reopen(spa->spa_root_vdev); 5175219089Spjd 5176219089Spjd /* 5177219089Spjd * Temporarily record the splitting vdevs in the spa config. This 5178219089Spjd * will disappear once the config is regenerated. 5179219089Spjd */ 5180219089Spjd VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5181219089Spjd VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5182219089Spjd glist, children) == 0); 5183219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5184219089Spjd 5185219089Spjd mutex_enter(&spa->spa_props_lock); 5186219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5187219089Spjd nvl) == 0); 5188219089Spjd mutex_exit(&spa->spa_props_lock); 5189219089Spjd spa->spa_config_splitting = nvl; 5190219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5191219089Spjd 5192219089Spjd /* configure and create the new pool */ 5193219089Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5194219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5195219089Spjd exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5196219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5197219089Spjd spa_version(spa)) == 0); 5198219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5199219089Spjd spa->spa_config_txg) == 0); 5200219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5201219089Spjd spa_generate_guid(NULL)) == 0); 5202219089Spjd (void) nvlist_lookup_string(props, 5203219089Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5204219089Spjd 5205219089Spjd /* add the new pool to the namespace */ 5206219089Spjd newspa = spa_add(newname, config, altroot); 5207219089Spjd newspa->spa_config_txg = spa->spa_config_txg; 5208219089Spjd spa_set_log_state(newspa, SPA_LOG_CLEAR); 5209219089Spjd 5210219089Spjd /* release the spa config lock, retaining the namespace lock */ 5211219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5212219089Spjd 5213219089Spjd if (zio_injection_enabled) 5214219089Spjd zio_handle_panic_injection(spa, FTAG, 1); 5215219089Spjd 5216219089Spjd spa_activate(newspa, spa_mode_global); 5217219089Spjd spa_async_suspend(newspa); 5218219089Spjd 5219219089Spjd#ifndef sun 5220219089Spjd /* mark that we are creating new spa by splitting */ 5221219089Spjd newspa->spa_splitting_newspa = B_TRUE; 5222219089Spjd#endif 5223219089Spjd /* create the new pool from the disks of the original pool */ 5224219089Spjd error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5225219089Spjd#ifndef sun 5226219089Spjd newspa->spa_splitting_newspa = B_FALSE; 5227219089Spjd#endif 5228219089Spjd if (error) 5229219089Spjd goto out; 5230219089Spjd 5231219089Spjd /* if that worked, generate a real config for the new pool */ 5232219089Spjd if (newspa->spa_root_vdev != NULL) { 5233219089Spjd VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5234219089Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 5235219089Spjd VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5236219089Spjd 
ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5237219089Spjd spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5238219089Spjd B_TRUE)); 5239219089Spjd } 5240219089Spjd 5241219089Spjd /* set the props */ 5242219089Spjd if (props != NULL) { 5243219089Spjd spa_configfile_set(newspa, props, B_FALSE); 5244219089Spjd error = spa_prop_set(newspa, props); 5245219089Spjd if (error) 5246219089Spjd goto out; 5247219089Spjd } 5248219089Spjd 5249219089Spjd /* flush everything */ 5250219089Spjd txg = spa_vdev_config_enter(newspa); 5251219089Spjd vdev_config_dirty(newspa->spa_root_vdev); 5252219089Spjd (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5253219089Spjd 5254219089Spjd if (zio_injection_enabled) 5255219089Spjd zio_handle_panic_injection(spa, FTAG, 2); 5256219089Spjd 5257219089Spjd spa_async_resume(newspa); 5258219089Spjd 5259219089Spjd /* finally, update the original pool's config */ 5260219089Spjd txg = spa_vdev_config_enter(spa); 5261219089Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5262219089Spjd error = dmu_tx_assign(tx, TXG_WAIT); 5263219089Spjd if (error != 0) 5264219089Spjd dmu_tx_abort(tx); 5265219089Spjd for (c = 0; c < children; c++) { 5266219089Spjd if (vml[c] != NULL) { 5267219089Spjd vdev_split(vml[c]); 5268219089Spjd if (error == 0) 5269248571Smm spa_history_log_internal(spa, "detach", tx, 5270248571Smm "vdev=%s", vml[c]->vdev_path); 5271219089Spjd vdev_free(vml[c]); 5272219089Spjd } 5273219089Spjd } 5274219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5275219089Spjd spa->spa_config_splitting = NULL; 5276219089Spjd nvlist_free(nvl); 5277219089Spjd if (error == 0) 5278219089Spjd dmu_tx_commit(tx); 5279219089Spjd (void) spa_vdev_exit(spa, NULL, txg, 0); 5280219089Spjd 5281219089Spjd if (zio_injection_enabled) 5282219089Spjd zio_handle_panic_injection(spa, FTAG, 3); 5283219089Spjd 5284219089Spjd /* split is complete; log a history record */ 5285248571Smm spa_history_log_internal(newspa, "split", NULL, 5286248571Smm "from 
pool %s", spa_name(spa)); 5287219089Spjd 5288219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5289219089Spjd 5290219089Spjd /* if we're not going to mount the filesystems in userland, export */ 5291219089Spjd if (exp) 5292219089Spjd error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5293219089Spjd B_FALSE, B_FALSE); 5294219089Spjd 5295219089Spjd return (error); 5296219089Spjd 5297219089Spjdout: 5298219089Spjd spa_unload(newspa); 5299219089Spjd spa_deactivate(newspa); 5300219089Spjd spa_remove(newspa); 5301219089Spjd 5302219089Spjd txg = spa_vdev_config_enter(spa); 5303219089Spjd 5304219089Spjd /* re-online all offlined disks */ 5305219089Spjd for (c = 0; c < children; c++) { 5306219089Spjd if (vml[c] != NULL) 5307219089Spjd vml[c]->vdev_offline = B_FALSE; 5308219089Spjd } 5309219089Spjd vdev_reopen(spa->spa_root_vdev); 5310219089Spjd 5311219089Spjd nvlist_free(spa->spa_config_splitting); 5312219089Spjd spa->spa_config_splitting = NULL; 5313219089Spjd (void) spa_vdev_exit(spa, NULL, txg, error); 5314219089Spjd 5315219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5316219089Spjd return (error); 5317219089Spjd} 5318219089Spjd 5319185029Spjdstatic nvlist_t * 5320185029Spjdspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5321185029Spjd{ 5322185029Spjd for (int i = 0; i < count; i++) { 5323185029Spjd uint64_t guid; 5324185029Spjd 5325185029Spjd VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5326185029Spjd &guid) == 0); 5327185029Spjd 5328185029Spjd if (guid == target_guid) 5329185029Spjd return (nvpp[i]); 5330185029Spjd } 5331185029Spjd 5332185029Spjd return (NULL); 5333185029Spjd} 5334185029Spjd 5335185029Spjdstatic void 5336185029Spjdspa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5337185029Spjd nvlist_t *dev_to_remove) 5338185029Spjd{ 5339185029Spjd nvlist_t **newdev = NULL; 5340185029Spjd 5341185029Spjd if (count > 1) 5342185029Spjd newdev = kmem_alloc((count - 1) * sizeof 
(void *), KM_SLEEP); 5343185029Spjd 5344185029Spjd for (int i = 0, j = 0; i < count; i++) { 5345185029Spjd if (dev[i] == dev_to_remove) 5346185029Spjd continue; 5347185029Spjd VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5348185029Spjd } 5349185029Spjd 5350185029Spjd VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5351185029Spjd VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5352185029Spjd 5353185029Spjd for (int i = 0; i < count - 1; i++) 5354185029Spjd nvlist_free(newdev[i]); 5355185029Spjd 5356185029Spjd if (count > 1) 5357185029Spjd kmem_free(newdev, (count - 1) * sizeof (void *)); 5358185029Spjd} 5359185029Spjd 5360168404Spjd/* 5361219089Spjd * Evacuate the device. 5362219089Spjd */ 5363219089Spjdstatic int 5364219089Spjdspa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5365219089Spjd{ 5366219089Spjd uint64_t txg; 5367219089Spjd int error = 0; 5368219089Spjd 5369219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5370219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5371219089Spjd ASSERT(vd == vd->vdev_top); 5372219089Spjd 5373219089Spjd /* 5374219089Spjd * Evacuate the device. We don't hold the config lock as writer 5375219089Spjd * since we need to do I/O but we do keep the 5376219089Spjd * spa_namespace_lock held. Once this completes the device 5377219089Spjd * should no longer have any blocks allocated on it. 5378219089Spjd */ 5379219089Spjd if (vd->vdev_islog) { 5380219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5381219089Spjd error = spa_offline_log(spa); 5382219089Spjd } else { 5383249195Smm error = SET_ERROR(ENOTSUP); 5384219089Spjd } 5385219089Spjd 5386219089Spjd if (error) 5387219089Spjd return (error); 5388219089Spjd 5389219089Spjd /* 5390219089Spjd * The evacuation succeeded. Remove any remaining MOS metadata 5391219089Spjd * associated with this vdev, and wait for these changes to sync. 
5392219089Spjd */ 5393240415Smm ASSERT0(vd->vdev_stat.vs_alloc); 5394219089Spjd txg = spa_vdev_config_enter(spa); 5395219089Spjd vd->vdev_removing = B_TRUE; 5396219089Spjd vdev_dirty(vd, 0, NULL, txg); 5397219089Spjd vdev_config_dirty(vd); 5398219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5399219089Spjd 5400219089Spjd return (0); 5401219089Spjd} 5402219089Spjd 5403219089Spjd/* 5404219089Spjd * Complete the removal by cleaning up the namespace. 5405219089Spjd */ 5406219089Spjdstatic void 5407219089Spjdspa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5408219089Spjd{ 5409219089Spjd vdev_t *rvd = spa->spa_root_vdev; 5410219089Spjd uint64_t id = vd->vdev_id; 5411219089Spjd boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5412219089Spjd 5413219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5414219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5415219089Spjd ASSERT(vd == vd->vdev_top); 5416219089Spjd 5417219089Spjd /* 5418219089Spjd * Only remove any devices which are empty. 5419219089Spjd */ 5420219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5421219089Spjd return; 5422219089Spjd 5423219089Spjd (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5424219089Spjd 5425219089Spjd if (list_link_active(&vd->vdev_state_dirty_node)) 5426219089Spjd vdev_state_clean(vd); 5427219089Spjd if (list_link_active(&vd->vdev_config_dirty_node)) 5428219089Spjd vdev_config_clean(vd); 5429219089Spjd 5430219089Spjd vdev_free(vd); 5431219089Spjd 5432219089Spjd if (last_vdev) { 5433219089Spjd vdev_compact_children(rvd); 5434219089Spjd } else { 5435219089Spjd vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5436219089Spjd vdev_add_child(rvd, vd); 5437219089Spjd } 5438219089Spjd vdev_config_dirty(rvd); 5439219089Spjd 5440219089Spjd /* 5441219089Spjd * Reassess the health of our root vdev. 
5442219089Spjd */ 5443219089Spjd vdev_reopen(rvd); 5444219089Spjd} 5445219089Spjd 5446219089Spjd/* 5447219089Spjd * Remove a device from the pool - 5448219089Spjd * 5449219089Spjd * Removing a device from the vdev namespace requires several steps 5450219089Spjd * and can take a significant amount of time. As a result we use 5451219089Spjd * the spa_vdev_config_[enter/exit] functions which allow us to 5452219089Spjd * grab and release the spa_config_lock while still holding the namespace 5453219089Spjd * lock. During each step the configuration is synced out. 5454251631Sdelphij * 5455251631Sdelphij * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5456251631Sdelphij * devices. 5457219089Spjd */ 5458168404Spjdint 5459168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5460168404Spjd{ 5461168404Spjd vdev_t *vd; 5462219089Spjd metaslab_group_t *mg; 5463185029Spjd nvlist_t **spares, **l2cache, *nv; 5464219089Spjd uint64_t txg = 0; 5465185029Spjd uint_t nspares, nl2cache; 5466185029Spjd int error = 0; 5467209962Smm boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5468168404Spjd 5469219089Spjd ASSERT(spa_writeable(spa)); 5470219089Spjd 5471209962Smm if (!locked) 5472209962Smm txg = spa_vdev_enter(spa); 5473168404Spjd 5474185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5475168404Spjd 5476185029Spjd if (spa->spa_spares.sav_vdevs != NULL && 5477185029Spjd nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5478185029Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5479185029Spjd (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5480185029Spjd /* 5481185029Spjd * Only remove the hot spare if it's not currently in use 5482185029Spjd * in this pool. 
5483185029Spjd */ 5484185029Spjd if (vd == NULL || unspare) { 5485185029Spjd spa_vdev_remove_aux(spa->spa_spares.sav_config, 5486185029Spjd ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5487185029Spjd spa_load_spares(spa); 5488185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5489185029Spjd } else { 5490249195Smm error = SET_ERROR(EBUSY); 5491168404Spjd } 5492185029Spjd } else if (spa->spa_l2cache.sav_vdevs != NULL && 5493185029Spjd nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5494185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5495185029Spjd (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5496185029Spjd /* 5497185029Spjd * Cache devices can always be removed. 5498185029Spjd */ 5499185029Spjd spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5500185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5501185029Spjd spa_load_l2cache(spa); 5502185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5503219089Spjd } else if (vd != NULL && vd->vdev_islog) { 5504219089Spjd ASSERT(!locked); 5505219089Spjd ASSERT(vd == vd->vdev_top); 5506219089Spjd 5507219089Spjd /* 5508219089Spjd * XXX - Once we have bp-rewrite this should 5509219089Spjd * become the common case. 5510219089Spjd */ 5511219089Spjd 5512219089Spjd mg = vd->vdev_mg; 5513219089Spjd 5514219089Spjd /* 5515219089Spjd * Stop allocating from this vdev. 5516219089Spjd */ 5517219089Spjd metaslab_group_passivate(mg); 5518219089Spjd 5519219089Spjd /* 5520219089Spjd * Wait for the youngest allocations and frees to sync, 5521219089Spjd * and then wait for the deferral of those frees to finish. 5522219089Spjd */ 5523219089Spjd spa_vdev_config_exit(spa, NULL, 5524219089Spjd txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5525219089Spjd 5526219089Spjd /* 5527219089Spjd * Attempt to evacuate the vdev. 
5528219089Spjd */ 5529219089Spjd error = spa_vdev_remove_evacuate(spa, vd); 5530219089Spjd 5531219089Spjd txg = spa_vdev_config_enter(spa); 5532219089Spjd 5533219089Spjd /* 5534219089Spjd * If we couldn't evacuate the vdev, unwind. 5535219089Spjd */ 5536219089Spjd if (error) { 5537219089Spjd metaslab_group_activate(mg); 5538219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5539219089Spjd } 5540219089Spjd 5541219089Spjd /* 5542219089Spjd * Clean up the vdev namespace. 5543219089Spjd */ 5544219089Spjd spa_vdev_remove_from_namespace(spa, vd); 5545219089Spjd 5546185029Spjd } else if (vd != NULL) { 5547185029Spjd /* 5548185029Spjd * Normal vdevs cannot be removed (yet). 5549185029Spjd */ 5550249195Smm error = SET_ERROR(ENOTSUP); 5551168404Spjd } else { 5552185029Spjd /* 5553185029Spjd * There is no vdev of any kind with the specified guid. 5554185029Spjd */ 5555249195Smm error = SET_ERROR(ENOENT); 5556168404Spjd } 5557168404Spjd 5558209962Smm if (!locked) 5559209962Smm return (spa_vdev_exit(spa, NULL, txg, error)); 5560209962Smm 5561209962Smm return (error); 5562168404Spjd} 5563168404Spjd 5564168404Spjd/* 5565185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 5566251631Sdelphij * currently spared, so we can detach it. 5567168404Spjd */ 5568168404Spjdstatic vdev_t * 5569185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 5570168404Spjd{ 5571168404Spjd vdev_t *newvd, *oldvd; 5572168404Spjd 5573219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5574185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5575168404Spjd if (oldvd != NULL) 5576168404Spjd return (oldvd); 5577168404Spjd } 5578168404Spjd 5579185029Spjd /* 5580219089Spjd * Check for a completed replacement. We always consider the first 5581219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 5582219089Spjd * the newest (see spa_vdev_attach() for how that works). 
In 5583219089Spjd * the case where the newest vdev is faulted, we will not automatically 5584219089Spjd * remove it after a resilver completes. This is OK as it will require 5585219089Spjd * user intervention to determine which disk the admin wishes to keep. 5586185029Spjd */ 5587219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 5588219089Spjd ASSERT(vd->vdev_children > 1); 5589219089Spjd 5590219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 5591168404Spjd oldvd = vd->vdev_child[0]; 5592168404Spjd 5593209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 5594219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5595209962Smm !vdev_dtl_required(oldvd)) 5596168404Spjd return (oldvd); 5597168404Spjd } 5598168404Spjd 5599185029Spjd /* 5600185029Spjd * Check for a completed resilver with the 'unspare' flag set. 5601185029Spjd */ 5602219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 5603219089Spjd vdev_t *first = vd->vdev_child[0]; 5604219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5605185029Spjd 5606219089Spjd if (last->vdev_unspare) { 5607219089Spjd oldvd = first; 5608219089Spjd newvd = last; 5609219089Spjd } else if (first->vdev_unspare) { 5610219089Spjd oldvd = last; 5611219089Spjd newvd = first; 5612219089Spjd } else { 5613219089Spjd oldvd = NULL; 5614219089Spjd } 5615219089Spjd 5616219089Spjd if (oldvd != NULL && 5617209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 5618219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5619219089Spjd !vdev_dtl_required(oldvd)) 5620185029Spjd return (oldvd); 5621219089Spjd 5622219089Spjd /* 5623219089Spjd * If there are more than two spares attached to a disk, 5624219089Spjd * and those spares are not required, then we want to 5625219089Spjd * attempt to free them up now so that they can be used 5626219089Spjd * by other pools. Once we're back down to a single 5627219089Spjd * disk+spare, we stop removing them. 
5628219089Spjd */ 5629219089Spjd if (vd->vdev_children > 2) { 5630219089Spjd newvd = vd->vdev_child[1]; 5631219089Spjd 5632219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 5633219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 5634219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 5635219089Spjd !vdev_dtl_required(newvd)) 5636219089Spjd return (newvd); 5637185029Spjd } 5638185029Spjd } 5639185029Spjd 5640168404Spjd return (NULL); 5641168404Spjd} 5642168404Spjd 5643168404Spjdstatic void 5644185029Spjdspa_vdev_resilver_done(spa_t *spa) 5645168404Spjd{ 5646209962Smm vdev_t *vd, *pvd, *ppvd; 5647209962Smm uint64_t guid, sguid, pguid, ppguid; 5648168404Spjd 5649209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5650168404Spjd 5651185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5652209962Smm pvd = vd->vdev_parent; 5653209962Smm ppvd = pvd->vdev_parent; 5654168404Spjd guid = vd->vdev_guid; 5655209962Smm pguid = pvd->vdev_guid; 5656209962Smm ppguid = ppvd->vdev_guid; 5657209962Smm sguid = 0; 5658168404Spjd /* 5659168404Spjd * If we have just finished replacing a hot spared device, then 5660168404Spjd * we need to detach the parent's first child (the original hot 5661168404Spjd * spare) as well. 
5662168404Spjd */ 5663219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5664219089Spjd ppvd->vdev_children == 2) { 5665168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5666209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 5667168404Spjd } 5668254112Sdelphij ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5669254112Sdelphij 5670209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5671209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5672168404Spjd return; 5673209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5674168404Spjd return; 5675209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5676168404Spjd } 5677168404Spjd 5678209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5679168404Spjd} 5680168404Spjd 5681168404Spjd/* 5682219089Spjd * Update the stored path or FRU for this vdev. 5683168404Spjd */ 5684168404Spjdint 5685209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5686209962Smm boolean_t ispath) 5687168404Spjd{ 5688185029Spjd vdev_t *vd; 5689219089Spjd boolean_t sync = B_FALSE; 5690168404Spjd 5691219089Spjd ASSERT(spa_writeable(spa)); 5692168404Spjd 5693219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 5694219089Spjd 5695209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5696219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5697168404Spjd 5698168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 5699219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5700168404Spjd 5701209962Smm if (ispath) { 5702219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 5703219089Spjd spa_strfree(vd->vdev_path); 5704219089Spjd vd->vdev_path = spa_strdup(value); 5705219089Spjd sync = B_TRUE; 5706219089Spjd } 5707209962Smm } else { 5708219089Spjd if (vd->vdev_fru == NULL) { 5709219089Spjd vd->vdev_fru = spa_strdup(value); 5710219089Spjd sync = B_TRUE; 5711219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 5712209962Smm spa_strfree(vd->vdev_fru); 
5713219089Spjd vd->vdev_fru = spa_strdup(value); 5714219089Spjd sync = B_TRUE; 5715219089Spjd } 5716209962Smm } 5717168404Spjd 5718219089Spjd return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5719168404Spjd} 5720168404Spjd 5721209962Smmint 5722209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5723209962Smm{ 5724209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5725209962Smm} 5726209962Smm 5727209962Smmint 5728209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5729209962Smm{ 5730209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5731209962Smm} 5732209962Smm 5733168404Spjd/* 5734168404Spjd * ========================================================================== 5735219089Spjd * SPA Scanning 5736168404Spjd * ========================================================================== 5737168404Spjd */ 5738168404Spjd 5739168404Spjdint 5740219089Spjdspa_scan_stop(spa_t *spa) 5741168404Spjd{ 5742185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5743219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5744249195Smm return (SET_ERROR(EBUSY)); 5745219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 5746219089Spjd} 5747168404Spjd 5748219089Spjdint 5749219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 5750219089Spjd{ 5751219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5752219089Spjd 5753219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5754249195Smm return (SET_ERROR(ENOTSUP)); 5755168404Spjd 5756168404Spjd /* 5757185029Spjd * If a resilver was requested, but there is no DTL on a 5758185029Spjd * writeable leaf device, we have nothing to do. 
5759168404Spjd */ 5760219089Spjd if (func == POOL_SCAN_RESILVER && 5761185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5762185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5763168404Spjd return (0); 5764168404Spjd } 5765168404Spjd 5766219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 5767168404Spjd} 5768168404Spjd 5769168404Spjd/* 5770168404Spjd * ========================================================================== 5771168404Spjd * SPA async task processing 5772168404Spjd * ========================================================================== 5773168404Spjd */ 5774168404Spjd 5775168404Spjdstatic void 5776185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 5777168404Spjd{ 5778185029Spjd if (vd->vdev_remove_wanted) { 5779219089Spjd vd->vdev_remove_wanted = B_FALSE; 5780219089Spjd vd->vdev_delayed_close = B_FALSE; 5781185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5782209962Smm 5783209962Smm /* 5784209962Smm * We want to clear the stats, but we don't want to do a full 5785209962Smm * vdev_clear() as that will cause us to throw away 5786209962Smm * degraded/faulted state as well as attempt to reopen the 5787209962Smm * device, all of which is a waste. 
5788209962Smm */ 5789209962Smm vd->vdev_stat.vs_read_errors = 0; 5790209962Smm vd->vdev_stat.vs_write_errors = 0; 5791209962Smm vd->vdev_stat.vs_checksum_errors = 0; 5792209962Smm 5793185029Spjd vdev_state_dirty(vd->vdev_top); 5794185029Spjd } 5795168404Spjd 5796185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5797185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 5798185029Spjd} 5799168404Spjd 5800185029Spjdstatic void 5801185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 5802185029Spjd{ 5803185029Spjd if (vd->vdev_probe_wanted) { 5804219089Spjd vd->vdev_probe_wanted = B_FALSE; 5805185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 5806168404Spjd } 5807168404Spjd 5808185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5809185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 5810168404Spjd} 5811168404Spjd 5812168404Spjdstatic void 5813219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 5814219089Spjd{ 5815219089Spjd sysevent_id_t eid; 5816219089Spjd nvlist_t *attr; 5817219089Spjd char *physpath; 5818219089Spjd 5819219089Spjd if (!spa->spa_autoexpand) 5820219089Spjd return; 5821219089Spjd 5822219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5823219089Spjd vdev_t *cvd = vd->vdev_child[c]; 5824219089Spjd spa_async_autoexpand(spa, cvd); 5825219089Spjd } 5826219089Spjd 5827219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5828219089Spjd return; 5829219089Spjd 5830219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5831219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5832219089Spjd 5833219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5834219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5835219089Spjd 5836219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5837219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5838219089Spjd 5839219089Spjd nvlist_free(attr); 5840219089Spjd kmem_free(physpath, 
MAXPATHLEN); 5841219089Spjd} 5842219089Spjd 5843219089Spjdstatic void 5844168404Spjdspa_async_thread(void *arg) 5845168404Spjd{ 5846168404Spjd spa_t *spa = arg; 5847168404Spjd int tasks; 5848168404Spjd 5849168404Spjd ASSERT(spa->spa_sync_on); 5850168404Spjd 5851168404Spjd mutex_enter(&spa->spa_async_lock); 5852168404Spjd tasks = spa->spa_async_tasks; 5853253990Smav spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 5854168404Spjd mutex_exit(&spa->spa_async_lock); 5855168404Spjd 5856168404Spjd /* 5857168404Spjd * See if the config needs to be updated. 5858168404Spjd */ 5859168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5860219089Spjd uint64_t old_space, new_space; 5861219089Spjd 5862168404Spjd mutex_enter(&spa_namespace_lock); 5863219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 5864168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5865219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 5866168404Spjd mutex_exit(&spa_namespace_lock); 5867219089Spjd 5868219089Spjd /* 5869219089Spjd * If the pool grew as a result of the config update, 5870219089Spjd * then log an internal history event. 5871219089Spjd */ 5872219089Spjd if (new_space != old_space) { 5873248571Smm spa_history_log_internal(spa, "vdev online", NULL, 5874219089Spjd "pool '%s' size: %llu(+%llu)", 5875219089Spjd spa_name(spa), new_space, new_space - old_space); 5876219089Spjd } 5877168404Spjd } 5878168404Spjd 5879219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5880219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5881219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 5882219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 5883219089Spjd } 5884219089Spjd 5885168404Spjd /* 5886185029Spjd * See if any devices need to be probed. 
5887168404Spjd */ 5888185029Spjd if (tasks & SPA_ASYNC_PROBE) { 5889219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 5890185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 5891185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 5892185029Spjd } 5893168404Spjd 5894168404Spjd /* 5895185029Spjd * If any devices are done replacing, detach them. 5896168404Spjd */ 5897185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 5898185029Spjd spa_vdev_resilver_done(spa); 5899168404Spjd 5900168404Spjd /* 5901168404Spjd * Kick off a resilver. 5902168404Spjd */ 5903168404Spjd if (tasks & SPA_ASYNC_RESILVER) 5904219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 5905168404Spjd 5906168404Spjd /* 5907168404Spjd * Let the world know that we're done. 5908168404Spjd */ 5909168404Spjd mutex_enter(&spa->spa_async_lock); 5910168404Spjd spa->spa_async_thread = NULL; 5911168404Spjd cv_broadcast(&spa->spa_async_cv); 5912168404Spjd mutex_exit(&spa->spa_async_lock); 5913168404Spjd thread_exit(); 5914168404Spjd} 5915168404Spjd 5916253990Smavstatic void 5917253990Smavspa_async_thread_vd(void *arg) 5918253990Smav{ 5919253990Smav spa_t *spa = arg; 5920253990Smav int tasks; 5921253990Smav 5922253990Smav ASSERT(spa->spa_sync_on); 5923253990Smav 5924253990Smav mutex_enter(&spa->spa_async_lock); 5925253990Smav tasks = spa->spa_async_tasks; 5926253990Smavretry: 5927253990Smav spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 5928253990Smav mutex_exit(&spa->spa_async_lock); 5929253990Smav 5930253990Smav /* 5931253990Smav * See if any devices need to be marked REMOVED. 
5932253990Smav */ 5933253990Smav if (tasks & SPA_ASYNC_REMOVE) { 5934253990Smav spa_vdev_state_enter(spa, SCL_NONE); 5935253990Smav spa_async_remove(spa, spa->spa_root_vdev); 5936253990Smav for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5937253990Smav spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5938253990Smav for (int i = 0; i < spa->spa_spares.sav_count; i++) 5939253990Smav spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5940253990Smav (void) spa_vdev_state_exit(spa, NULL, 0); 5941253990Smav } 5942253990Smav 5943253990Smav /* 5944253990Smav * Let the world know that we're done. 5945253990Smav */ 5946253990Smav mutex_enter(&spa->spa_async_lock); 5947253990Smav tasks = spa->spa_async_tasks; 5948253990Smav if ((tasks & SPA_ASYNC_REMOVE) != 0) 5949253990Smav goto retry; 5950253990Smav spa->spa_async_thread_vd = NULL; 5951253990Smav cv_broadcast(&spa->spa_async_cv); 5952253990Smav mutex_exit(&spa->spa_async_lock); 5953253990Smav thread_exit(); 5954253990Smav} 5955253990Smav 5956168404Spjdvoid 5957168404Spjdspa_async_suspend(spa_t *spa) 5958168404Spjd{ 5959168404Spjd mutex_enter(&spa->spa_async_lock); 5960168404Spjd spa->spa_async_suspended++; /* Block new dispatches, then wait until BOTH async threads (general and vdev-removal) have exited; either one may still be running, so the wait condition must be an OR. Each thread clears its pointer and broadcasts spa_async_cv on exit. */ 5961253990Smav while (spa->spa_async_thread != NULL || 5962253990Smav spa->spa_async_thread_vd != NULL) 5963168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5964168404Spjd mutex_exit(&spa->spa_async_lock); 5965168404Spjd} 5966168404Spjd 5967168404Spjdvoid 5968168404Spjdspa_async_resume(spa_t *spa) 5969168404Spjd{ 5970168404Spjd mutex_enter(&spa->spa_async_lock); 5971168404Spjd ASSERT(spa->spa_async_suspended != 0); 5972168404Spjd spa->spa_async_suspended--; 5973168404Spjd mutex_exit(&spa->spa_async_lock); 5974168404Spjd} 5975168404Spjd 5976251636Sdelphijstatic boolean_t 5977251636Sdelphijspa_async_tasks_pending(spa_t *spa) 5978251636Sdelphij{ 5979251636Sdelphij uint_t non_config_tasks; 5980251636Sdelphij uint_t config_task; 5981251636Sdelphij boolean_t config_task_suspended; 5982251636Sdelphij
5983253990Smav non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 5984253990Smav SPA_ASYNC_REMOVE); 5985251636Sdelphij config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 5986251636Sdelphij if (spa->spa_ccw_fail_time == 0) { 5987251636Sdelphij config_task_suspended = B_FALSE; 5988251636Sdelphij } else { 5989251636Sdelphij config_task_suspended = 5990251636Sdelphij (gethrtime() - spa->spa_ccw_fail_time) < 5991251636Sdelphij (zfs_ccw_retry_interval * NANOSEC); 5992251636Sdelphij } 5993251636Sdelphij 5994251636Sdelphij return (non_config_tasks || (config_task && !config_task_suspended)); 5995251636Sdelphij} 5996251636Sdelphij 5997168404Spjdstatic void 5998168404Spjdspa_async_dispatch(spa_t *spa) 5999168404Spjd{ 6000168404Spjd mutex_enter(&spa->spa_async_lock); 6001251636Sdelphij if (spa_async_tasks_pending(spa) && 6002251636Sdelphij !spa->spa_async_suspended && 6003168404Spjd spa->spa_async_thread == NULL && 6004251636Sdelphij rootdir != NULL) 6005168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 6006168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 6007168404Spjd mutex_exit(&spa->spa_async_lock); 6008168404Spjd} 6009168404Spjd 6010253990Smavstatic void 6011253990Smavspa_async_dispatch_vd(spa_t *spa) 6012253990Smav{ 6013253990Smav mutex_enter(&spa->spa_async_lock); 6014253990Smav if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 6015253990Smav !spa->spa_async_suspended && 6016253990Smav spa->spa_async_thread_vd == NULL && 6017253990Smav rootdir != NULL) 6018253990Smav spa->spa_async_thread_vd = thread_create(NULL, 0, 6019253990Smav spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); 6020253990Smav mutex_exit(&spa->spa_async_lock); 6021253990Smav} 6022253990Smav 6023168404Spjdvoid 6024168404Spjdspa_async_request(spa_t *spa, int task) 6025168404Spjd{ 6026219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 6027168404Spjd mutex_enter(&spa->spa_async_lock); 6028168404Spjd 
spa->spa_async_tasks |= task; 6029168404Spjd mutex_exit(&spa->spa_async_lock); 6030253990Smav spa_async_dispatch_vd(spa); 6031168404Spjd} 6032168404Spjd 6033168404Spjd/* 6034168404Spjd * ========================================================================== 6035168404Spjd * SPA syncing routines 6036168404Spjd * ========================================================================== 6037168404Spjd */ 6038168404Spjd 6039219089Spjdstatic int 6040219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6041168404Spjd{ 6042219089Spjd bpobj_t *bpo = arg; 6043219089Spjd bpobj_enqueue(bpo, bp, tx); 6044219089Spjd return (0); 6045219089Spjd} 6046168404Spjd 6047219089Spjdstatic int 6048219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6049219089Spjd{ 6050219089Spjd zio_t *zio = arg; 6051168404Spjd 6052219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 6053240868Spjd BP_GET_PSIZE(bp), zio->io_flags)); 6054219089Spjd return (0); 6055168404Spjd} 6056168404Spjd 6057168404Spjdstatic void 6058168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6059168404Spjd{ 6060168404Spjd char *packed = NULL; 6061185029Spjd size_t bufsize; 6062168404Spjd size_t nvsize = 0; 6063168404Spjd dmu_buf_t *db; 6064168404Spjd 6065168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6066168404Spjd 6067185029Spjd /* 6068185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6069185029Spjd * information. This avoids the dbuf_will_dirty() path and 6070185029Spjd * saves us a pre-read to get data we don't actually care about. 
6071185029Spjd */ 6072236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6073185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 6074168404Spjd 6075168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6076168404Spjd KM_SLEEP) == 0); 6077185029Spjd bzero(packed + nvsize, bufsize - nvsize); 6078168404Spjd 6079185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6080168404Spjd 6081185029Spjd kmem_free(packed, bufsize); 6082168404Spjd 6083168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6084168404Spjd dmu_buf_will_dirty(db, tx); 6085168404Spjd *(uint64_t *)db->db_data = nvsize; 6086168404Spjd dmu_buf_rele(db, FTAG); 6087168404Spjd} 6088168404Spjd 6089168404Spjdstatic void 6090185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6091185029Spjd const char *config, const char *entry) 6092168404Spjd{ 6093168404Spjd nvlist_t *nvroot; 6094185029Spjd nvlist_t **list; 6095168404Spjd int i; 6096168404Spjd 6097185029Spjd if (!sav->sav_sync) 6098168404Spjd return; 6099168404Spjd 6100168404Spjd /* 6101185029Spjd * Update the MOS nvlist describing the list of available devices. 6102185029Spjd * spa_validate_aux() will have already made sure this nvlist is 6103185029Spjd * valid and the vdevs are labeled appropriately. 
6104168404Spjd */ 6105185029Spjd if (sav->sav_object == 0) { 6106185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6107185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6108185029Spjd sizeof (uint64_t), tx); 6109168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 6110185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6111185029Spjd &sav->sav_object, tx) == 0); 6112168404Spjd } 6113168404Spjd 6114168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6115185029Spjd if (sav->sav_count == 0) { 6116185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6117168404Spjd } else { 6118185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6119185029Spjd for (i = 0; i < sav->sav_count; i++) 6120185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6121219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 6122185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6123185029Spjd sav->sav_count) == 0); 6124185029Spjd for (i = 0; i < sav->sav_count; i++) 6125185029Spjd nvlist_free(list[i]); 6126185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 6127168404Spjd } 6128168404Spjd 6129185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6130168404Spjd nvlist_free(nvroot); 6131168404Spjd 6132185029Spjd sav->sav_sync = B_FALSE; 6133168404Spjd} 6134168404Spjd 6135168404Spjdstatic void 6136168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6137168404Spjd{ 6138168404Spjd nvlist_t *config; 6139168404Spjd 6140185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) 6141168404Spjd return; 6142168404Spjd 6143185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6144168404Spjd 6145185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 6146185029Spjd dmu_tx_get_txg(tx), B_FALSE); 6147185029Spjd 6148243505Smm /* 6149243505Smm * If we're upgrading the spa version then make sure that 6150243505Smm * the config object gets updated with 
the correct version. 6151243505Smm */ 6152243505Smm if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6153243505Smm fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6154243505Smm spa->spa_uberblock.ub_version); 6155243505Smm 6156185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6157185029Spjd 6158168404Spjd if (spa->spa_config_syncing) 6159168404Spjd nvlist_free(spa->spa_config_syncing); 6160168404Spjd spa->spa_config_syncing = config; 6161168404Spjd 6162168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6163168404Spjd} 6164168404Spjd 6165236884Smmstatic void 6166248571Smmspa_sync_version(void *arg, dmu_tx_t *tx) 6167236884Smm{ 6168248571Smm uint64_t *versionp = arg; 6169248571Smm uint64_t version = *versionp; 6170248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6171236884Smm 6172236884Smm /* 6173236884Smm * Setting the version is special cased when first creating the pool. 6174236884Smm */ 6175236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 6176236884Smm 6177247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6178236884Smm ASSERT(version >= spa_version(spa)); 6179236884Smm 6180236884Smm spa->spa_uberblock.ub_version = version; 6181236884Smm vdev_config_dirty(spa->spa_root_vdev); 6182248571Smm spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6183236884Smm} 6184236884Smm 6185185029Spjd/* 6186185029Spjd * Set zpool properties. 
6187185029Spjd */ 6188168404Spjdstatic void 6189248571Smmspa_sync_props(void *arg, dmu_tx_t *tx) 6190168404Spjd{ 6191248571Smm nvlist_t *nvp = arg; 6192248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6193185029Spjd objset_t *mos = spa->spa_meta_objset; 6194236884Smm nvpair_t *elem = NULL; 6195168404Spjd 6196168404Spjd mutex_enter(&spa->spa_props_lock); 6197168404Spjd 6198185029Spjd while ((elem = nvlist_next_nvpair(nvp, elem))) { 6199236884Smm uint64_t intval; 6200236884Smm char *strval, *fname; 6201236884Smm zpool_prop_t prop; 6202236884Smm const char *propname; 6203236884Smm zprop_type_t proptype; 6204236884Smm zfeature_info_t *feature; 6205236884Smm 6206185029Spjd switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6207236884Smm case ZPROP_INVAL: 6208236884Smm /* 6209236884Smm * We checked this earlier in spa_prop_validate(). 6210236884Smm */ 6211236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 6212236884Smm 6213236884Smm fname = strchr(nvpair_name(elem), '@') + 1; 6214236884Smm VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature)); 6215236884Smm 6216236884Smm spa_feature_enable(spa, feature, tx); 6217248571Smm spa_history_log_internal(spa, "set", tx, 6218248571Smm "%s=enabled", nvpair_name(elem)); 6219236884Smm break; 6220236884Smm 6221185029Spjd case ZPOOL_PROP_VERSION: 6222236884Smm VERIFY(nvpair_value_uint64(elem, &intval) == 0); 6223185029Spjd /* 6224236884Smm * The version is synced seperatly before other 6225236884Smm * properties and should be correct by now. 6226185029Spjd */ 6227236884Smm ASSERT3U(spa_version(spa), >=, intval); 6228185029Spjd break; 6229168404Spjd 6230185029Spjd case ZPOOL_PROP_ALTROOT: 6231185029Spjd /* 6232185029Spjd * 'altroot' is a non-persistent property. It should 6233185029Spjd * have been set temporarily at creation or import time. 
6234185029Spjd */ 6235185029Spjd ASSERT(spa->spa_root != NULL); 6236185029Spjd break; 6237168404Spjd 6238219089Spjd case ZPOOL_PROP_READONLY: 6239185029Spjd case ZPOOL_PROP_CACHEFILE: 6240185029Spjd /* 6241219089Spjd * 'readonly' and 'cachefile' are also non-persisitent 6242219089Spjd * properties. 6243185029Spjd */ 6244168404Spjd break; 6245228103Smm case ZPOOL_PROP_COMMENT: 6246228103Smm VERIFY(nvpair_value_string(elem, &strval) == 0); 6247228103Smm if (spa->spa_comment != NULL) 6248228103Smm spa_strfree(spa->spa_comment); 6249228103Smm spa->spa_comment = spa_strdup(strval); 6250228103Smm /* 6251228103Smm * We need to dirty the configuration on all the vdevs 6252228103Smm * so that their labels get updated. It's unnecessary 6253228103Smm * to do this for pool creation since the vdev's 6254228103Smm * configuratoin has already been dirtied. 6255228103Smm */ 6256228103Smm if (tx->tx_txg != TXG_INITIAL) 6257228103Smm vdev_config_dirty(spa->spa_root_vdev); 6258248571Smm spa_history_log_internal(spa, "set", tx, 6259248571Smm "%s=%s", nvpair_name(elem), strval); 6260228103Smm break; 6261185029Spjd default: 6262185029Spjd /* 6263185029Spjd * Set pool property values in the poolprops mos object. 
6264185029Spjd */ 6265185029Spjd if (spa->spa_pool_props_object == 0) { 6266236884Smm spa->spa_pool_props_object = 6267236884Smm zap_create_link(mos, DMU_OT_POOL_PROPS, 6268185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6269236884Smm tx); 6270185029Spjd } 6271185029Spjd 6272185029Spjd /* normalize the property name */ 6273185029Spjd propname = zpool_prop_to_name(prop); 6274185029Spjd proptype = zpool_prop_get_type(prop); 6275185029Spjd 6276185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 6277185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 6278185029Spjd VERIFY(nvpair_value_string(elem, &strval) == 0); 6279185029Spjd VERIFY(zap_update(mos, 6280185029Spjd spa->spa_pool_props_object, propname, 6281185029Spjd 1, strlen(strval) + 1, strval, tx) == 0); 6282248571Smm spa_history_log_internal(spa, "set", tx, 6283248571Smm "%s=%s", nvpair_name(elem), strval); 6284185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6285185029Spjd VERIFY(nvpair_value_uint64(elem, &intval) == 0); 6286185029Spjd 6287185029Spjd if (proptype == PROP_TYPE_INDEX) { 6288185029Spjd const char *unused; 6289185029Spjd VERIFY(zpool_prop_index_to_string( 6290185029Spjd prop, intval, &unused) == 0); 6291185029Spjd } 6292185029Spjd VERIFY(zap_update(mos, 6293185029Spjd spa->spa_pool_props_object, propname, 6294185029Spjd 8, 1, &intval, tx) == 0); 6295248571Smm spa_history_log_internal(spa, "set", tx, 6296248571Smm "%s=%lld", nvpair_name(elem), intval); 6297185029Spjd } else { 6298185029Spjd ASSERT(0); /* not allowed */ 6299185029Spjd } 6300185029Spjd 6301185029Spjd switch (prop) { 6302185029Spjd case ZPOOL_PROP_DELEGATION: 6303185029Spjd spa->spa_delegation = intval; 6304185029Spjd break; 6305185029Spjd case ZPOOL_PROP_BOOTFS: 6306185029Spjd spa->spa_bootfs = intval; 6307185029Spjd break; 6308185029Spjd case ZPOOL_PROP_FAILUREMODE: 6309185029Spjd spa->spa_failmode = intval; 6310185029Spjd break; 6311219089Spjd case ZPOOL_PROP_AUTOEXPAND: 6312219089Spjd spa->spa_autoexpand = 
intval; 6313219089Spjd if (tx->tx_txg != TXG_INITIAL) 6314219089Spjd spa_async_request(spa, 6315219089Spjd SPA_ASYNC_AUTOEXPAND); 6316219089Spjd break; 6317219089Spjd case ZPOOL_PROP_DEDUPDITTO: 6318219089Spjd spa->spa_dedup_ditto = intval; 6319219089Spjd break; 6320185029Spjd default: 6321185029Spjd break; 6322185029Spjd } 6323168404Spjd } 6324185029Spjd 6325168404Spjd } 6326185029Spjd 6327185029Spjd mutex_exit(&spa->spa_props_lock); 6328168404Spjd} 6329168404Spjd 6330168404Spjd/* 6331219089Spjd * Perform one-time upgrade on-disk changes. spa_version() does not 6332219089Spjd * reflect the new version this txg, so there must be no changes this 6333219089Spjd * txg to anything that the upgrade code depends on after it executes. 6334219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync 6335219089Spjd * tasks. 6336219089Spjd */ 6337219089Spjdstatic void 6338219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6339219089Spjd{ 6340219089Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6341219089Spjd 6342219089Spjd ASSERT(spa->spa_sync_pass == 1); 6343219089Spjd 6344248571Smm rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 6345248571Smm 6346219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6347219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6348219089Spjd dsl_pool_create_origin(dp, tx); 6349219089Spjd 6350219089Spjd /* Keeping the origin open increases spa_minref */ 6351219089Spjd spa->spa_minref += 3; 6352219089Spjd } 6353219089Spjd 6354219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6355219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6356219089Spjd dsl_pool_upgrade_clones(dp, tx); 6357219089Spjd } 6358219089Spjd 6359219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6360219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6361219089Spjd dsl_pool_upgrade_dir_clones(dp, tx); 6362219089Spjd 6363219089Spjd /* Keeping the freedir open 
increases spa_minref */ 6364219089Spjd spa->spa_minref += 3; 6365219089Spjd } 6366236884Smm 6367236884Smm if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6368236884Smm spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6369236884Smm spa_feature_create_zap_objects(spa, tx); 6370236884Smm } 6371248571Smm rrw_exit(&dp->dp_config_rwlock, FTAG); 6372219089Spjd} 6373219089Spjd 6374219089Spjd/* 6375168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 6376168404Spjd * part of the process, so we iterate until it converges. 6377168404Spjd */ 6378168404Spjdvoid 6379168404Spjdspa_sync(spa_t *spa, uint64_t txg) 6380168404Spjd{ 6381168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6382168404Spjd objset_t *mos = spa->spa_meta_objset; 6383219089Spjd bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; 6384219089Spjd bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6385168404Spjd vdev_t *rvd = spa->spa_root_vdev; 6386168404Spjd vdev_t *vd; 6387168404Spjd dmu_tx_t *tx; 6388185029Spjd int error; 6389168404Spjd 6390219089Spjd VERIFY(spa_writeable(spa)); 6391219089Spjd 6392168404Spjd /* 6393168404Spjd * Lock out configuration changes. 6394168404Spjd */ 6395185029Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6396168404Spjd 6397168404Spjd spa->spa_syncing_txg = txg; 6398168404Spjd spa->spa_sync_pass = 0; 6399168404Spjd 6400185029Spjd /* 6401185029Spjd * If there are any pending vdev state changes, convert them 6402185029Spjd * into config changes that go out with this transaction group. 6403185029Spjd */ 6404185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6405209962Smm while (list_head(&spa->spa_state_dirty_list) != NULL) { 6406209962Smm /* 6407209962Smm * We need the write lock here because, for aux vdevs, 6408209962Smm * calling vdev_config_dirty() modifies sav_config. 
6409209962Smm * This is ugly and will become unnecessary when we 6410209962Smm * eliminate the aux vdev wart by integrating all vdevs 6411209962Smm * into the root vdev tree. 6412209962Smm */ 6413209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6414209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6415209962Smm while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6416209962Smm vdev_state_clean(vd); 6417209962Smm vdev_config_dirty(vd); 6418209962Smm } 6419209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6420209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6421185029Spjd } 6422185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6423185029Spjd 6424168404Spjd tx = dmu_tx_create_assigned(dp, txg); 6425168404Spjd 6426247265Smm spa->spa_sync_starttime = gethrtime(); 6427247265Smm#ifdef illumos 6428247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6429247265Smm spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6430247265Smm#else /* FreeBSD */ 6431247265Smm#ifdef _KERNEL 6432247265Smm callout_reset(&spa->spa_deadman_cycid, 6433247265Smm hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa); 6434247265Smm#endif 6435247265Smm#endif 6436247265Smm 6437168404Spjd /* 6438185029Spjd * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6439168404Spjd * set spa_deflate if we have no raid-z vdevs. 
6440168404Spjd */ 6441185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6442185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6443168404Spjd int i; 6444168404Spjd 6445168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 6446168404Spjd vd = rvd->vdev_child[i]; 6447168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6448168404Spjd break; 6449168404Spjd } 6450168404Spjd if (i == rvd->vdev_children) { 6451168404Spjd spa->spa_deflate = TRUE; 6452168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 6453168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6454168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6455168404Spjd } 6456168404Spjd } 6457168404Spjd 6458168404Spjd /* 6459219089Spjd * If anything has changed in this txg, or if someone is waiting 6460219089Spjd * for this txg to sync (eg, spa_vdev_remove()), push the 6461219089Spjd * deferred frees from the previous txg. If not, leave them 6462219089Spjd * alone so that we don't generate work on an otherwise idle 6463219089Spjd * system. 6464168404Spjd */ 6465168404Spjd if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 6466168404Spjd !txg_list_empty(&dp->dp_dirty_dirs, txg) || 6467219089Spjd !txg_list_empty(&dp->dp_sync_tasks, txg) || 6468219089Spjd ((dsl_scan_active(dp->dp_scan) || 6469219089Spjd txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { 6470219089Spjd zio_t *zio = zio_root(spa, NULL, NULL, 0); 6471219089Spjd VERIFY3U(bpobj_iterate(defer_bpo, 6472219089Spjd spa_free_sync_cb, zio, tx), ==, 0); 6473240415Smm VERIFY0(zio_wait(zio)); 6474219089Spjd } 6475168404Spjd 6476168404Spjd /* 6477168404Spjd * Iterate to convergence. 
6478168404Spjd */ 6479168404Spjd do { 6480219089Spjd int pass = ++spa->spa_sync_pass; 6481168404Spjd 6482168404Spjd spa_sync_config_object(spa, tx); 6483185029Spjd spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6484185029Spjd ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6485185029Spjd spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6486185029Spjd ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6487168404Spjd spa_errlog_sync(spa, txg); 6488168404Spjd dsl_pool_sync(dp, txg); 6489168404Spjd 6490243503Smm if (pass < zfs_sync_pass_deferred_free) { 6491219089Spjd zio_t *zio = zio_root(spa, NULL, NULL, 0); 6492219089Spjd bplist_iterate(free_bpl, spa_free_sync_cb, 6493219089Spjd zio, tx); 6494219089Spjd VERIFY(zio_wait(zio) == 0); 6495219089Spjd } else { 6496219089Spjd bplist_iterate(free_bpl, bpobj_enqueue_cb, 6497219089Spjd defer_bpo, tx); 6498168404Spjd } 6499168404Spjd 6500219089Spjd ddt_sync(spa, txg); 6501219089Spjd dsl_scan_sync(dp, tx); 6502168404Spjd 6503219089Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6504219089Spjd vdev_sync(vd, txg); 6505168404Spjd 6506219089Spjd if (pass == 1) 6507219089Spjd spa_sync_upgrades(spa, tx); 6508168404Spjd 6509219089Spjd } while (dmu_objset_is_dirty(mos, txg)); 6510219089Spjd 6511168404Spjd /* 6512168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 6513168404Spjd * to commit the transaction group. 6514168404Spjd * 6515185029Spjd * If there are no dirty vdevs, we sync the uberblock to a few 6516185029Spjd * random top-level vdevs that are known to be visible in the 6517185029Spjd * config cache (see spa_vdev_add() for a complete description). 6518185029Spjd * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6519168404Spjd */ 6520185029Spjd for (;;) { 6521185029Spjd /* 6522185029Spjd * We hold SCL_STATE to prevent vdev open/close/etc. 6523185029Spjd * while we're attempting to write the vdev labels. 
6524185029Spjd */ 6525185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6526168404Spjd 6527185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) { 6528185029Spjd vdev_t *svd[SPA_DVAS_PER_BP]; 6529185029Spjd int svdcount = 0; 6530185029Spjd int children = rvd->vdev_children; 6531185029Spjd int c0 = spa_get_random(children); 6532185029Spjd 6533219089Spjd for (int c = 0; c < children; c++) { 6534185029Spjd vd = rvd->vdev_child[(c0 + c) % children]; 6535185029Spjd if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6536185029Spjd continue; 6537185029Spjd svd[svdcount++] = vd; 6538185029Spjd if (svdcount == SPA_DVAS_PER_BP) 6539185029Spjd break; 6540185029Spjd } 6541213198Smm error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 6542213198Smm if (error != 0) 6543213198Smm error = vdev_config_sync(svd, svdcount, txg, 6544213198Smm B_TRUE); 6545185029Spjd } else { 6546185029Spjd error = vdev_config_sync(rvd->vdev_child, 6547213198Smm rvd->vdev_children, txg, B_FALSE); 6548213198Smm if (error != 0) 6549213198Smm error = vdev_config_sync(rvd->vdev_child, 6550213198Smm rvd->vdev_children, txg, B_TRUE); 6551168404Spjd } 6552185029Spjd 6553239620Smm if (error == 0) 6554239620Smm spa->spa_last_synced_guid = rvd->vdev_guid; 6555239620Smm 6556185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6557185029Spjd 6558185029Spjd if (error == 0) 6559185029Spjd break; 6560185029Spjd zio_suspend(spa, NULL); 6561185029Spjd zio_resume_wait(spa); 6562168404Spjd } 6563168404Spjd dmu_tx_commit(tx); 6564168404Spjd 6565247265Smm#ifdef illumos 6566247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6567247265Smm#else /* FreeBSD */ 6568247265Smm#ifdef _KERNEL 6569247265Smm callout_drain(&spa->spa_deadman_cycid); 6570247265Smm#endif 6571247265Smm#endif 6572247265Smm 6573168404Spjd /* 6574168404Spjd * Clear the dirty config list. 
6575168404Spjd */ 6576185029Spjd while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6577168404Spjd vdev_config_clean(vd); 6578168404Spjd 6579168404Spjd /* 6580168404Spjd * Now that the new config has synced transactionally, 6581168404Spjd * let it become visible to the config cache. 6582168404Spjd */ 6583168404Spjd if (spa->spa_config_syncing != NULL) { 6584168404Spjd spa_config_set(spa, spa->spa_config_syncing); 6585168404Spjd spa->spa_config_txg = txg; 6586168404Spjd spa->spa_config_syncing = NULL; 6587168404Spjd } 6588168404Spjd 6589168404Spjd spa->spa_ubsync = spa->spa_uberblock; 6590168404Spjd 6591219089Spjd dsl_pool_sync_done(dp, txg); 6592168404Spjd 6593168404Spjd /* 6594168404Spjd * Update usable space statistics. 6595168404Spjd */ 6596168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6597168404Spjd vdev_sync_done(vd, txg); 6598168404Spjd 6599219089Spjd spa_update_dspace(spa); 6600219089Spjd 6601168404Spjd /* 6602168404Spjd * It had better be the case that we didn't dirty anything 6603168404Spjd * since vdev_config_sync(). 6604168404Spjd */ 6605168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6606168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6607168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6608168404Spjd 6609219089Spjd spa->spa_sync_pass = 0; 6610219089Spjd 6611185029Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 6612168404Spjd 6613219089Spjd spa_handle_ignored_writes(spa); 6614219089Spjd 6615168404Spjd /* 6616168404Spjd * If any async tasks have been requested, kick them off. 6617168404Spjd */ 6618168404Spjd spa_async_dispatch(spa); 6619253990Smav spa_async_dispatch_vd(spa); 6620168404Spjd} 6621168404Spjd 6622168404Spjd/* 6623168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 6624168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 6625168404Spjd * sync. 
6626168404Spjd */ 6627168404Spjdvoid 6628168404Spjdspa_sync_allpools(void) 6629168404Spjd{ 6630168404Spjd spa_t *spa = NULL; 6631168404Spjd mutex_enter(&spa_namespace_lock); 6632168404Spjd while ((spa = spa_next(spa)) != NULL) { 6633219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 6634219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 6635168404Spjd continue; 6636168404Spjd spa_open_ref(spa, FTAG); 6637168404Spjd mutex_exit(&spa_namespace_lock); 6638168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6639168404Spjd mutex_enter(&spa_namespace_lock); 6640168404Spjd spa_close(spa, FTAG); 6641168404Spjd } 6642168404Spjd mutex_exit(&spa_namespace_lock); 6643168404Spjd} 6644168404Spjd 6645168404Spjd/* 6646168404Spjd * ========================================================================== 6647168404Spjd * Miscellaneous routines 6648168404Spjd * ========================================================================== 6649168404Spjd */ 6650168404Spjd 6651168404Spjd/* 6652168404Spjd * Remove all pools in the system. 6653168404Spjd */ 6654168404Spjdvoid 6655168404Spjdspa_evict_all(void) 6656168404Spjd{ 6657168404Spjd spa_t *spa; 6658168404Spjd 6659168404Spjd /* 6660168404Spjd * Remove all cached state. All pools should be closed now, 6661168404Spjd * so every spa in the AVL tree should be unreferenced. 6662168404Spjd */ 6663168404Spjd mutex_enter(&spa_namespace_lock); 6664168404Spjd while ((spa = spa_next(NULL)) != NULL) { 6665168404Spjd /* 6666168404Spjd * Stop async tasks. The async thread may need to detach 6667168404Spjd * a device that's been replaced, which requires grabbing 6668168404Spjd * spa_namespace_lock, so we must drop it here. 
6669168404Spjd */ 6670168404Spjd spa_open_ref(spa, FTAG); 6671168404Spjd mutex_exit(&spa_namespace_lock); 6672168404Spjd spa_async_suspend(spa); 6673168404Spjd mutex_enter(&spa_namespace_lock); 6674168404Spjd spa_close(spa, FTAG); 6675168404Spjd 6676168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6677168404Spjd spa_unload(spa); 6678168404Spjd spa_deactivate(spa); 6679168404Spjd } 6680168404Spjd spa_remove(spa); 6681168404Spjd } 6682168404Spjd mutex_exit(&spa_namespace_lock); 6683168404Spjd} 6684168404Spjd 6685168404Spjdvdev_t * 6686209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6687168404Spjd{ 6688185029Spjd vdev_t *vd; 6689185029Spjd int i; 6690185029Spjd 6691185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6692185029Spjd return (vd); 6693185029Spjd 6694209962Smm if (aux) { 6695185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6696185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 6697185029Spjd if (vd->vdev_guid == guid) 6698185029Spjd return (vd); 6699185029Spjd } 6700209962Smm 6701209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 6702209962Smm vd = spa->spa_spares.sav_vdevs[i]; 6703209962Smm if (vd->vdev_guid == guid) 6704209962Smm return (vd); 6705209962Smm } 6706185029Spjd } 6707185029Spjd 6708185029Spjd return (NULL); 6709168404Spjd} 6710168404Spjd 6711168404Spjdvoid 6712185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 6713168404Spjd{ 6714219089Spjd ASSERT(spa_writeable(spa)); 6715219089Spjd 6716185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6717168404Spjd 6718168404Spjd /* 6719168404Spjd * This should only be called for a non-faulted pool, and since a 6720168404Spjd * future version would result in an unopenable pool, this shouldn't be 6721168404Spjd * possible. 
6722168404Spjd */ 6723247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 6724185029Spjd ASSERT(version >= spa->spa_uberblock.ub_version); 6725168404Spjd 6726185029Spjd spa->spa_uberblock.ub_version = version; 6727168404Spjd vdev_config_dirty(spa->spa_root_vdev); 6728168404Spjd 6729185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 6730168404Spjd 6731168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6732168404Spjd} 6733168404Spjd 6734168404Spjdboolean_t 6735168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 6736168404Spjd{ 6737168404Spjd int i; 6738168404Spjd uint64_t spareguid; 6739185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6740168404Spjd 6741185029Spjd for (i = 0; i < sav->sav_count; i++) 6742185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 6743168404Spjd return (B_TRUE); 6744168404Spjd 6745185029Spjd for (i = 0; i < sav->sav_npending; i++) { 6746185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6747185029Spjd &spareguid) == 0 && spareguid == guid) 6748168404Spjd return (B_TRUE); 6749168404Spjd } 6750168404Spjd 6751168404Spjd return (B_FALSE); 6752168404Spjd} 6753168404Spjd 6754185029Spjd/* 6755185029Spjd * Check if a pool has an active shared spare device. 
6756185029Spjd * Note: reference count of an active spare is 2, as a spare and as a replace 6757185029Spjd */ 6758185029Spjdstatic boolean_t 6759185029Spjdspa_has_active_shared_spare(spa_t *spa) 6760168404Spjd{ 6761185029Spjd int i, refcnt; 6762185029Spjd uint64_t pool; 6763185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6764185029Spjd 6765185029Spjd for (i = 0; i < sav->sav_count; i++) { 6766185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6767185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6768185029Spjd refcnt > 2) 6769185029Spjd return (B_TRUE); 6770185029Spjd } 6771185029Spjd 6772185029Spjd return (B_FALSE); 6773168404Spjd} 6774168404Spjd 6775185029Spjd/* 6776185029Spjd * Post a sysevent corresponding to the given event. The 'name' must be one of 6777185029Spjd * the event definitions in sys/sysevent/eventdefs.h. The payload will be 6778185029Spjd * filled in from the spa and (optionally) the vdev. This doesn't do anything 6779185029Spjd * in the userland libzpool, as we don't want consumers to misinterpret ztest 6780185029Spjd * or zdb as real changes. 
6781185029Spjd */ 6782185029Spjdvoid 6783185029Spjdspa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 6784168404Spjd{ 6785185029Spjd#ifdef _KERNEL 6786185029Spjd sysevent_t *ev; 6787185029Spjd sysevent_attr_list_t *attr = NULL; 6788185029Spjd sysevent_value_t value; 6789185029Spjd sysevent_id_t eid; 6790168404Spjd 6791185029Spjd ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 6792185029Spjd SE_SLEEP); 6793168404Spjd 6794185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6795185029Spjd value.value.sv_string = spa_name(spa); 6796185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 6797185029Spjd goto done; 6798168404Spjd 6799185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6800185029Spjd value.value.sv_uint64 = spa_guid(spa); 6801185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 6802185029Spjd goto done; 6803168404Spjd 6804185029Spjd if (vd) { 6805185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6806185029Spjd value.value.sv_uint64 = vd->vdev_guid; 6807185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 6808185029Spjd SE_SLEEP) != 0) 6809185029Spjd goto done; 6810168404Spjd 6811185029Spjd if (vd->vdev_path) { 6812185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6813185029Spjd value.value.sv_string = vd->vdev_path; 6814185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6815185029Spjd &value, SE_SLEEP) != 0) 6816185029Spjd goto done; 6817168404Spjd } 6818168404Spjd } 6819168404Spjd 6820185029Spjd if (sysevent_attach_attributes(ev, attr) != 0) 6821185029Spjd goto done; 6822185029Spjd attr = NULL; 6823168404Spjd 6824185029Spjd (void) log_sysevent(ev, SE_SLEEP, &eid); 6825185029Spjd 6826185029Spjddone: 6827185029Spjd if (attr) 6828185029Spjd sysevent_free_attr(attr); 6829185029Spjd sysevent_free(ev); 6830185029Spjd#endif 6831168404Spjd} 6832