spa.c revision 268473
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
    "Check hostid on import?");

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
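 *
 * For example, in the table below ZTI_P(12, 8) for the READ interrupt
 * taskqs creates 8 discrete taskqs of 12 threads each, and each dispatch
 * picks one of those 8 taskqs at random.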
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
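 * These are the values (size, allocated space, capacity, health, and so
 * on) that are computed from in-core state rather than read back from
 * the MOS pool property object.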
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size;
	uint64_t alloc;
	uint64_t space;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		space = 0;
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			space += tvd->vdev_max_asize - tvd->vdev_asize;
		}
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
		    src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools created before this version, freedir
		 * will be NULL.
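		 * In that case we report a FREEING value of 0 below.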
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    pool->dp_free_dir->dd_phys->dd_used_bytes, src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
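	 * za_integer_length tells us how each attribute is encoded:
	 * 8 bytes per integer for numeric properties, 1 byte per
	 * "integer" for strings.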
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
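 * In particular, a bootfs value given as a dataset name is replaced
 * with the dataset's object number before the list is synced out.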
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/* Must be ZPL and not gzip compressed. */
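				/* (Boot code historically lacks gzip.) */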

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check. For this kernel check, we merely
				 * check ASCII apart from DEL. Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
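			 * spa_sync_version() below performs the actual
			 * upgrade in syncing context.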
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
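 * This is the in-kernel side of "zpool reguid".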
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
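 * The caller takes ownership of the copied trees and is responsible for
 * emptying and destroying them.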
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

#ifdef SYSDC
		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
#endif
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly lower priority
			 * than the other taskqs.
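			 * Here that means maxclsyspri - 1 rather than
			 * maxclsyspri.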
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
				pri--;

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
#ifdef SYSDC
		}
#endif

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT0(tqs->stqs_count);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose a taskq at random using the
 * low bits of a high-resolution counter (cpu_ticks() in the kernel,
 * gethrtime() otherwise).
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
#ifdef _KERNEL
		tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
#else
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}
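
/*
 * Illustrative example only: zio.c dispatches a zio to its issue taskq
 * roughly as
 *
 *	spa_taskq_dispatch_ent(spa, zio->io_type, ZIO_TASKQ_ISSUE,
 *	    (task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
 *
 * choosing among the discrete taskqs as described above when
 * stqs_count > 1.
 */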

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
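	/* A "$import" (TRYIMPORT_NAME) pool is too short-lived for that. */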
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
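	 * (spa_errlog_drain() frees the queued entries without reporting
	 * them.)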
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
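 * Callers must hold SCL_ALL as writer, as asserted on entry.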
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
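				 * An already-active cache device keeps its
				 * state and is not re-opened.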
1514185029Spjd */ 1515185029Spjd newvdevs[i] = vd; 1516185029Spjd oldvdevs[j] = NULL; 1517185029Spjd break; 1518185029Spjd } 1519185029Spjd } 1520185029Spjd 1521185029Spjd if (newvdevs[i] == NULL) { 1522185029Spjd /* 1523185029Spjd * Create new vdev 1524185029Spjd */ 1525185029Spjd VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1526185029Spjd VDEV_ALLOC_L2CACHE) == 0); 1527185029Spjd ASSERT(vd != NULL); 1528185029Spjd newvdevs[i] = vd; 1529185029Spjd 1530185029Spjd /* 1531185029Spjd * Commit this vdev as an l2cache device, 1532185029Spjd * even if it fails to open. 1533185029Spjd */ 1534185029Spjd spa_l2cache_add(vd); 1535185029Spjd 1536185029Spjd vd->vdev_top = vd; 1537185029Spjd vd->vdev_aux = sav; 1538185029Spjd 1539185029Spjd spa_l2cache_activate(vd); 1540185029Spjd 1541185029Spjd if (vdev_open(vd) != 0) 1542185029Spjd continue; 1543185029Spjd 1544185029Spjd (void) vdev_validate_aux(vd); 1545185029Spjd 1546219089Spjd if (!vdev_is_dead(vd)) 1547219089Spjd l2arc_add_vdev(spa, vd); 1548185029Spjd } 1549185029Spjd } 1550185029Spjd 1551185029Spjd /* 1552185029Spjd * Purge vdevs that were dropped 1553185029Spjd */ 1554185029Spjd for (i = 0; i < oldnvdevs; i++) { 1555185029Spjd uint64_t pool; 1556185029Spjd 1557185029Spjd vd = oldvdevs[i]; 1558185029Spjd if (vd != NULL) { 1559230514Smm ASSERT(vd->vdev_isl2cache); 1560230514Smm 1561209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1562209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1563185029Spjd l2arc_remove_vdev(vd); 1564230514Smm vdev_clear_stats(vd); 1565230514Smm vdev_free(vd); 1566185029Spjd } 1567185029Spjd } 1568185029Spjd 1569185029Spjd if (oldvdevs) 1570185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1571185029Spjd 1572185029Spjd if (sav->sav_config == NULL) 1573185029Spjd goto out; 1574185029Spjd 1575185029Spjd sav->sav_vdevs = newvdevs; 1576185029Spjd sav->sav_count = (int)nl2cache; 1577185029Spjd 1578185029Spjd /* 1579185029Spjd * Recompute the stashed list of l2cache devices, with status 1580185029Spjd * information this time. 
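	 * (That is, each entry is regenerated via vdev_config_generate()
	 * with its stats argument set, so userland sees current device
	 * state rather than the bare config parsed at load time; this
	 * reading of the B_TRUE argument is an assumption from context.)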
1581185029Spjd */ 1582185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1583185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1584185029Spjd 1585185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1586185029Spjd for (i = 0; i < sav->sav_count; i++) 1587185029Spjd l2cache[i] = vdev_config_generate(spa, 1588219089Spjd sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1589185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1590185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1591185029Spjdout: 1592185029Spjd for (i = 0; i < sav->sav_count; i++) 1593185029Spjd nvlist_free(l2cache[i]); 1594185029Spjd if (sav->sav_count) 1595185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1596185029Spjd} 1597185029Spjd 1598168404Spjdstatic int 1599168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1600168404Spjd{ 1601168404Spjd dmu_buf_t *db; 1602168404Spjd char *packed = NULL; 1603168404Spjd size_t nvsize = 0; 1604168404Spjd int error; 1605168404Spjd *value = NULL; 1606168404Spjd 1607262676Sdelphij error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 1608262676Sdelphij if (error != 0) 1609262676Sdelphij return (error); 1610168404Spjd nvsize = *(uint64_t *)db->db_data; 1611168404Spjd dmu_buf_rele(db, FTAG); 1612168404Spjd 1613168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1614209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1615209962Smm DMU_READ_PREFETCH); 1616168404Spjd if (error == 0) 1617168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1618168404Spjd kmem_free(packed, nvsize); 1619168404Spjd 1620168404Spjd return (error); 1621168404Spjd} 1622168404Spjd 1623168404Spjd/* 1624185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1625185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 1626185029Spjd */ 1627185029Spjdstatic void 1628185029Spjdspa_check_removed(vdev_t *vd) 1629185029Spjd{ 1630219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1631185029Spjd spa_check_removed(vd->vdev_child[c]); 1632185029Spjd 1633249188Smm if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 1634249188Smm !vd->vdev_ishole) { 1635185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1636185029Spjd spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1637185029Spjd } 1638185029Spjd} 1639185029Spjd 1640185029Spjd/* 1641219089Spjd * Validate the current config against the MOS config 1642213197Smm */ 1643219089Spjdstatic boolean_t 1644219089Spjdspa_config_valid(spa_t *spa, nvlist_t *config) 1645213197Smm{ 1646219089Spjd vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1647219089Spjd nvlist_t *nv; 1648213197Smm 1649219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1650213197Smm 1651219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1652219089Spjd VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1653219089Spjd 1654219089Spjd ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1655219089Spjd 1656219089Spjd /* 1657219089Spjd * If we're doing a normal import, then build up any additional 1658219089Spjd * diagnostic information about missing devices in this config. 1659219089Spjd * We'll pass this up to the user for further processing. 
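	 * As an illustrative sketch (keys as used below, contents
	 * hypothetical), the result lands in spa_load_info as:
	 *
	 *	ZPOOL_CONFIG_MISSING_DEVICES = {
	 *		ZPOOL_CONFIG_CHILDREN = [ <config of each missing
	 *		log top-level vdev>, ... ]
	 *	}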
1660219089Spjd */ 1661219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1662219089Spjd nvlist_t **child, *nv; 1663219089Spjd uint64_t idx = 0; 1664219089Spjd 1665219089Spjd child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1666219089Spjd KM_SLEEP); 1667219089Spjd VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1668219089Spjd 1669219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1670219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1671219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1672219089Spjd 1673219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1674219089Spjd mtvd->vdev_ops != &vdev_missing_ops && 1675219089Spjd mtvd->vdev_islog) 1676219089Spjd child[idx++] = vdev_config_generate(spa, mtvd, 1677219089Spjd B_FALSE, 0); 1678219089Spjd } 1679219089Spjd 1680219089Spjd if (idx) { 1681219089Spjd VERIFY(nvlist_add_nvlist_array(nv, 1682219089Spjd ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1683219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1684219089Spjd ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1685219089Spjd 1686219089Spjd for (int i = 0; i < idx; i++) 1687219089Spjd nvlist_free(child[i]); 1688219089Spjd } 1689219089Spjd nvlist_free(nv); 1690219089Spjd kmem_free(child, rvd->vdev_children * sizeof (char **)); 1691219089Spjd } 1692219089Spjd 1693219089Spjd /* 1694219089Spjd * Compare the root vdev tree with the information we have 1695219089Spjd * from the MOS config (mrvd). Check each top-level vdev 1696219089Spjd * with the corresponding MOS config top-level (mtvd). 1697219089Spjd */ 1698219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1699213197Smm vdev_t *tvd = rvd->vdev_child[c]; 1700219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1701213197Smm 1702219089Spjd /* 1703219089Spjd * Resolve any "missing" vdevs in the current configuration. 1704219089Spjd * If we find that the MOS config has more accurate information 1705219089Spjd * about the top-level vdev then use that vdev instead. 1706219089Spjd */ 1707219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1708219089Spjd mtvd->vdev_ops != &vdev_missing_ops) { 1709219089Spjd 1710219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1711219089Spjd continue; 1712219089Spjd 1713219089Spjd /* 1714219089Spjd * Device specific actions. 1715219089Spjd */ 1716219089Spjd if (mtvd->vdev_islog) { 1717219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 1718219089Spjd } else { 1719219089Spjd /* 1720219089Spjd * XXX - once we have 'readonly' pool 1721219089Spjd * support we should be able to handle 1722219089Spjd * missing data devices by transitioning 1723219089Spjd * the pool to readonly. 1724219089Spjd */ 1725219089Spjd continue; 1726219089Spjd } 1727219089Spjd 1728219089Spjd /* 1729219089Spjd * Swap the missing vdev with the data we were 1730219089Spjd * able to obtain from the MOS config. 1731219089Spjd */ 1732219089Spjd vdev_remove_child(rvd, tvd); 1733219089Spjd vdev_remove_child(mrvd, mtvd); 1734219089Spjd 1735219089Spjd vdev_add_child(rvd, mtvd); 1736219089Spjd vdev_add_child(mrvd, tvd); 1737219089Spjd 1738219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1739219089Spjd vdev_load(mtvd); 1740219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1741219089Spjd 1742219089Spjd vdev_reopen(rvd); 1743219089Spjd } else if (mtvd->vdev_islog) { 1744219089Spjd /* 1745219089Spjd * Load the slog device's state from the MOS config 1746219089Spjd * since it's possible that the label does not 1747219089Spjd * contain the most up-to-date information. 
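			 * (For instance, a log top-level vdev whose state
			 * changed shortly before the pool last went down
			 * may, presumably, still advertise stale state in
			 * its label; the MOS copy read here is treated as
			 * authoritative.)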
1748219089Spjd */ 1749219089Spjd vdev_load_log_state(tvd, mtvd); 1750219089Spjd vdev_reopen(tvd); 1751219089Spjd } 1752213197Smm } 1753219089Spjd vdev_free(mrvd); 1754219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1755219089Spjd 1756219089Spjd /* 1757219089Spjd * Ensure we were able to validate the config. 1758219089Spjd */ 1759219089Spjd return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1760213197Smm} 1761213197Smm 1762213197Smm/* 1763185029Spjd * Check for missing log devices 1764185029Spjd */ 1765248571Smmstatic boolean_t 1766185029Spjdspa_check_logs(spa_t *spa) 1767185029Spjd{ 1768248571Smm boolean_t rv = B_FALSE; 1769248571Smm 1770185029Spjd switch (spa->spa_log_state) { 1771185029Spjd case SPA_LOG_MISSING: 1772185029Spjd /* need to recheck in case slog has been restored */ 1773185029Spjd case SPA_LOG_UNKNOWN: 1774248571Smm rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain, 1775248571Smm NULL, DS_FIND_CHILDREN) != 0); 1776248571Smm if (rv) 1777219089Spjd spa_set_log_state(spa, SPA_LOG_MISSING); 1778185029Spjd break; 1779185029Spjd } 1780248571Smm return (rv); 1781185029Spjd} 1782185029Spjd 1783219089Spjdstatic boolean_t 1784219089Spjdspa_passivate_log(spa_t *spa) 1785219089Spjd{ 1786219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1787219089Spjd boolean_t slog_found = B_FALSE; 1788219089Spjd 1789219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1790219089Spjd 1791219089Spjd if (!spa_has_slogs(spa)) 1792219089Spjd return (B_FALSE); 1793219089Spjd 1794219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1795219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1796219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1797219089Spjd 1798219089Spjd if (tvd->vdev_islog) { 1799219089Spjd metaslab_group_passivate(mg); 1800219089Spjd slog_found = B_TRUE; 1801219089Spjd } 1802219089Spjd } 1803219089Spjd 1804219089Spjd return (slog_found); 1805219089Spjd} 1806219089Spjd 1807219089Spjdstatic void 1808219089Spjdspa_activate_log(spa_t *spa) 1809219089Spjd{ 1810219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1811219089Spjd 1812219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1813219089Spjd 1814219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1815219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1816219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1817219089Spjd 1818219089Spjd if (tvd->vdev_islog) 1819219089Spjd metaslab_group_activate(mg); 1820219089Spjd } 1821219089Spjd} 1822219089Spjd 1823219089Spjdint 1824219089Spjdspa_offline_log(spa_t *spa) 1825219089Spjd{ 1826248571Smm int error; 1827219089Spjd 1828248571Smm error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1829248571Smm NULL, DS_FIND_CHILDREN); 1830248571Smm if (error == 0) { 1831219089Spjd /* 1832219089Spjd * We successfully offlined the log device, sync out the 1833219089Spjd * current txg so that the "stubby" block can be removed 1834219089Spjd * by zil_sync(). 
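		 *
		 * For context: the dmu_objset_find() call above applies
		 * zil_vdev_offline() to every dataset in the pool
		 * (DS_FIND_CHILDREN), and the offline is only considered
		 * successful if the whole walk returns 0.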
1835219089Spjd */ 1836219089Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1837219089Spjd } 1838219089Spjd return (error); 1839219089Spjd} 1840219089Spjd 1841219089Spjdstatic void 1842219089Spjdspa_aux_check_removed(spa_aux_vdev_t *sav) 1843219089Spjd{ 1844219089Spjd int i; 1845219089Spjd 1846219089Spjd for (i = 0; i < sav->sav_count; i++) 1847219089Spjd spa_check_removed(sav->sav_vdevs[i]); 1848219089Spjd} 1849219089Spjd 1850219089Spjdvoid 1851219089Spjdspa_claim_notify(zio_t *zio) 1852219089Spjd{ 1853219089Spjd spa_t *spa = zio->io_spa; 1854219089Spjd 1855219089Spjd if (zio->io_error) 1856219089Spjd return; 1857219089Spjd 1858219089Spjd mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1859219089Spjd if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1860219089Spjd spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1861219089Spjd mutex_exit(&spa->spa_props_lock); 1862219089Spjd} 1863219089Spjd 1864219089Spjdtypedef struct spa_load_error { 1865219089Spjd uint64_t sle_meta_count; 1866219089Spjd uint64_t sle_data_count; 1867219089Spjd} spa_load_error_t; 1868219089Spjd 1869219089Spjdstatic void 1870219089Spjdspa_load_verify_done(zio_t *zio) 1871219089Spjd{ 1872219089Spjd blkptr_t *bp = zio->io_bp; 1873219089Spjd spa_load_error_t *sle = zio->io_private; 1874219089Spjd dmu_object_type_t type = BP_GET_TYPE(bp); 1875219089Spjd int error = zio->io_error; 1876219089Spjd 1877219089Spjd if (error) { 1878236884Smm if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 1879219089Spjd type != DMU_OT_INTENT_LOG) 1880219089Spjd atomic_add_64(&sle->sle_meta_count, 1); 1881219089Spjd else 1882219089Spjd atomic_add_64(&sle->sle_data_count, 1); 1883219089Spjd } 1884219089Spjd zio_data_buf_free(zio->io_data, zio->io_size); 1885219089Spjd} 1886219089Spjd 1887219089Spjd/*ARGSUSED*/ 1888219089Spjdstatic int 1889219089Spjdspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1890268123Sdelphij const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 1891219089Spjd{ 1892268075Sdelphij if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { 1893219089Spjd zio_t *rio = arg; 1894219089Spjd size_t size = BP_GET_PSIZE(bp); 1895219089Spjd void *data = zio_data_buf_alloc(size); 1896219089Spjd 1897219089Spjd zio_nowait(zio_read(rio, spa, bp, data, size, 1898219089Spjd spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1899219089Spjd ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1900219089Spjd ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1901219089Spjd } 1902219089Spjd return (0); 1903219089Spjd} 1904219089Spjd 1905219089Spjdstatic int 1906219089Spjdspa_load_verify(spa_t *spa) 1907219089Spjd{ 1908219089Spjd zio_t *rio; 1909219089Spjd spa_load_error_t sle = { 0 }; 1910219089Spjd zpool_rewind_policy_t policy; 1911219089Spjd boolean_t verify_ok = B_FALSE; 1912219089Spjd int error; 1913219089Spjd 1914219089Spjd zpool_get_rewind_policy(spa->spa_config, &policy); 1915219089Spjd 1916219089Spjd if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1917219089Spjd return (0); 1918219089Spjd 1919219089Spjd rio = zio_root(spa, NULL, &sle, 1920219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1921219089Spjd 1922219089Spjd error = traverse_pool(spa, spa->spa_verify_min_txg, 1923219089Spjd TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1924219089Spjd 1925219089Spjd (void) zio_wait(rio); 1926219089Spjd 1927219089Spjd spa->spa_load_meta_errors = sle.sle_meta_count; 1928219089Spjd spa->spa_load_data_errors = sle.sle_data_count; 1929219089Spjd 1930219089Spjd if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 
1931219089Spjd sle.sle_data_count <= policy.zrp_maxdata) { 1932219089Spjd int64_t loss = 0; 1933219089Spjd 1934219089Spjd verify_ok = B_TRUE; 1935219089Spjd spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1936219089Spjd spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1937219089Spjd 1938219089Spjd loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1939219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1940219089Spjd ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1941219089Spjd VERIFY(nvlist_add_int64(spa->spa_load_info, 1942219089Spjd ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1943219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1944219089Spjd ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1945219089Spjd } else { 1946219089Spjd spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1947219089Spjd } 1948219089Spjd 1949219089Spjd if (error) { 1950219089Spjd if (error != ENXIO && error != EIO) 1951249195Smm error = SET_ERROR(EIO); 1952219089Spjd return (error); 1953219089Spjd } 1954219089Spjd 1955219089Spjd return (verify_ok ? 0 : EIO); 1956219089Spjd} 1957219089Spjd 1958185029Spjd/* 1959219089Spjd * Find a value in the pool props object. 1960168404Spjd */ 1961219089Spjdstatic void 1962219089Spjdspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1963219089Spjd{ 1964219089Spjd (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1965219089Spjd zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1966219089Spjd} 1967219089Spjd 1968219089Spjd/* 1969219089Spjd * Find a value in the pool directory object. 1970219089Spjd */ 1971168404Spjdstatic int 1972219089Spjdspa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1973168404Spjd{ 1974219089Spjd return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1975219089Spjd name, sizeof (uint64_t), 1, val)); 1976219089Spjd} 1977168404Spjd 1978219089Spjdstatic int 1979219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1980219089Spjd{ 1981219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1982219089Spjd return (err); 1983219089Spjd} 1984219089Spjd 1985219089Spjd/* 1986219089Spjd * Fix up config after a partly-completed split. This is done with the 1987219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1988219089Spjd * pool have that entry in their config, but only the splitting one contains 1989219089Spjd * a list of all the guids of the vdevs that are being split off. 1990219089Spjd * 1991219089Spjd * This function determines what to do with that list: either rejoin 1992219089Spjd * all the disks to the pool, or complete the splitting process. To attempt 1993219089Spjd * the rejoin, each disk that is offlined is marked online again, and 1994219089Spjd * we do a reopen() call. If the vdev label for every disk that was 1995219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1996219089Spjd * then we call vdev_split() on each disk, and complete the split. 1997219089Spjd * 1998219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 1999219089Spjd * the original pool. 
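 *
 * For illustration, the nvlist consulted here has roughly the following
 * shape (guids hypothetical; a zero entry denotes a hole):
 *
 *	ZPOOL_CONFIG_SPLIT = {
 *		ZPOOL_CONFIG_SPLIT_LIST = [ <guid>, 0, <guid>, ... ]
 *	}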
2000219089Spjd */ 2001219089Spjdstatic void 2002219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 2003219089Spjd{ 2004219089Spjd uint_t extracted; 2005219089Spjd uint64_t *glist; 2006219089Spjd uint_t i, gcount; 2007219089Spjd nvlist_t *nvl; 2008219089Spjd vdev_t **vd; 2009219089Spjd boolean_t attempt_reopen; 2010219089Spjd 2011219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2012219089Spjd return; 2013219089Spjd 2014219089Spjd /* check that the config is complete */ 2015219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2016219089Spjd &glist, &gcount) != 0) 2017219089Spjd return; 2018219089Spjd 2019219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2020219089Spjd 2021219089Spjd /* attempt to online all the vdevs & validate */ 2022219089Spjd attempt_reopen = B_TRUE; 2023219089Spjd for (i = 0; i < gcount; i++) { 2024219089Spjd if (glist[i] == 0) /* vdev is hole */ 2025219089Spjd continue; 2026219089Spjd 2027219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2028219089Spjd if (vd[i] == NULL) { 2029219089Spjd /* 2030219089Spjd * Don't bother attempting to reopen the disks; 2031219089Spjd * just do the split. 2032219089Spjd */ 2033219089Spjd attempt_reopen = B_FALSE; 2034219089Spjd } else { 2035219089Spjd /* attempt to re-online it */ 2036219089Spjd vd[i]->vdev_offline = B_FALSE; 2037219089Spjd } 2038219089Spjd } 2039219089Spjd 2040219089Spjd if (attempt_reopen) { 2041219089Spjd vdev_reopen(spa->spa_root_vdev); 2042219089Spjd 2043219089Spjd /* check each device to see what state it's in */ 2044219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 2045219089Spjd if (vd[i] != NULL && 2046219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2047219089Spjd break; 2048219089Spjd ++extracted; 2049219089Spjd } 2050219089Spjd } 2051219089Spjd 2052209962Smm /* 2053219089Spjd * If every disk has been moved to the new pool, or if we never 2054219089Spjd * even attempted to look at them, then we split them off for 2055219089Spjd * good. 2056209962Smm */ 2057219089Spjd if (!attempt_reopen || gcount == extracted) { 2058219089Spjd for (i = 0; i < gcount; i++) 2059219089Spjd if (vd[i] != NULL) 2060219089Spjd vdev_split(vd[i]); 2061219089Spjd vdev_reopen(spa->spa_root_vdev); 2062219089Spjd } 2063209962Smm 2064219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 2065219089Spjd} 2066185029Spjd 2067219089Spjdstatic int 2068219089Spjdspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2069219089Spjd boolean_t mosconfig) 2070219089Spjd{ 2071219089Spjd nvlist_t *config = spa->spa_config; 2072219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 2073228103Smm char *comment; 2074219089Spjd int error; 2075219089Spjd uint64_t pool_guid; 2076219089Spjd nvlist_t *nvl; 2077168404Spjd 2078219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2079249195Smm return (SET_ERROR(EINVAL)); 2080168404Spjd 2081228103Smm ASSERT(spa->spa_comment == NULL); 2082228103Smm if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2083228103Smm spa->spa_comment = spa_strdup(comment); 2084228103Smm 2085168404Spjd /* 2086168404Spjd * Versioning wasn't explicitly added to the label until later, so if 2087168404Spjd * it's not present treat it as the initial version. 
2088168404Spjd */ 2089219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2090219089Spjd &spa->spa_ubsync.ub_version) != 0) 2091219089Spjd spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2092168404Spjd 2093168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2094168404Spjd &spa->spa_config_txg); 2095168404Spjd 2096168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2097168404Spjd spa_guid_exists(pool_guid, 0)) { 2098249195Smm error = SET_ERROR(EEXIST); 2099219089Spjd } else { 2100228103Smm spa->spa_config_guid = pool_guid; 2101219089Spjd 2102219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2103219089Spjd &nvl) == 0) { 2104219089Spjd VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2105219089Spjd KM_SLEEP) == 0); 2106219089Spjd } 2107219089Spjd 2108236884Smm nvlist_free(spa->spa_load_info); 2109236884Smm spa->spa_load_info = fnvlist_alloc(); 2110236884Smm 2111219089Spjd gethrestime(&spa->spa_loaded_ts); 2112219089Spjd error = spa_load_impl(spa, pool_guid, config, state, type, 2113219089Spjd mosconfig, &ereport); 2114168404Spjd } 2115168404Spjd 2116219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 2117219089Spjd if (error) { 2118219089Spjd if (error != EEXIST) { 2119219089Spjd spa->spa_loaded_ts.tv_sec = 0; 2120219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 2121219089Spjd } 2122219089Spjd if (error != EBADF) { 2123219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2124219089Spjd } 2125219089Spjd } 2126219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2127219089Spjd spa->spa_ena = 0; 2128168404Spjd 2129219089Spjd return (error); 2130219089Spjd} 2131219089Spjd 2132219089Spjd/* 2133219089Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 2134219089Spjd * source of configuration information. 2135219089Spjd */ 2136219089Spjdstatic int 2137219089Spjdspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2138219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2139219089Spjd char **ereport) 2140219089Spjd{ 2141219089Spjd int error = 0; 2142219089Spjd nvlist_t *nvroot = NULL; 2143236884Smm nvlist_t *label; 2144219089Spjd vdev_t *rvd; 2145219089Spjd uberblock_t *ub = &spa->spa_uberblock; 2146219089Spjd uint64_t children, config_cache_txg = spa->spa_config_txg; 2147219089Spjd int orig_mode = spa->spa_mode; 2148219089Spjd int parse; 2149219089Spjd uint64_t obj; 2150236884Smm boolean_t missing_feat_write = B_FALSE; 2151219089Spjd 2152168404Spjd /* 2153219089Spjd * If this is an untrusted config, access the pool in read-only mode. 2154219089Spjd * This prevents things like resilvering recently removed devices. 2155219089Spjd */ 2156219089Spjd if (!mosconfig) 2157219089Spjd spa->spa_mode = FREAD; 2158219089Spjd 2159219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2160219089Spjd 2161219089Spjd spa->spa_load_state = state; 2162219089Spjd 2163219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2164249195Smm return (SET_ERROR(EINVAL)); 2165219089Spjd 2166219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 2167219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2168219089Spjd 2169219089Spjd /* 2170209962Smm * Create "The Godfather" zio to hold all async IOs 2171209962Smm */ 2172209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2173209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2174209962Smm 2175209962Smm /* 2176168404Spjd * Parse the configuration into a vdev tree. 
We explicitly set the 2177168404Spjd * value that will be returned by spa_version() since parsing the 2178168404Spjd * configuration requires knowing the version number. 2179168404Spjd */ 2180185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2181219089Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2182185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2183168404Spjd 2184168404Spjd if (error != 0) 2185219089Spjd return (error); 2186168404Spjd 2187168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2188168404Spjd 2189219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2190219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2191219089Spjd } 2192219089Spjd 2193168404Spjd /* 2194168404Spjd * Try to open all vdevs, loading each label in the process. 2195168404Spjd */ 2196185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2197168926Spjd error = vdev_open(rvd); 2198185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2199168926Spjd if (error != 0) 2200219089Spjd return (error); 2201168404Spjd 2202168404Spjd /* 2203209962Smm * We need to validate the vdev labels against the configuration that 2204209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 2205209962Smm * mosconfig is true then we're validating the vdev labels based on 2206219089Spjd * that config. Otherwise, we're validating against the cached config 2207209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 2208209962Smm * later we will recursively call spa_load() and validate against 2209209962Smm * the vdev config. 2210219089Spjd * 2211219089Spjd * If we're assembling a new pool that's been split off from an 2212219089Spjd * existing pool, the labels haven't yet been updated so we skip 2213219089Spjd * validation for now. 2214168404Spjd */ 2215219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2216219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2217230514Smm error = vdev_validate(rvd, mosconfig); 2218219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2219168404Spjd 2220219089Spjd if (error != 0) 2221219089Spjd return (error); 2222219089Spjd 2223219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2224249195Smm return (SET_ERROR(ENXIO)); 2225168404Spjd } 2226168404Spjd 2227168404Spjd /* 2228168404Spjd * Find the best uberblock. 2229168404Spjd */ 2230236884Smm vdev_uberblock_load(rvd, ub, &label); 2231168404Spjd 2232168404Spjd /* 2233168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2234168404Spjd */ 2235236884Smm if (ub->ub_txg == 0) { 2236236884Smm nvlist_free(label); 2237219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2238236884Smm } 2239168404Spjd 2240168404Spjd /* 2241236884Smm * If the pool has an unsupported version we can't open it. 2242168404Spjd */ 2243236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2244236884Smm nvlist_free(label); 2245219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2246236884Smm } 2247168404Spjd 2248236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2249236884Smm nvlist_t *features; 2250236884Smm 2251236884Smm /* 2252236884Smm * If we weren't able to find what's necessary for reading the 2253236884Smm * MOS in the label, return failure. 
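		 *
		 * (features_for_read is an nvlist in the label whose keys
		 * name the feature guids a reader must understand before
		 * it can make sense of the MOS, e.g. an entry such as
		 * "com.delphix:hole_birth"; the example name is
		 * illustrative, and the actual contents depend on which
		 * features the pool has made use of.)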
2254236884Smm		 */
2255236884Smm		if (label == NULL || nvlist_lookup_nvlist(label,
2256236884Smm		    ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2257236884Smm			nvlist_free(label);
2258236884Smm			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2259236884Smm			    ENXIO));
2260236884Smm		}
2261236884Smm
2262236884Smm		/*
2263236884Smm		 * Update our in-core representation with the definitive values
2264236884Smm		 * from the label.
2265236884Smm		 */
2266236884Smm		nvlist_free(spa->spa_label_features);
2267236884Smm		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2268236884Smm	}
2269236884Smm
2270236884Smm	nvlist_free(label);
2271236884Smm
2272168404Spjd	/*
2273236884Smm	 * Look through entries in the label nvlist's features_for_read. If
2274236884Smm	 * there is a feature listed there which we don't understand then we
2275236884Smm	 * cannot open a pool.
2276236884Smm	 */
2277236884Smm	if (ub->ub_version >= SPA_VERSION_FEATURES) {
2278236884Smm		nvlist_t *unsup_feat;
2279236884Smm
2280236884Smm		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2281236884Smm		    0);
2282236884Smm
2283236884Smm		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2284236884Smm		    NULL); nvp != NULL;
2285236884Smm		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2286236884Smm			if (!zfeature_is_supported(nvpair_name(nvp))) {
2287236884Smm				VERIFY(nvlist_add_string(unsup_feat,
2288236884Smm				    nvpair_name(nvp), "") == 0);
2289236884Smm			}
2290236884Smm		}
2291236884Smm
2292236884Smm		if (!nvlist_empty(unsup_feat)) {
2293236884Smm			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2294236884Smm			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2295236884Smm			nvlist_free(unsup_feat);
2296236884Smm			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2297236884Smm			    ENOTSUP));
2298236884Smm		}
2299236884Smm
2300236884Smm		nvlist_free(unsup_feat);
2301236884Smm	}
2302236884Smm
2303236884Smm	/*
2304168404Spjd	 * If the vdev guid sum doesn't match the uberblock, we have an
2305219089Spjd	 * incomplete configuration. We first check to see if the pool
2306219089Spjd	 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
2307219089Spjd	 * If it is, defer the vdev_guid_sum check till later so we
2308219089Spjd	 * can handle missing vdevs.
2309168404Spjd	 */
2310219089Spjd	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2311219089Spjd	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2312219089Spjd	    rvd->vdev_guid_sum != ub->ub_guid_sum)
2313219089Spjd		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2314219089Spjd
2315219089Spjd	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2316219089Spjd		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2317219089Spjd		spa_try_repair(spa, config);
2318219089Spjd		spa_config_exit(spa, SCL_ALL, FTAG);
2319219089Spjd		nvlist_free(spa->spa_config_splitting);
2320219089Spjd		spa->spa_config_splitting = NULL;
2321168404Spjd	}
2322168404Spjd
2323168404Spjd	/*
2324168404Spjd	 * Initialize internal SPA structures.
2325168404Spjd	 */
2326168404Spjd	spa->spa_state = POOL_STATE_ACTIVE;
2327168404Spjd	spa->spa_ubsync = spa->spa_uberblock;
2328219089Spjd	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2329219089Spjd	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2330219089Spjd	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2331219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2332219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2333219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2334219089Spjd 2335236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2336219089Spjd if (error) 2337219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2338168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2339168404Spjd 2340219089Spjd if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2341219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2342168404Spjd 2343236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2344236884Smm boolean_t missing_feat_read = B_FALSE; 2345238926Smm nvlist_t *unsup_feat, *enabled_feat; 2346236884Smm 2347236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2348236884Smm &spa->spa_feat_for_read_obj) != 0) { 2349236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2350236884Smm } 2351236884Smm 2352236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2353236884Smm &spa->spa_feat_for_write_obj) != 0) { 2354236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2355236884Smm } 2356236884Smm 2357236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2358236884Smm &spa->spa_feat_desc_obj) != 0) { 2359236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2360236884Smm } 2361236884Smm 2362238926Smm enabled_feat = fnvlist_alloc(); 2363238926Smm unsup_feat = fnvlist_alloc(); 2364236884Smm 2365259813Sdelphij if (!spa_features_check(spa, B_FALSE, 2366238926Smm unsup_feat, enabled_feat)) 2367236884Smm missing_feat_read = B_TRUE; 2368236884Smm 2369236884Smm if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2370259813Sdelphij if (!spa_features_check(spa, B_TRUE, 2371238926Smm unsup_feat, enabled_feat)) { 2372236884Smm missing_feat_write = B_TRUE; 2373238926Smm } 2374236884Smm } 2375236884Smm 2376238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2377238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2378238926Smm 2379236884Smm if (!nvlist_empty(unsup_feat)) { 2380238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2381238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2382236884Smm } 2383236884Smm 2384238926Smm fnvlist_free(enabled_feat); 2385238926Smm fnvlist_free(unsup_feat); 2386236884Smm 2387236884Smm if (!missing_feat_read) { 2388236884Smm fnvlist_add_boolean(spa->spa_load_info, 2389236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2390236884Smm } 2391236884Smm 2392236884Smm /* 2393236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2394236884Smm * twofold: to determine whether the pool is available for 2395236884Smm * import in read-write mode and (if it is not) whether the 2396236884Smm * pool is available for import in read-only mode. If the pool 2397236884Smm * is available for import in read-write mode, it is displayed 2398236884Smm * as available in userland; if it is not available for import 2399236884Smm * in read-only mode, it is displayed as unavailable in 2400236884Smm * userland. If the pool is available for import in read-only 2401236884Smm * mode but not read-write mode, it is displayed as unavailable 2402236884Smm * in userland with a special note that the pool is actually 2403236884Smm * available for open in read-only mode. 
2404236884Smm * 2405236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2406236884Smm * missing a feature for write, we must first determine whether 2407236884Smm * the pool can be opened read-only before returning to 2408236884Smm * userland in order to know whether to display the 2409236884Smm * abovementioned note. 2410236884Smm */ 2411236884Smm if (missing_feat_read || (missing_feat_write && 2412236884Smm spa_writeable(spa))) { 2413236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2414236884Smm ENOTSUP)); 2415236884Smm } 2416260150Sdelphij 2417260150Sdelphij /* 2418260150Sdelphij * Load refcounts for ZFS features from disk into an in-memory 2419260150Sdelphij * cache during SPA initialization. 2420260150Sdelphij */ 2421260150Sdelphij for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2422260150Sdelphij uint64_t refcount; 2423260150Sdelphij 2424260150Sdelphij error = feature_get_refcount_from_disk(spa, 2425260150Sdelphij &spa_feature_table[i], &refcount); 2426260150Sdelphij if (error == 0) { 2427260150Sdelphij spa->spa_feat_refcount_cache[i] = refcount; 2428260150Sdelphij } else if (error == ENOTSUP) { 2429260150Sdelphij spa->spa_feat_refcount_cache[i] = 2430260150Sdelphij SPA_FEATURE_DISABLED; 2431260150Sdelphij } else { 2432260150Sdelphij return (spa_vdev_err(rvd, 2433260150Sdelphij VDEV_AUX_CORRUPT_DATA, EIO)); 2434260150Sdelphij } 2435260150Sdelphij } 2436236884Smm } 2437236884Smm 2438260150Sdelphij if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2439260150Sdelphij if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2440268075Sdelphij &spa->spa_feat_enabled_txg_obj) != 0) 2441260150Sdelphij return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2442260150Sdelphij } 2443260150Sdelphij 2444236884Smm spa->spa_is_initializing = B_TRUE; 2445236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2446236884Smm spa->spa_is_initializing = B_FALSE; 2447236884Smm if (error != 0) 2448236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2449236884Smm 2450168404Spjd if (!mosconfig) { 2451168498Spjd uint64_t hostid; 2452219089Spjd nvlist_t *policy = NULL, *nvconfig; 2453168404Spjd 2454219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2455219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2456168404Spjd 2457219089Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2458185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2459168498Spjd char *hostname; 2460168498Spjd unsigned long myhostid = 0; 2461168498Spjd 2462219089Spjd VERIFY(nvlist_lookup_string(nvconfig, 2463168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2464168498Spjd 2465219089Spjd#ifdef _KERNEL 2466219089Spjd myhostid = zone_get_hostid(NULL); 2467219089Spjd#else /* _KERNEL */ 2468219089Spjd /* 2469219089Spjd * We're emulating the system's hostid in userland, so 2470219089Spjd * we can't use zone_get_hostid(). 2471219089Spjd */ 2472168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2473219089Spjd#endif /* _KERNEL */ 2474204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 2475219089Spjd hostid != myhostid) { 2476219089Spjd nvlist_free(nvconfig); 2477168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 2478168498Spjd "loaded as it was last accessed by " 2479185029Spjd "another system (host: %s hostid: 0x%lx). 
" 2480236146Smm "See: http://illumos.org/msg/ZFS-8000-EY", 2481185029Spjd spa_name(spa), hostname, 2482168498Spjd (unsigned long)hostid); 2483249195Smm return (SET_ERROR(EBADF)); 2484168498Spjd } 2485168498Spjd } 2486219089Spjd if (nvlist_lookup_nvlist(spa->spa_config, 2487219089Spjd ZPOOL_REWIND_POLICY, &policy) == 0) 2488219089Spjd VERIFY(nvlist_add_nvlist(nvconfig, 2489219089Spjd ZPOOL_REWIND_POLICY, policy) == 0); 2490168498Spjd 2491219089Spjd spa_config_set(spa, nvconfig); 2492168404Spjd spa_unload(spa); 2493168404Spjd spa_deactivate(spa); 2494209962Smm spa_activate(spa, orig_mode); 2495168404Spjd 2496219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2497168404Spjd } 2498168404Spjd 2499219089Spjd if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2500219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2501219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2502219089Spjd if (error != 0) 2503219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2504168404Spjd 2505168404Spjd /* 2506168404Spjd * Load the bit that tells us to use the new accounting function 2507168404Spjd * (raid-z deflation). If we have an older pool, this will not 2508168404Spjd * be present. 2509168404Spjd */ 2510219089Spjd error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2511219089Spjd if (error != 0 && error != ENOENT) 2512219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2513168404Spjd 2514219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2515219089Spjd &spa->spa_creation_version); 2516219089Spjd if (error != 0 && error != ENOENT) 2517219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2518219089Spjd 2519168404Spjd /* 2520168404Spjd * Load the persistent error log. If we have an older pool, this will 2521168404Spjd * not be present. 2522168404Spjd */ 2523219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2524219089Spjd if (error != 0 && error != ENOENT) 2525219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2526168404Spjd 2527219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2528219089Spjd &spa->spa_errlog_scrub); 2529219089Spjd if (error != 0 && error != ENOENT) 2530219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2531168404Spjd 2532168404Spjd /* 2533168404Spjd * Load the history object. If we have an older pool, this 2534168404Spjd * will not be present. 2535168404Spjd */ 2536219089Spjd error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2537219089Spjd if (error != 0 && error != ENOENT) 2538219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2539168404Spjd 2540168404Spjd /* 2541219089Spjd * If we're assembling the pool from the split-off vdevs of 2542219089Spjd * an existing pool, we don't want to attach the spares & cache 2543219089Spjd * devices. 2544219089Spjd */ 2545219089Spjd 2546219089Spjd /* 2547168404Spjd * Load any hot spares for this pool. 
2548168404Spjd */ 2549219089Spjd error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2550219089Spjd if (error != 0 && error != ENOENT) 2551219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2552219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2553185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2554185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 2555219089Spjd &spa->spa_spares.sav_config) != 0) 2556219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2557168404Spjd 2558185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2559168404Spjd spa_load_spares(spa); 2560185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2561219089Spjd } else if (error == 0) { 2562219089Spjd spa->spa_spares.sav_sync = B_TRUE; 2563168404Spjd } 2564168404Spjd 2565185029Spjd /* 2566185029Spjd * Load any level 2 ARC devices for this pool. 2567185029Spjd */ 2568219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2569185029Spjd &spa->spa_l2cache.sav_object); 2570219089Spjd if (error != 0 && error != ENOENT) 2571219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2572219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2573185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2574185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2575219089Spjd &spa->spa_l2cache.sav_config) != 0) 2576219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2577185029Spjd 2578185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2579185029Spjd spa_load_l2cache(spa); 2580185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2581219089Spjd } else if (error == 0) { 2582219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2583185029Spjd } 2584185029Spjd 2585219089Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2586213197Smm 2587219089Spjd error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2588219089Spjd if (error && error != ENOENT) 2589219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2590185029Spjd 2591219089Spjd if (error == 0) { 2592219089Spjd uint64_t autoreplace; 2593185029Spjd 2594219089Spjd spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2595219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2596219089Spjd spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2597219089Spjd spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2598219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2599219089Spjd spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2600219089Spjd &spa->spa_dedup_ditto); 2601185029Spjd 2602219089Spjd spa->spa_autoreplace = (autoreplace != 0); 2603168404Spjd } 2604168404Spjd 2605168404Spjd /* 2606185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 2607185029Spjd * the ZFS DE that it should not issue any faults for unopenable 2608185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 2609185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 2610185029Spjd * over. 2611185029Spjd */ 2612219089Spjd if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2613185029Spjd spa_check_removed(spa->spa_root_vdev); 2614219089Spjd /* 2615219089Spjd * For the import case, this is done in spa_import(), because 2616219089Spjd * at this point we're using the spare definitions from 2617219089Spjd * the MOS config, not necessarily from the userland config. 
2618219089Spjd */ 2619219089Spjd if (state != SPA_LOAD_IMPORT) { 2620219089Spjd spa_aux_check_removed(&spa->spa_spares); 2621219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 2622219089Spjd } 2623219089Spjd } 2624185029Spjd 2625185029Spjd /* 2626168404Spjd * Load the vdev state for all toplevel vdevs. 2627168404Spjd */ 2628168404Spjd vdev_load(rvd); 2629168404Spjd 2630168404Spjd /* 2631168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 2632168404Spjd */ 2633185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2634168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2635185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2636168404Spjd 2637168404Spjd /* 2638219089Spjd * Load the DDTs (dedup tables). 2639168404Spjd */ 2640219089Spjd error = ddt_load(spa); 2641219089Spjd if (error != 0) 2642219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2643219089Spjd 2644219089Spjd spa_update_dspace(spa); 2645219089Spjd 2646219089Spjd /* 2647219089Spjd * Validate the config, using the MOS config to fill in any 2648219089Spjd * information which might be missing. If we fail to validate 2649219089Spjd * the config then declare the pool unfit for use. If we're 2650219089Spjd * assembling a pool from a split, the log is not transferred 2651219089Spjd * over. 2652219089Spjd */ 2653219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2654219089Spjd nvlist_t *nvconfig; 2655219089Spjd 2656219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2657219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2658219089Spjd 2659219089Spjd if (!spa_config_valid(spa, nvconfig)) { 2660219089Spjd nvlist_free(nvconfig); 2661219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2662219089Spjd ENXIO)); 2663219089Spjd } 2664219089Spjd nvlist_free(nvconfig); 2665219089Spjd 2666219089Spjd /* 2667236884Smm * Now that we've validated the config, check the state of the 2668219089Spjd * root vdev. If it can't be opened, it indicates one or 2669219089Spjd * more toplevel vdevs are faulted. 2670219089Spjd */ 2671219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2672249195Smm return (SET_ERROR(ENXIO)); 2673219089Spjd 2674219089Spjd if (spa_check_logs(spa)) { 2675219089Spjd *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2676219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2677219089Spjd } 2678168404Spjd } 2679168404Spjd 2680236884Smm if (missing_feat_write) { 2681236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 2682236884Smm 2683236884Smm /* 2684236884Smm * At this point, we know that we can open the pool in 2685236884Smm * read-only mode but not read-write mode. We now have enough 2686236884Smm * information and can return to userland. 2687236884Smm */ 2688236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2689236884Smm } 2690236884Smm 2691219089Spjd /* 2692219089Spjd * We've successfully opened the pool, verify that we're ready 2693219089Spjd * to start pushing transactions. 
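	 * Note that spa_load_verify() honors the pool's rewind policy:
	 * under ZPOOL_NEVER_REWIND it returns success immediately without
	 * traversing any data (see the policy check at the top of that
	 * function above).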
2694219089Spjd	 */
2695219089Spjd	if (state != SPA_LOAD_TRYIMPORT) {
2696219089Spjd		if ((error = spa_load_verify(spa)) != 0)
2697219089Spjd			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2698219089Spjd			    error));
2699219089Spjd	}
2700219089Spjd
2701219089Spjd	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2702219089Spjd	    spa->spa_load_max_txg == UINT64_MAX)) {
2703168404Spjd		dmu_tx_t *tx;
2704168404Spjd		int need_update = B_FALSE;
2705168404Spjd
2706209962Smm		ASSERT(state != SPA_LOAD_TRYIMPORT);
2707209962Smm
2708168404Spjd		/*
2709168404Spjd		 * Claim log blocks that haven't been committed yet.
2710168404Spjd		 * This must all happen in a single txg.
2711219089Spjd		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2712219089Spjd		 * invoked from zil_claim_log_block()'s i/o done callback.
2713219089Spjd		 * Price of rollback is that we abandon the log.
2714168404Spjd		 */
2715219089Spjd		spa->spa_claiming = B_TRUE;
2716219089Spjd
2717168404Spjd		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2718168404Spjd		    spa_first_txg(spa));
2719185029Spjd		(void) dmu_objset_find(spa_name(spa),
2720168404Spjd		    zil_claim, tx, DS_FIND_CHILDREN);
2721168404Spjd		dmu_tx_commit(tx);
2722168404Spjd
2723219089Spjd		spa->spa_claiming = B_FALSE;
2724219089Spjd
2725219089Spjd		spa_set_log_state(spa, SPA_LOG_GOOD);
2726168404Spjd		spa->spa_sync_on = B_TRUE;
2727168404Spjd		txg_sync_start(spa->spa_dsl_pool);
2728168404Spjd
2729168404Spjd		/*
2730219089Spjd		 * Wait for all claims to sync. We sync up to the highest
2731219089Spjd		 * claimed log block birth time so that claimed log blocks
2732219089Spjd		 * don't appear to be from the future. spa_claim_max_txg
2733219089Spjd		 * will have been set for us by either zil_check_log_chain()
2734219089Spjd		 * (invoked from spa_check_logs()) or zil_claim() above.
2735168404Spjd		 */
2736219089Spjd		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2737168404Spjd
2738168404Spjd		/*
2739168404Spjd		 * If the config cache is stale, or we have uninitialized
2740168404Spjd		 * metaslabs (see spa_vdev_add()), then update the config.
2741209962Smm		 *
2742219089Spjd		 * If this is a verbatim import, trust the current
2743209962Smm		 * in-core spa_config and update the disk labels.
2744168404Spjd		 */
2745168404Spjd		if (config_cache_txg != spa->spa_config_txg ||
2746219089Spjd		    state == SPA_LOAD_IMPORT ||
2747219089Spjd		    state == SPA_LOAD_RECOVER ||
2748219089Spjd		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2749168404Spjd			need_update = B_TRUE;
2750168404Spjd
2751209962Smm		for (int c = 0; c < rvd->vdev_children; c++)
2752168404Spjd			if (rvd->vdev_child[c]->vdev_ms_array == 0)
2753168404Spjd				need_update = B_TRUE;
2754168404Spjd
2755168404Spjd		/*
2756168404Spjd		 * Update the config cache asynchronously in case we're the
2757168404Spjd		 * root pool, in which case the config cache isn't writable yet.
2758168404Spjd		 */
2759168404Spjd		if (need_update)
2760168404Spjd			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2761208683Spjd
2762208683Spjd		/*
2763208683Spjd		 * Check all DTLs to see if anything needs resilvering.
2764208683Spjd		 */
2765219089Spjd		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2766219089Spjd		    vdev_resilver_needed(rvd, NULL, NULL))
2767208683Spjd			spa_async_request(spa, SPA_ASYNC_RESILVER);
2768219089Spjd
2769219089Spjd		/*
2770248571Smm		 * Log the fact that we booted up (so that we can detect if
2771248571Smm		 * we rebooted in the middle of an operation).
2772248571Smm		 */
2773248571Smm		spa_history_log_version(spa, "open");
2774248571Smm
2775248571Smm		/*
2776219089Spjd		 * Delete any inconsistent datasets.
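		 * ("Inconsistent" refers to datasets flagged as being in
		 * the middle of a modification when the pool last went
		 * down, presumably e.g. a partially completed receive;
		 * dsl_destroy_inconsistent() is applied to each dataset by
		 * the dmu_objset_find() call below.)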
2777219089Spjd */ 2778219089Spjd (void) dmu_objset_find(spa_name(spa), 2779219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2780219089Spjd 2781219089Spjd /* 2782219089Spjd * Clean up any stale temporary dataset userrefs. 2783219089Spjd */ 2784219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2785168404Spjd } 2786168404Spjd 2787219089Spjd return (0); 2788219089Spjd} 2789168404Spjd 2790219089Spjdstatic int 2791219089Spjdspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2792219089Spjd{ 2793219089Spjd int mode = spa->spa_mode; 2794219089Spjd 2795219089Spjd spa_unload(spa); 2796219089Spjd spa_deactivate(spa); 2797219089Spjd 2798219089Spjd spa->spa_load_max_txg--; 2799219089Spjd 2800219089Spjd spa_activate(spa, mode); 2801219089Spjd spa_async_suspend(spa); 2802219089Spjd 2803219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2804168404Spjd} 2805168404Spjd 2806236884Smm/* 2807236884Smm * If spa_load() fails this function will try loading prior txg's. If 2808236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2809236884Smm * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2810236884Smm * function will not rewind the pool and will return the same error as 2811236884Smm * spa_load(). 2812236884Smm */ 2813219089Spjdstatic int 2814219089Spjdspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2815219089Spjd uint64_t max_request, int rewind_flags) 2816219089Spjd{ 2817236884Smm nvlist_t *loadinfo = NULL; 2818219089Spjd nvlist_t *config = NULL; 2819219089Spjd int load_error, rewind_error; 2820219089Spjd uint64_t safe_rewind_txg; 2821219089Spjd uint64_t min_txg; 2822219089Spjd 2823219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2824219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 2825219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2826219089Spjd } else { 2827219089Spjd spa->spa_load_max_txg = max_request; 2828219089Spjd } 2829219089Spjd 2830219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2831219089Spjd mosconfig); 2832219089Spjd if (load_error == 0) 2833219089Spjd return (0); 2834219089Spjd 2835219089Spjd if (spa->spa_root_vdev != NULL) 2836219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2837219089Spjd 2838219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2839219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2840219089Spjd 2841219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 2842219089Spjd nvlist_free(config); 2843219089Spjd return (load_error); 2844219089Spjd } 2845219089Spjd 2846236884Smm if (state == SPA_LOAD_RECOVER) { 2847236884Smm /* Price of rolling back is discarding txgs, including log */ 2848219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2849236884Smm } else { 2850236884Smm /* 2851236884Smm * If we aren't rolling back save the load info from our first 2852236884Smm * import attempt so that we can restore it after attempting 2853236884Smm * to rewind. 2854236884Smm */ 2855236884Smm loadinfo = spa->spa_load_info; 2856236884Smm spa->spa_load_info = fnvlist_alloc(); 2857236884Smm } 2858219089Spjd 2859219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2860219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2861219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
2862219089Spjd TXG_INITIAL : safe_rewind_txg; 2863219089Spjd 2864219089Spjd /* 2865219089Spjd * Continue as long as we're finding errors, we're still within 2866219089Spjd * the acceptable rewind range, and we're still finding uberblocks 2867219089Spjd */ 2868219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2869219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2870219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 2871219089Spjd spa->spa_extreme_rewind = B_TRUE; 2872219089Spjd rewind_error = spa_load_retry(spa, state, mosconfig); 2873219089Spjd } 2874219089Spjd 2875219089Spjd spa->spa_extreme_rewind = B_FALSE; 2876219089Spjd spa->spa_load_max_txg = UINT64_MAX; 2877219089Spjd 2878219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2879219089Spjd spa_config_set(spa, config); 2880219089Spjd 2881236884Smm if (state == SPA_LOAD_RECOVER) { 2882236884Smm ASSERT3P(loadinfo, ==, NULL); 2883236884Smm return (rewind_error); 2884236884Smm } else { 2885236884Smm /* Store the rewind info as part of the initial load info */ 2886236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2887236884Smm spa->spa_load_info); 2888236884Smm 2889236884Smm /* Restore the initial load info */ 2890236884Smm fnvlist_free(spa->spa_load_info); 2891236884Smm spa->spa_load_info = loadinfo; 2892236884Smm 2893236884Smm return (load_error); 2894236884Smm } 2895219089Spjd} 2896219089Spjd 2897168404Spjd/* 2898168404Spjd * Pool Open/Import 2899168404Spjd * 2900168404Spjd * The import case is identical to an open except that the configuration is sent 2901168404Spjd * down from userland, instead of grabbed from the configuration cache. For the 2902168404Spjd * case of an open, the pool configuration will exist in the 2903185029Spjd * POOL_STATE_UNINITIALIZED state. 2904168404Spjd * 2905168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 2906168404Spjd * the same time open the pool, without having to keep around the spa_t in some 2907168404Spjd * ambiguous state. 2908168404Spjd */ 2909168404Spjdstatic int 2910219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2911219089Spjd nvlist_t **config) 2912168404Spjd{ 2913168404Spjd spa_t *spa; 2914219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 2915168404Spjd int error; 2916168404Spjd int locked = B_FALSE; 2917219089Spjd int firstopen = B_FALSE; 2918168404Spjd 2919168404Spjd *spapp = NULL; 2920168404Spjd 2921168404Spjd /* 2922168404Spjd * As disgusting as this is, we need to support recursive calls to this 2923168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 2924168404Spjd * up calling spa_open() again. The real fix is to figure out how to 2925168404Spjd * avoid dsl_dir_open() calling this in the first place. 2926168404Spjd */ 2927168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 2928168404Spjd mutex_enter(&spa_namespace_lock); 2929168404Spjd locked = B_TRUE; 2930168404Spjd } 2931168404Spjd 2932168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 2933168404Spjd if (locked) 2934168404Spjd mutex_exit(&spa_namespace_lock); 2935249195Smm return (SET_ERROR(ENOENT)); 2936168404Spjd } 2937219089Spjd 2938168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2939219089Spjd zpool_rewind_policy_t policy; 2940168404Spjd 2941219089Spjd firstopen = B_TRUE; 2942219089Spjd 2943219089Spjd zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 2944219089Spjd &policy); 2945219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 2946219089Spjd state = SPA_LOAD_RECOVER; 2947219089Spjd 2948209962Smm spa_activate(spa, spa_mode_global); 2949168404Spjd 2950219089Spjd if (state != SPA_LOAD_RECOVER) 2951219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2952168404Spjd 2953219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2954219089Spjd policy.zrp_request); 2955219089Spjd 2956168404Spjd if (error == EBADF) { 2957168404Spjd /* 2958168404Spjd * If vdev_validate() returns failure (indicated by 2959168404Spjd * EBADF), it means that one of the vdevs indicates 2960168404Spjd * that the pool has been exported or destroyed. If 2961168404Spjd * this is the case, the config cache is out of sync and 2962168404Spjd * we should remove the pool from the namespace. 2963168404Spjd */ 2964168404Spjd spa_unload(spa); 2965168404Spjd spa_deactivate(spa); 2966185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE); 2967168404Spjd spa_remove(spa); 2968168404Spjd if (locked) 2969168404Spjd mutex_exit(&spa_namespace_lock); 2970249195Smm return (SET_ERROR(ENOENT)); 2971168404Spjd } 2972168404Spjd 2973168404Spjd if (error) { 2974168404Spjd /* 2975168404Spjd * We can't open the pool, but we still have useful 2976168404Spjd * information: the state of each vdev after the 2977168404Spjd * attempted vdev_open(). Return this to the user. 2978168404Spjd */ 2979219089Spjd if (config != NULL && spa->spa_config) { 2980219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 2981219089Spjd KM_SLEEP) == 0); 2982219089Spjd VERIFY(nvlist_add_nvlist(*config, 2983219089Spjd ZPOOL_CONFIG_LOAD_INFO, 2984219089Spjd spa->spa_load_info) == 0); 2985219089Spjd } 2986168404Spjd spa_unload(spa); 2987168404Spjd spa_deactivate(spa); 2988219089Spjd spa->spa_last_open_failed = error; 2989168404Spjd if (locked) 2990168404Spjd mutex_exit(&spa_namespace_lock); 2991168404Spjd *spapp = NULL; 2992168404Spjd return (error); 2993168404Spjd } 2994168404Spjd } 2995168404Spjd 2996168404Spjd spa_open_ref(spa, tag); 2997185029Spjd 2998219089Spjd if (config != NULL) 2999219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3000219089Spjd 3001219089Spjd /* 3002219089Spjd * If we've recovered the pool, pass back any information we 3003219089Spjd * gathered while doing the load. 
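 * The recovery data travels as a nested nvlist under the
 * ZPOOL_CONFIG_LOAD_INFO key of the returned config. A minimal
 * consumer-side sketch (hypothetical variable names, error
 * handling elided) of pulling the rewind details back out:
 *
 *	nvlist_t *info, *rewind;
 *	if (nvlist_lookup_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 *	    &info) == 0)
 *		(void) nvlist_lookup_nvlist(info,
 *		    ZPOOL_CONFIG_REWIND_INFO, &rewind);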
3004219089Spjd */ 3005219089Spjd if (state == SPA_LOAD_RECOVER) { 3006219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3007219089Spjd spa->spa_load_info) == 0); 3008219089Spjd } 3009219089Spjd 3010219089Spjd if (locked) { 3011219089Spjd spa->spa_last_open_failed = 0; 3012219089Spjd spa->spa_last_ubsync_txg = 0; 3013219089Spjd spa->spa_load_txg = 0; 3014168404Spjd mutex_exit(&spa_namespace_lock); 3015219089Spjd#ifdef __FreeBSD__ 3016219089Spjd#ifdef _KERNEL 3017219089Spjd if (firstopen) 3018249047Savg zvol_create_minors(spa->spa_name); 3019219089Spjd#endif 3020219089Spjd#endif 3021219089Spjd } 3022168404Spjd 3023168404Spjd *spapp = spa; 3024168404Spjd 3025168404Spjd return (0); 3026168404Spjd} 3027168404Spjd 3028168404Spjdint 3029219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3030219089Spjd nvlist_t **config) 3031219089Spjd{ 3032219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 3033219089Spjd} 3034219089Spjd 3035219089Spjdint 3036168404Spjdspa_open(const char *name, spa_t **spapp, void *tag) 3037168404Spjd{ 3038219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 3039168404Spjd} 3040168404Spjd 3041168404Spjd/* 3042168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 3043168404Spjd * preventing it from being exported or destroyed. 3044168404Spjd */ 3045168404Spjdspa_t * 3046168404Spjdspa_inject_addref(char *name) 3047168404Spjd{ 3048168404Spjd spa_t *spa; 3049168404Spjd 3050168404Spjd mutex_enter(&spa_namespace_lock); 3051168404Spjd if ((spa = spa_lookup(name)) == NULL) { 3052168404Spjd mutex_exit(&spa_namespace_lock); 3053168404Spjd return (NULL); 3054168404Spjd } 3055168404Spjd spa->spa_inject_ref++; 3056168404Spjd mutex_exit(&spa_namespace_lock); 3057168404Spjd 3058168404Spjd return (spa); 3059168404Spjd} 3060168404Spjd 3061168404Spjdvoid 3062168404Spjdspa_inject_delref(spa_t *spa) 3063168404Spjd{ 3064168404Spjd mutex_enter(&spa_namespace_lock); 3065168404Spjd spa->spa_inject_ref--; 3066168404Spjd mutex_exit(&spa_namespace_lock); 3067168404Spjd} 3068168404Spjd 3069185029Spjd/* 3070185029Spjd * Add spares device information to the nvlist. 3071185029Spjd */ 3072168404Spjdstatic void 3073168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 3074168404Spjd{ 3075168404Spjd nvlist_t **spares; 3076168404Spjd uint_t i, nspares; 3077168404Spjd nvlist_t *nvroot; 3078168404Spjd uint64_t guid; 3079168404Spjd vdev_stat_t *vs; 3080168404Spjd uint_t vsc; 3081168404Spjd uint64_t pool; 3082168404Spjd 3083209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3084209962Smm 3085185029Spjd if (spa->spa_spares.sav_count == 0) 3086168404Spjd return; 3087168404Spjd 3088168404Spjd VERIFY(nvlist_lookup_nvlist(config, 3089168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3090185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3091168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3092168404Spjd if (nspares != 0) { 3093168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3094168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3095168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3096168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3097168404Spjd 3098168404Spjd /* 3099168404Spjd * Go through and find any spares which have since been 3100168404Spjd * repurposed as an active spare. If this is the case, update 3101168404Spjd * their status appropriately. 
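 * The update below leaves vs_state == VDEV_STATE_CANT_OPEN with
 * vs_aux == VDEV_AUX_SPARED, so a consumer walking the returned
 * config can recognize an in-use spare with a check along these
 * lines (sketch only):
 *
 *	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
 *	    vs->vs_aux == VDEV_AUX_SPARED)
 *		... the spare is active in some other pool ...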
3102168404Spjd */ 3103168404Spjd for (i = 0; i < nspares; i++) { 3104168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 3105168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3106185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 3107185029Spjd pool != 0ULL) { 3108168404Spjd VERIFY(nvlist_lookup_uint64_array( 3109219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 3110168404Spjd (uint64_t **)&vs, &vsc) == 0); 3111168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 3112168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 3113168404Spjd } 3114168404Spjd } 3115168404Spjd } 3116168404Spjd} 3117168404Spjd 3118185029Spjd/* 3119185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 3120185029Spjd */ 3121185029Spjdstatic void 3122185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 3123185029Spjd{ 3124185029Spjd nvlist_t **l2cache; 3125185029Spjd uint_t i, j, nl2cache; 3126185029Spjd nvlist_t *nvroot; 3127185029Spjd uint64_t guid; 3128185029Spjd vdev_t *vd; 3129185029Spjd vdev_stat_t *vs; 3130185029Spjd uint_t vsc; 3131185029Spjd 3132209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3133209962Smm 3134185029Spjd if (spa->spa_l2cache.sav_count == 0) 3135185029Spjd return; 3136185029Spjd 3137185029Spjd VERIFY(nvlist_lookup_nvlist(config, 3138185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3139185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3140185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3141185029Spjd if (nl2cache != 0) { 3142185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3143185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3144185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3145185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3146185029Spjd 3147185029Spjd /* 3148185029Spjd * Update level 2 cache device stats. 3149185029Spjd */ 3150185029Spjd 3151185029Spjd for (i = 0; i < nl2cache; i++) { 3152185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 3153185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3154185029Spjd 3155185029Spjd vd = NULL; 3156185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3157185029Spjd if (guid == 3158185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3159185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 3160185029Spjd break; 3161185029Spjd } 3162185029Spjd } 3163185029Spjd ASSERT(vd != NULL); 3164185029Spjd 3165185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3166219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3167219089Spjd == 0); 3168185029Spjd vdev_get_stats(vd, vs); 3169185029Spjd } 3170185029Spjd } 3171185029Spjd} 3172185029Spjd 3173236884Smmstatic void 3174236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 3175236884Smm{ 3176236884Smm nvlist_t *features; 3177236884Smm zap_cursor_t zc; 3178236884Smm zap_attribute_t za; 3179236884Smm 3180236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3181236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3182236884Smm 3183253993Smav /* We may be unable to read features if pool is suspended. 
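 * (Walking the feature ZAPs issues reads against the MOS, which a
 * suspended pool cannot service without risking an indefinite
 * block, so in that case we publish an empty feature nvlist
 * instead.) The result is keyed by feature guid; a consumer-side
 * sketch, assuming 'feat' was looked up under
 * ZPOOL_CONFIG_FEATURE_STATS:
 *
 *	for (nvpair_t *p = nvlist_next_nvpair(feat, NULL);
 *	    p != NULL; p = nvlist_next_nvpair(feat, p))
 *		... each pair maps a feature guid to its refcount ...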
*/ 3184253993Smav if (spa_suspended(spa)) 3185253993Smav goto out; 3186253993Smav 3187236884Smm if (spa->spa_feat_for_read_obj != 0) { 3188236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3189236884Smm spa->spa_feat_for_read_obj); 3190236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3191236884Smm zap_cursor_advance(&zc)) { 3192236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3193236884Smm za.za_num_integers == 1); 3194236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3195236884Smm za.za_first_integer)); 3196236884Smm } 3197236884Smm zap_cursor_fini(&zc); 3198236884Smm } 3199236884Smm 3200236884Smm if (spa->spa_feat_for_write_obj != 0) { 3201236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3202236884Smm spa->spa_feat_for_write_obj); 3203236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3204236884Smm zap_cursor_advance(&zc)) { 3205236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3206236884Smm za.za_num_integers == 1); 3207236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3208236884Smm za.za_first_integer)); 3209236884Smm } 3210236884Smm zap_cursor_fini(&zc); 3211236884Smm } 3212236884Smm 3213253993Smavout: 3214236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3215236884Smm features) == 0); 3216236884Smm nvlist_free(features); 3217236884Smm} 3218236884Smm 3219168404Spjdint 3220236884Smmspa_get_stats(const char *name, nvlist_t **config, 3221236884Smm char *altroot, size_t buflen) 3222168404Spjd{ 3223168404Spjd int error; 3224168404Spjd spa_t *spa; 3225168404Spjd 3226168404Spjd *config = NULL; 3227219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 3228168404Spjd 3229209962Smm if (spa != NULL) { 3230209962Smm /* 3231209962Smm * This still leaves a window of inconsistency where the spares 3232209962Smm * or l2cache devices could change and the config would be 3233209962Smm * self-inconsistent. 3234209962Smm */ 3235209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3236168404Spjd 3237209962Smm if (*config != NULL) { 3238219089Spjd uint64_t loadtimes[2]; 3239219089Spjd 3240219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3241219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3242219089Spjd VERIFY(nvlist_add_uint64_array(*config, 3243219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3244219089Spjd 3245185029Spjd VERIFY(nvlist_add_uint64(*config, 3246209962Smm ZPOOL_CONFIG_ERRCOUNT, 3247209962Smm spa_get_errlog_size(spa)) == 0); 3248185029Spjd 3249209962Smm if (spa_suspended(spa)) 3250209962Smm VERIFY(nvlist_add_uint64(*config, 3251209962Smm ZPOOL_CONFIG_SUSPENDED, 3252209962Smm spa->spa_failmode) == 0); 3253209962Smm 3254209962Smm spa_add_spares(spa, *config); 3255209962Smm spa_add_l2cache(spa, *config); 3256236884Smm spa_add_feature_stats(spa, *config); 3257209962Smm } 3258168404Spjd } 3259168404Spjd 3260168404Spjd /* 3261168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 3262168404Spjd * and call spa_lookup() directly. 
3263168404Spjd */ 3264168404Spjd if (altroot) { 3265168404Spjd if (spa == NULL) { 3266168404Spjd mutex_enter(&spa_namespace_lock); 3267168404Spjd spa = spa_lookup(name); 3268168404Spjd if (spa) 3269168404Spjd spa_altroot(spa, altroot, buflen); 3270168404Spjd else 3271168404Spjd altroot[0] = '\0'; 3272168404Spjd spa = NULL; 3273168404Spjd mutex_exit(&spa_namespace_lock); 3274168404Spjd } else { 3275168404Spjd spa_altroot(spa, altroot, buflen); 3276168404Spjd } 3277168404Spjd } 3278168404Spjd 3279209962Smm if (spa != NULL) { 3280209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 3281168404Spjd spa_close(spa, FTAG); 3282209962Smm } 3283168404Spjd 3284168404Spjd return (error); 3285168404Spjd} 3286168404Spjd 3287168404Spjd/* 3288185029Spjd * Validate that the auxiliary device array is well formed. We must have an 3289185029Spjd * array of nvlists, each of which describes a valid leaf vdev. If this is an 3290185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3291185029Spjd * specified, as long as they are well-formed. 3292168404Spjd */ 3293168404Spjdstatic int 3294185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3295185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 3296185029Spjd vdev_labeltype_t label) 3297168404Spjd{ 3298185029Spjd nvlist_t **dev; 3299185029Spjd uint_t i, ndev; 3300168404Spjd vdev_t *vd; 3301168404Spjd int error; 3302168404Spjd 3303185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3304185029Spjd 3305168404Spjd /* 3306185029Spjd * It's acceptable to have no devs specified. 3307168404Spjd */ 3308185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3309168404Spjd return (0); 3310168404Spjd 3311185029Spjd if (ndev == 0) 3312249195Smm return (SET_ERROR(EINVAL)); 3313168404Spjd 3314168404Spjd /* 3315185029Spjd * Make sure the pool is formatted with a version that supports this 3316185029Spjd * device type. 3317168404Spjd */ 3318185029Spjd if (spa_version(spa) < version) 3319249195Smm return (SET_ERROR(ENOTSUP)); 3320168404Spjd 3321168404Spjd /* 3322185029Spjd * Set the pending device list so we correctly handle device in-use 3323168404Spjd * checking. 3324168404Spjd */ 3325185029Spjd sav->sav_pending = dev; 3326185029Spjd sav->sav_npending = ndev; 3327168404Spjd 3328185029Spjd for (i = 0; i < ndev; i++) { 3329185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3330168404Spjd mode)) != 0) 3331168404Spjd goto out; 3332168404Spjd 3333168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 3334168404Spjd vdev_free(vd); 3335249195Smm error = SET_ERROR(EINVAL); 3336168404Spjd goto out; 3337168404Spjd } 3338168404Spjd 3339185029Spjd /* 3340185029Spjd * The L2ARC currently only supports disk devices in 3341185029Spjd * kernel context. For user-level testing, we allow it. 
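 * In-kernel, the check below therefore rejects a request along the
 * lines of "zpool add tank cache /some/file" with ENOTBLK, while
 * userland consumers such as ztest may use file-backed cache
 * vdevs.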
3342185029Spjd */ 3343185029Spjd#ifdef _KERNEL 3344185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3345185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3346249195Smm error = SET_ERROR(ENOTBLK); 3347230514Smm vdev_free(vd); 3348185029Spjd goto out; 3349185029Spjd } 3350185029Spjd#endif 3351168404Spjd vd->vdev_top = vd; 3352168404Spjd 3353168404Spjd if ((error = vdev_open(vd)) == 0 && 3354185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 3355185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3356168404Spjd vd->vdev_guid) == 0); 3357168404Spjd } 3358168404Spjd 3359168404Spjd vdev_free(vd); 3360168404Spjd 3361185029Spjd if (error && 3362185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3363168404Spjd goto out; 3364168404Spjd else 3365168404Spjd error = 0; 3366168404Spjd } 3367168404Spjd 3368168404Spjdout: 3369185029Spjd sav->sav_pending = NULL; 3370185029Spjd sav->sav_npending = 0; 3371168404Spjd return (error); 3372168404Spjd} 3373168404Spjd 3374185029Spjdstatic int 3375185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3376185029Spjd{ 3377185029Spjd int error; 3378185029Spjd 3379185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3380185029Spjd 3381185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3382185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3383185029Spjd VDEV_LABEL_SPARE)) != 0) { 3384185029Spjd return (error); 3385185029Spjd } 3386185029Spjd 3387185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3388185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3389185029Spjd VDEV_LABEL_L2CACHE)); 3390185029Spjd} 3391185029Spjd 3392185029Spjdstatic void 3393185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3394185029Spjd const char *config) 3395185029Spjd{ 3396185029Spjd int i; 3397185029Spjd 3398185029Spjd if (sav->sav_config != NULL) { 3399185029Spjd nvlist_t **olddevs; 3400185029Spjd uint_t oldndevs; 3401185029Spjd nvlist_t **newdevs; 3402185029Spjd 3403185029Spjd /* 3404185029Spjd * Generate new dev list by concatenating with the 3405185029Spjd * current dev list. 3406185029Spjd */ 3407185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3408185029Spjd &olddevs, &oldndevs) == 0); 3409185029Spjd 3410185029Spjd newdevs = kmem_alloc(sizeof (void *) * 3411185029Spjd (ndevs + oldndevs), KM_SLEEP); 3412185029Spjd for (i = 0; i < oldndevs; i++) 3413185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3414185029Spjd KM_SLEEP) == 0); 3415185029Spjd for (i = 0; i < ndevs; i++) 3416185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3417185029Spjd KM_SLEEP) == 0); 3418185029Spjd 3419185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 3420185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 3421185029Spjd 3422185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3423185029Spjd config, newdevs, ndevs + oldndevs) == 0); 3424185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 3425185029Spjd nvlist_free(newdevs[i]); 3426185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3427185029Spjd } else { 3428185029Spjd /* 3429185029Spjd * Generate a new dev list. 
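 * (Net effect of the two branches, as a sketch: with an existing
 * spare list [A, B] and a request to add [C], sav_config ends up
 * holding [A, B, C]; with no existing config it simply holds [C].)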
3430185029Spjd */ 3431185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3432185029Spjd KM_SLEEP) == 0); 3433185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3434185029Spjd devs, ndevs) == 0); 3435185029Spjd } 3436185029Spjd} 3437185029Spjd 3438168404Spjd/* 3439185029Spjd * Stop and drop level 2 ARC devices 3440185029Spjd */ 3441185029Spjdvoid 3442185029Spjdspa_l2cache_drop(spa_t *spa) 3443185029Spjd{ 3444185029Spjd vdev_t *vd; 3445185029Spjd int i; 3446185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 3447185029Spjd 3448185029Spjd for (i = 0; i < sav->sav_count; i++) { 3449185029Spjd uint64_t pool; 3450185029Spjd 3451185029Spjd vd = sav->sav_vdevs[i]; 3452185029Spjd ASSERT(vd != NULL); 3453185029Spjd 3454209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3455209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 3456185029Spjd l2arc_remove_vdev(vd); 3457185029Spjd } 3458185029Spjd} 3459185029Spjd 3460185029Spjd/* 3461168404Spjd * Pool Creation 3462168404Spjd */ 3463168404Spjdint 3464185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3465248571Smm nvlist_t *zplprops) 3466168404Spjd{ 3467168404Spjd spa_t *spa; 3468185029Spjd char *altroot = NULL; 3469168404Spjd vdev_t *rvd; 3470168404Spjd dsl_pool_t *dp; 3471168404Spjd dmu_tx_t *tx; 3472219089Spjd int error = 0; 3473168404Spjd uint64_t txg = TXG_INITIAL; 3474185029Spjd nvlist_t **spares, **l2cache; 3475185029Spjd uint_t nspares, nl2cache; 3476219089Spjd uint64_t version, obj; 3477236884Smm boolean_t has_features; 3478168404Spjd 3479168404Spjd /* 3480168404Spjd * If this pool already exists, return failure. 3481168404Spjd */ 3482168404Spjd mutex_enter(&spa_namespace_lock); 3483168404Spjd if (spa_lookup(pool) != NULL) { 3484168404Spjd mutex_exit(&spa_namespace_lock); 3485249195Smm return (SET_ERROR(EEXIST)); 3486168404Spjd } 3487168404Spjd 3488168404Spjd /* 3489168404Spjd * Allocate a new spa_t structure. 3490168404Spjd */ 3491185029Spjd (void) nvlist_lookup_string(props, 3492185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3493219089Spjd spa = spa_add(pool, NULL, altroot); 3494209962Smm spa_activate(spa, spa_mode_global); 3495168404Spjd 3496185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 3497185029Spjd spa_deactivate(spa); 3498185029Spjd spa_remove(spa); 3499185029Spjd mutex_exit(&spa_namespace_lock); 3500185029Spjd return (error); 3501185029Spjd } 3502185029Spjd 3503236884Smm has_features = B_FALSE; 3504236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3505236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3506236884Smm if (zpool_prop_feature(nvpair_name(elem))) 3507236884Smm has_features = B_TRUE; 3508236884Smm } 3509236884Smm 3510236884Smm if (has_features || nvlist_lookup_uint64(props, 3511236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3512185029Spjd version = SPA_VERSION; 3513236884Smm } 3514236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3515219089Spjd 3516219089Spjd spa->spa_first_txg = txg; 3517219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 3518185029Spjd spa->spa_uberblock.ub_version = version; 3519168404Spjd spa->spa_ubsync = spa->spa_uberblock; 3520168404Spjd 3521168404Spjd /* 3522209962Smm * Create "The Godfather" zio to hold all async IOs 3523209962Smm */ 3524209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 3525209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 3526209962Smm 3527209962Smm /* 3528168404Spjd * Create the root vdev. 
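 * The nvroot handed down from userland is an nvlist tree rooted
 * at a "root" vdev. A minimal sketch for a single-disk pool
 * (device path hypothetical):
 *
 *	root
 *	 `-- type="disk", path="/dev/da0"
 *
 * spa_config_parse() below turns that description into the
 * in-core vdev_t tree rooted at rvd.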
3529168404Spjd */ 3530185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3531168404Spjd 3532168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3533168404Spjd 3534168404Spjd ASSERT(error != 0 || rvd != NULL); 3535168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3536168404Spjd 3537185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 3538249195Smm error = SET_ERROR(EINVAL); 3539168404Spjd 3540168404Spjd if (error == 0 && 3541168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3542185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 3543168404Spjd VDEV_ALLOC_ADD)) == 0) { 3544219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 3545254591Sgibbs vdev_ashift_optimize(rvd->vdev_child[c]); 3546219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 3547219089Spjd vdev_expand(rvd->vdev_child[c], txg); 3548219089Spjd } 3549168404Spjd } 3550168404Spjd 3551185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3552168404Spjd 3553168404Spjd if (error != 0) { 3554168404Spjd spa_unload(spa); 3555168404Spjd spa_deactivate(spa); 3556168404Spjd spa_remove(spa); 3557168404Spjd mutex_exit(&spa_namespace_lock); 3558168404Spjd return (error); 3559168404Spjd } 3560168404Spjd 3561168404Spjd /* 3562168404Spjd * Get the list of spares, if specified. 3563168404Spjd */ 3564168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3565168404Spjd &spares, &nspares) == 0) { 3566185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3567168404Spjd KM_SLEEP) == 0); 3568185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3569168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3570185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3571168404Spjd spa_load_spares(spa); 3572185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3573185029Spjd spa->spa_spares.sav_sync = B_TRUE; 3574168404Spjd } 3575168404Spjd 3576185029Spjd /* 3577185029Spjd * Get the list of level 2 cache devices, if specified. 3578185029Spjd */ 3579185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3580185029Spjd &l2cache, &nl2cache) == 0) { 3581185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3582185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 3583185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3584185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3585185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3586185029Spjd spa_load_l2cache(spa); 3587185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3588185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3589185029Spjd } 3590185029Spjd 3591236884Smm spa->spa_is_initializing = B_TRUE; 3592185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3593168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 3594236884Smm spa->spa_is_initializing = B_FALSE; 3595168404Spjd 3596219089Spjd /* 3597219089Spjd * Create DDTs (dedup tables). 3598219089Spjd */ 3599219089Spjd ddt_create(spa); 3600219089Spjd 3601219089Spjd spa_update_dspace(spa); 3602219089Spjd 3603168404Spjd tx = dmu_tx_create_assigned(dp, txg); 3604168404Spjd 3605168404Spjd /* 3606168404Spjd * Create the pool config object. 
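 * The config object is a packed nvlist whose object number is
 * recorded in the pool directory ZAP under DMU_POOL_CONFIG, so a
 * later spa_load() can locate it again, roughly (sketch, error
 * handling elided):
 *
 *	uint64_t obj;
 *	(void) zap_lookup(spa->spa_meta_objset,
 *	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
 *	    sizeof (uint64_t), 1, &obj);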
3607168404Spjd */ 3608168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3609185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3610168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3611168404Spjd 3612168404Spjd if (zap_add(spa->spa_meta_objset, 3613168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3614168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3615168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 3616168404Spjd } 3617168404Spjd 3618236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 3619236884Smm spa_feature_create_zap_objects(spa, tx); 3620236884Smm 3621219089Spjd if (zap_add(spa->spa_meta_objset, 3622219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3623219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 3624219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 3625219089Spjd } 3626219089Spjd 3627185029Spjd /* Newly created pools with the right version are always deflated. */ 3628185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3629185029Spjd spa->spa_deflate = TRUE; 3630185029Spjd if (zap_add(spa->spa_meta_objset, 3631185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3632185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3633185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 3634185029Spjd } 3635168404Spjd } 3636168404Spjd 3637168404Spjd /* 3638219089Spjd * Create the deferred-free bpobj. Turn off compression 3639168404Spjd * because sync-to-convergence takes longer if the blocksize 3640168404Spjd * keeps changing. 3641168404Spjd */ 3642219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3643219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj, 3644168404Spjd ZIO_COMPRESS_OFF, tx); 3645168404Spjd if (zap_add(spa->spa_meta_objset, 3646219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3647219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) { 3648219089Spjd cmn_err(CE_PANIC, "failed to add bpobj"); 3649168404Spjd } 3650219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3651219089Spjd spa->spa_meta_objset, obj)); 3652168404Spjd 3653168404Spjd /* 3654168404Spjd * Create the pool's history object. 3655168404Spjd */ 3656185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 3657185029Spjd spa_history_create_obj(spa, tx); 3658168404Spjd 3659185029Spjd /* 3660185029Spjd * Set pool properties. 3661185029Spjd */ 3662185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3663185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3664185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3665219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3666219089Spjd 3667209962Smm if (props != NULL) { 3668209962Smm spa_configfile_set(spa, props, B_FALSE); 3669248571Smm spa_sync_props(props, tx); 3670209962Smm } 3671185029Spjd 3672168404Spjd dmu_tx_commit(tx); 3673168404Spjd 3674168404Spjd spa->spa_sync_on = B_TRUE; 3675168404Spjd txg_sync_start(spa->spa_dsl_pool); 3676168404Spjd 3677168404Spjd /* 3678168404Spjd * We explicitly wait for the first transaction to complete so that our 3679168404Spjd * bean counters are appropriately updated. 
3680168404Spjd */ 3681168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 3682168404Spjd 3683185029Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 3684168404Spjd 3685248571Smm spa_history_log_version(spa, "create"); 3686185029Spjd 3687208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 3688208442Smm 3689168404Spjd mutex_exit(&spa_namespace_lock); 3690168404Spjd 3691168404Spjd return (0); 3692168404Spjd} 3693168404Spjd 3694241286Savg#ifdef _KERNEL 3695219089Spjd#if defined(sun) 3696185029Spjd/* 3697219089Spjd * Get the root pool information from the root disk, then import the root pool 3698219089Spjd * during the system boot up time. 3699185029Spjd */ 3700219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3701219089Spjd 3702219089Spjdstatic nvlist_t * 3703219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3704185029Spjd{ 3705219089Spjd nvlist_t *config; 3706185029Spjd nvlist_t *nvtop, *nvroot; 3707185029Spjd uint64_t pgid; 3708185029Spjd 3709219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3710219089Spjd return (NULL); 3711219089Spjd 3712168404Spjd /* 3713185029Spjd * Add this top-level vdev to the child array. 3714168404Spjd */ 3715219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3716219089Spjd &nvtop) == 0); 3717219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3718219089Spjd &pgid) == 0); 3719219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3720168404Spjd 3721185029Spjd /* 3722185029Spjd * Put this pool's top-level vdevs into a root vdev. 3723185029Spjd */ 3724185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3725219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3726219089Spjd VDEV_TYPE_ROOT) == 0); 3727185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3728185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3729185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3730185029Spjd &nvtop, 1) == 0); 3731168404Spjd 3732168404Spjd /* 3733185029Spjd * Replace the existing vdev_tree with the new root vdev in 3734185029Spjd * this pool's configuration (remove the old, add the new). 3735168404Spjd */ 3736185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3737185029Spjd nvlist_free(nvroot); 3738219089Spjd return (config); 3739185029Spjd} 3740168404Spjd 3741185029Spjd/* 3742219089Spjd * Walk the vdev tree and see if we can find a device with "better" 3743219089Spjd * configuration. A configuration is "better" if the label on that 3744219089Spjd * device has a more recent txg. 3745185029Spjd */ 3746219089Spjdstatic void 3747219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3748185029Spjd{ 3749219089Spjd for (int c = 0; c < vd->vdev_children; c++) 3750219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3751185029Spjd 3752219089Spjd if (vd->vdev_ops->vdev_op_leaf) { 3753219089Spjd nvlist_t *label; 3754219089Spjd uint64_t label_txg; 3755185029Spjd 3756219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3757219089Spjd &label) != 0) 3758219089Spjd return; 3759185029Spjd 3760219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3761219089Spjd &label_txg) == 0); 3762168404Spjd 3763219089Spjd /* 3764219089Spjd * Do we have a better boot device? 
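 * Concretely: in a two-way mirror where one side dropped out at,
 * say, txg 100 while the survivor kept syncing through txg 250,
 * the survivor's label carries the larger txg and wins here.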
*/ 3766219089Spjd if (label_txg > *txg) { 3767219089Spjd *txg = label_txg; 3768219089Spjd *avd = vd; 3769185029Spjd } 3770219089Spjd nvlist_free(label); 3771185029Spjd } 3772185029Spjd} 3773185029Spjd 3774185029Spjd/* 3775185029Spjd * Import a root pool. 3776185029Spjd * 3777185029Spjd * For x86, devpath_list will consist of devid and/or physpath name of 3778185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3779185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 3780185029Spjd * 3781185029Spjd * For SPARC, devpath_list consists of the physpath name of the booting device 3782185029Spjd * regardless of whether the root pool is a single-device pool or a mirrored pool. 3783185029Spjd * e.g. 3784185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 3785185029Spjd */ 3786185029Spjdint 3787185029Spjdspa_import_rootpool(char *devpath, char *devid) 3788185029Spjd{ 3789219089Spjd spa_t *spa; 3790219089Spjd vdev_t *rvd, *bvd, *avd = NULL; 3791219089Spjd nvlist_t *config, *nvtop; 3792219089Spjd uint64_t guid, txg; 3793185029Spjd char *pname; 3794185029Spjd int error; 3795185029Spjd 3796185029Spjd /* 3797219089Spjd * Read the label from the boot device and generate a configuration. 3798185029Spjd */ 3799219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3800219089Spjd#if defined(_OBP) && defined(_KERNEL) 3801219089Spjd if (config == NULL) { 3802219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) { 3803219089Spjd /* iscsi boot */ 3804219089Spjd get_iscsi_bootpath_phy(devpath); 3805219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3806219089Spjd } 3807219089Spjd } 3808219089Spjd#endif 3809219089Spjd if (config == NULL) { 3810236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3811219089Spjd devpath); 3812249195Smm return (SET_ERROR(EIO)); 3813219089Spjd } 3814185029Spjd 3815219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3816219089Spjd &pname) == 0); 3817219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3818185029Spjd 3819209962Smm mutex_enter(&spa_namespace_lock); 3820209962Smm if ((spa = spa_lookup(pname)) != NULL) { 3821209962Smm /* 3822209962Smm * Remove the existing root pool from the namespace so that we 3823209962Smm * can replace it with the correct config we just read in. 3824209962Smm */ 3825209962Smm spa_remove(spa); 3826209962Smm } 3827185029Spjd 3828219089Spjd spa = spa_add(pname, config, NULL); 3829209962Smm spa->spa_is_root = B_TRUE; 3830219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3831209962Smm 3832219089Spjd /* 3833219089Spjd * Build up a vdev tree based on the boot device's label config. 3834219089Spjd */ 3835219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3836219089Spjd &nvtop) == 0); 3837219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3838219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3839219089Spjd VDEV_ALLOC_ROOTPOOL); 3840219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3841219089Spjd if (error) { 3842209962Smm mutex_exit(&spa_namespace_lock); 3843219089Spjd nvlist_free(config); 3844219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3845219089Spjd pname); 3846219089Spjd return (error); 3847209962Smm } 3848209962Smm 3849219089Spjd /* 3850219089Spjd * Get the boot vdev. 
3851219089Spjd */ 3852219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3853219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3854219089Spjd (u_longlong_t)guid); 3855249195Smm error = SET_ERROR(ENOENT); 3856219089Spjd goto out; 3857219089Spjd } 3858209962Smm 3859219089Spjd /* 3860219089Spjd * Determine if there is a better boot device. 3861219089Spjd */ 3862219089Spjd avd = bvd; 3863219089Spjd spa_alt_rootvdev(rvd, &avd, &txg); 3864219089Spjd if (avd != bvd) { 3865219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3866219089Spjd "try booting from '%s'", avd->vdev_path); 3867249195Smm error = SET_ERROR(EINVAL); 3868219089Spjd goto out; 3869219089Spjd } 3870209962Smm 3871219089Spjd /* 3872219089Spjd * If the boot device is part of a spare vdev then ensure that 3873219089Spjd * we're booting off the active spare. 3874219089Spjd */ 3875219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3876219089Spjd !bvd->vdev_isspare) { 3877219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. Please " 3878219089Spjd "try booting from '%s'", 3879219089Spjd bvd->vdev_parent-> 3880219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3881249195Smm error = SET_ERROR(EINVAL); 3882219089Spjd goto out; 3883219089Spjd } 3884209962Smm 3885219089Spjd error = 0; 3886219089Spjdout: 3887219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3888219089Spjd vdev_free(rvd); 3889219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3890209962Smm mutex_exit(&spa_namespace_lock); 3891209962Smm 3892219089Spjd nvlist_free(config); 3893219089Spjd return (error); 3894185029Spjd} 3895185029Spjd 3896241286Savg#else 3897241286Savg 3898243502Savgextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 3899243502Savg uint64_t *count); 3900241286Savg 3901241286Savgstatic nvlist_t * 3902241286Savgspa_generate_rootconf(const char *name) 3903241286Savg{ 3904243502Savg nvlist_t **configs, **tops; 3905241286Savg nvlist_t *config; 3906243502Savg nvlist_t *best_cfg, *nvtop, *nvroot; 3907243502Savg uint64_t *holes; 3908243502Savg uint64_t best_txg; 3909243213Savg uint64_t nchildren; 3910241286Savg uint64_t pgid; 3911243502Savg uint64_t count; 3912243502Savg uint64_t i; 3913243502Savg uint_t nholes; 3914241286Savg 3915243502Savg if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 3916241286Savg return (NULL); 3917241286Savg 3918243502Savg ASSERT3U(count, !=, 0); 3919243502Savg best_txg = 0; 3920243502Savg for (i = 0; i < count; i++) { 3921243502Savg uint64_t txg; 3922243502Savg 3923243502Savg VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 3924243502Savg &txg) == 0); 3925243502Savg if (txg > best_txg) { 3926243502Savg best_txg = txg; 3927243502Savg best_cfg = configs[i]; 3928243502Savg } 3929243502Savg } 3930243502Savg 3931245945Savg nchildren = 1; 3932245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 3933243502Savg holes = NULL; 3934243502Savg nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 3935243502Savg &holes, &nholes); 3936243502Savg 3937244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 3938243502Savg for (i = 0; i < nchildren; i++) { 3939243502Savg if (i >= count) 3940243502Savg break; 3941243502Savg if (configs[i] == NULL) 3942243502Savg continue; 3943243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 3944243502Savg &nvtop) == 0); 3945243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP); 3946243213Savg } 
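	/*
	 * At this point tops[] may still have gaps: indices named in the
	 * hole array receive explicit VDEV_TYPE_HOLE entries below, and
	 * any remaining slot with no recovered label gets a
	 * VDEV_TYPE_MISSING placeholder, so the child array always carries
	 * exactly nchildren well-formed entries. E.g. with nchildren == 3,
	 * a label found only for id 0 and a hole at id 1, the result is
	 * [disk, hole, missing].
	 */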
3947243502Savg for (i = 0; holes != NULL && i < nholes; i++) { 3948243502Savg if (i >= nchildren) 3949243502Savg continue; 3950243502Savg if (tops[holes[i]] != NULL) 3951243502Savg continue; 3952243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 3953243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 3954243502Savg VDEV_TYPE_HOLE) == 0); 3955243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 3956243502Savg holes[i]) == 0); 3957243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 3958243502Savg 0) == 0); 3959243502Savg } 3960243502Savg for (i = 0; i < nchildren; i++) { 3961243502Savg if (tops[i] != NULL) 3962243502Savg continue; 3963243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 3964243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 3965243502Savg VDEV_TYPE_MISSING) == 0); 3966243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 3967243502Savg i) == 0); 3968243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 3969243502Savg 0) == 0); 3970243502Savg } 3971243213Savg 3972243213Savg /* 3973243502Savg * Create pool config based on the best vdev config. 3974241286Savg */ 3975243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 3976241286Savg 3977241286Savg /* 3978241286Savg * Put this pool's top-level vdevs into a root vdev. 3979241286Savg */ 3980243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3981243502Savg &pgid) == 0); 3982241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3983241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3984241286Savg VDEV_TYPE_ROOT) == 0); 3985241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3986241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3987241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3988243502Savg tops, nchildren) == 0); 3989241286Savg 3990241286Savg /* 3991241286Savg * Replace the existing vdev_tree with the new root vdev in 3992241286Savg * this pool's configuration (remove the old, add the new). 3993241286Savg */ 3994241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3995243502Savg 3996243502Savg /* 3997243502Savg * Drop vdev config elements that should not be present at pool level. 3998243502Savg */ 3999243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 4000243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 4001243502Savg 4002243502Savg for (i = 0; i < count; i++) 4003243502Savg nvlist_free(configs[i]); 4004243502Savg kmem_free(configs, count * sizeof(void *)); 4005243502Savg for (i = 0; i < nchildren; i++) 4006243502Savg nvlist_free(tops[i]); 4007243502Savg kmem_free(tops, nchildren * sizeof(void *)); 4008241286Savg nvlist_free(nvroot); 4009241286Savg return (config); 4010241286Savg} 4011241286Savg 4012241286Savgint 4013241286Savgspa_import_rootpool(const char *name) 4014241286Savg{ 4015241286Savg spa_t *spa; 4016241286Savg vdev_t *rvd, *bvd, *avd = NULL; 4017241286Savg nvlist_t *config, *nvtop; 4018241286Savg uint64_t txg; 4019241286Savg char *pname; 4020241286Savg int error; 4021241286Savg 4022241286Savg /* 4023241286Savg * Read the label from the boot device and generate a configuration. 
4024241286Savg */ 4025241286Savg config = spa_generate_rootconf(name); 4026243213Savg 4027243213Savg mutex_enter(&spa_namespace_lock); 4028243213Savg if (config != NULL) { 4029243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4030243213Savg &pname) == 0 && strcmp(name, pname) == 0); 4031243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 4032243213Savg == 0); 4033243213Savg 4034243213Savg if ((spa = spa_lookup(pname)) != NULL) { 4035243213Savg /* 4036243213Savg * Remove the existing root pool from the namespace so 4037243213Savg * that we can replace it with the correct config 4038243213Savg * we just read in. 4039243213Savg */ 4040243213Savg spa_remove(spa); 4041243213Savg } 4042243213Savg spa = spa_add(pname, config, NULL); 4043243501Savg 4044243501Savg /* 4045243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 4046243501Savg * via spa_version(). 4047243501Savg */ 4048243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4049243501Savg &spa->spa_ubsync.ub_version) != 0) 4050243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4051243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 4052241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 4053241286Savg name); 4054241286Savg return (EIO); 4055243213Savg } else { 4056243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 4057241286Savg } 4058241286Savg spa->spa_is_root = B_TRUE; 4059241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4060241286Savg 4061241286Savg /* 4062241286Savg * Build up a vdev tree based on the boot device's label config. 4063241286Savg */ 4064241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4065241286Savg &nvtop) == 0); 4066241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4067241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4068241286Savg VDEV_ALLOC_ROOTPOOL); 4069241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4070241286Savg if (error) { 4071241286Savg mutex_exit(&spa_namespace_lock); 4072241286Savg nvlist_free(config); 4073241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4074241286Savg pname); 4075241286Savg return (error); 4076241286Savg } 4077241286Savg 4078241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4079241286Savg vdev_free(rvd); 4080241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4081241286Savg mutex_exit(&spa_namespace_lock); 4082241286Savg 4083243213Savg nvlist_free(config); 4084243213Savg return (0); 4085241286Savg} 4086241286Savg 4087241286Savg#endif /* sun */ 4088219089Spjd#endif 4089219089Spjd 4090209962Smm/* 4091209962Smm * Import a non-root pool into the system. 4092209962Smm */ 4093185029Spjdint 4094219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4095185029Spjd{ 4096209962Smm spa_t *spa; 4097209962Smm char *altroot = NULL; 4098219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 4099219089Spjd zpool_rewind_policy_t policy; 4100219089Spjd uint64_t mode = spa_mode_global; 4101219089Spjd uint64_t readonly = B_FALSE; 4102209962Smm int error; 4103209962Smm nvlist_t *nvroot; 4104209962Smm nvlist_t **spares, **l2cache; 4105209962Smm uint_t nspares, nl2cache; 4106209962Smm 4107209962Smm /* 4108209962Smm * If a pool with this name exists, return failure. 
4109209962Smm */ 4110209962Smm mutex_enter(&spa_namespace_lock); 4111219089Spjd if (spa_lookup(pool) != NULL) { 4112209962Smm mutex_exit(&spa_namespace_lock); 4113249195Smm return (SET_ERROR(EEXIST)); 4114209962Smm } 4115209962Smm 4116209962Smm /* 4117209962Smm * Create and initialize the spa structure. 4118209962Smm */ 4119209962Smm (void) nvlist_lookup_string(props, 4120209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4121219089Spjd (void) nvlist_lookup_uint64(props, 4122219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4123219089Spjd if (readonly) 4124219089Spjd mode = FREAD; 4125219089Spjd spa = spa_add(pool, config, altroot); 4126219089Spjd spa->spa_import_flags = flags; 4127209962Smm 4128209962Smm /* 4129219089Spjd * Verbatim import - Take a pool and insert it into the namespace 4130219089Spjd * as if it had been loaded at boot. 4131219089Spjd */ 4132219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4133219089Spjd if (props != NULL) 4134219089Spjd spa_configfile_set(spa, props, B_FALSE); 4135219089Spjd 4136219089Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 4137219089Spjd 4138219089Spjd mutex_exit(&spa_namespace_lock); 4139219089Spjd return (0); 4140219089Spjd } 4141219089Spjd 4142219089Spjd spa_activate(spa, mode); 4143219089Spjd 4144219089Spjd /* 4145209962Smm * Don't start async tasks until we know everything is healthy. 4146209962Smm */ 4147209962Smm spa_async_suspend(spa); 4148209962Smm 4149219089Spjd zpool_get_rewind_policy(config, &policy); 4150219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 4151219089Spjd state = SPA_LOAD_RECOVER; 4152219089Spjd 4153209962Smm /* 4154209962Smm * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4155209962Smm * because the user-supplied config is actually the one to trust when 4156209962Smm * doing an import. 4157209962Smm */ 4158219089Spjd if (state != SPA_LOAD_RECOVER) 4159219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4160209962Smm 4161219089Spjd error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4162219089Spjd policy.zrp_request); 4163219089Spjd 4164219089Spjd /* 4165219089Spjd * Propagate anything learned while loading the pool and pass it 4166219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 4167219089Spjd */ 4168219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4169219089Spjd spa->spa_load_info) == 0); 4170219089Spjd 4171209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4172209962Smm /* 4173209962Smm * Toss any existing sparelist, as it doesn't have any validity 4174209962Smm * anymore, and conflicts with spa_has_spare(). 
4175209962Smm */ 4176209962Smm if (spa->spa_spares.sav_config) { 4177209962Smm nvlist_free(spa->spa_spares.sav_config); 4178209962Smm spa->spa_spares.sav_config = NULL; 4179209962Smm spa_load_spares(spa); 4180209962Smm } 4181209962Smm if (spa->spa_l2cache.sav_config) { 4182209962Smm nvlist_free(spa->spa_l2cache.sav_config); 4183209962Smm spa->spa_l2cache.sav_config = NULL; 4184209962Smm spa_load_l2cache(spa); 4185209962Smm } 4186209962Smm 4187209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4188209962Smm &nvroot) == 0); 4189209962Smm if (error == 0) 4190209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4191209962Smm VDEV_ALLOC_SPARE); 4192209962Smm if (error == 0) 4193209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4194209962Smm VDEV_ALLOC_L2CACHE); 4195209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4196209962Smm 4197209962Smm if (props != NULL) 4198209962Smm spa_configfile_set(spa, props, B_FALSE); 4199209962Smm 4200209962Smm if (error != 0 || (props && spa_writeable(spa) && 4201209962Smm (error = spa_prop_set(spa, props)))) { 4202209962Smm spa_unload(spa); 4203209962Smm spa_deactivate(spa); 4204209962Smm spa_remove(spa); 4205209962Smm mutex_exit(&spa_namespace_lock); 4206209962Smm return (error); 4207209962Smm } 4208209962Smm 4209209962Smm spa_async_resume(spa); 4210209962Smm 4211209962Smm /* 4212209962Smm * Override any spares and level 2 cache devices as specified by 4213209962Smm * the user, as these may have correct device names/devids, etc. 4214209962Smm */ 4215209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4216209962Smm &spares, &nspares) == 0) { 4217209962Smm if (spa->spa_spares.sav_config) 4218209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4219209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4220209962Smm else 4221209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4222209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4223209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4224209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4225209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4226209962Smm spa_load_spares(spa); 4227209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4228209962Smm spa->spa_spares.sav_sync = B_TRUE; 4229209962Smm } 4230209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4231209962Smm &l2cache, &nl2cache) == 0) { 4232209962Smm if (spa->spa_l2cache.sav_config) 4233209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4234209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4235209962Smm else 4236209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4237209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4238209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4239209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4240209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4241209962Smm spa_load_l2cache(spa); 4242209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4243209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 4244209962Smm } 4245209962Smm 4246219089Spjd /* 4247219089Spjd * Check for any removed devices. 4248219089Spjd */ 4249219089Spjd if (spa->spa_autoreplace) { 4250219089Spjd spa_aux_check_removed(&spa->spa_spares); 4251219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 4252219089Spjd } 4253219089Spjd 4254209962Smm if (spa_writeable(spa)) { 4255209962Smm /* 4256209962Smm * Update the config cache to include the newly-imported pool. 
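 * spa_config_update(spa, SPA_CONFIG_UPDATE_POOL) rewrites the
 * on-disk cache file (by default /boot/zfs/zpool.cache on
 * FreeBSD) so that the freshly imported pool is found again at
 * the next boot.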
4257209962Smm */ 4258209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4259209962Smm } 4260209962Smm 4261219089Spjd /* 4262219089Spjd * It's possible that the pool was expanded while it was exported. 4263219089Spjd * We kick off an async task to handle this for us. 4264219089Spjd */ 4265219089Spjd spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4266219089Spjd 4267209962Smm mutex_exit(&spa_namespace_lock); 4268248571Smm spa_history_log_version(spa, "import"); 4269209962Smm 4270219089Spjd#ifdef __FreeBSD__ 4271219089Spjd#ifdef _KERNEL 4272219089Spjd zvol_create_minors(pool); 4273219089Spjd#endif 4274219089Spjd#endif 4275209962Smm return (0); 4276185029Spjd} 4277185029Spjd 4278168404Spjdnvlist_t * 4279168404Spjdspa_tryimport(nvlist_t *tryconfig) 4280168404Spjd{ 4281168404Spjd nvlist_t *config = NULL; 4282168404Spjd char *poolname; 4283168404Spjd spa_t *spa; 4284168404Spjd uint64_t state; 4285208443Smm int error; 4286168404Spjd 4287168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4288168404Spjd return (NULL); 4289168404Spjd 4290168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4291168404Spjd return (NULL); 4292168404Spjd 4293168404Spjd /* 4294168404Spjd * Create and initialize the spa structure. 4295168404Spjd */ 4296168404Spjd mutex_enter(&spa_namespace_lock); 4297219089Spjd spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4298209962Smm spa_activate(spa, FREAD); 4299168404Spjd 4300168404Spjd /* 4301168404Spjd * Pass off the heavy lifting to spa_load(). 4302168404Spjd * Pass TRUE for mosconfig because the user-supplied config 4303168404Spjd * is actually the one to trust when doing an import. 4304168404Spjd */ 4305219089Spjd error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4306168404Spjd 4307168404Spjd /* 4308168404Spjd * If 'tryconfig' was at least parsable, return the current config. 4309168404Spjd */ 4310168404Spjd if (spa->spa_root_vdev != NULL) { 4311168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4312168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4313168404Spjd poolname) == 0); 4314168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4315168404Spjd state) == 0); 4316168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4317168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 4318236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4319236884Smm spa->spa_load_info) == 0); 4320168404Spjd 4321168404Spjd /* 4322185029Spjd * If the bootfs property exists on this pool then we 4323185029Spjd * copy it out so that external consumers can tell which 4324185029Spjd * pools are bootable. 4325168404Spjd */ 4326208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 4327185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4328185029Spjd 4329185029Spjd /* 4330185029Spjd * We have to play games with the name since the 4331185029Spjd * pool was opened as TRYIMPORT_NAME. 
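 * E.g. if the pool being probed is really "tank" and bootfs
 * resolves to "$import/ROOT/default" (TRYIMPORT_NAME being the
 * "$import" placeholder), the code below splices the real pool
 * name back in to yield "tank/ROOT/default".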

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure
 * there is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing
 * the configuration from the cache afterwards.  If the 'hardforce' flag is
 * set, then we don't sync the labels or remove the configuration cache.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode_global & FWRITE))
		return (SET_ERROR(EROFS));

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENOENT));
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(EBUSY));
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool.  At the user's discretion, such a
		 * pool can still be forcibly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(EXDEV));
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) +
			    TXG_DEFER_SIZE + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_config_sync(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
    boolean_t hardforce)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
	    force, hardforce));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
	    B_FALSE, B_FALSE));
}
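
/*
 * Illustrative sketch (not part of the original file): the three exported
 * entry points above differ only in the pool state they hand to
 * spa_export_common().  A plain export fails with EBUSY/EXDEV rather than
 * forcing; 'oldconfig' hands back a duplicate the caller must free.
 * Guarded by the hypothetical ZFS_SPA_EXAMPLES define used by the other
 * sketches in this file.
 */
#ifdef ZFS_SPA_EXAMPLES
static int
spa_export_example(char *pool)
{
	nvlist_t *oldconfig = NULL;
	int error;

	error = spa_export(pool, &oldconfig, B_FALSE, B_FALSE);
	if (error == 0 && oldconfig != NULL)
		nvlist_free(oldconfig);	/* caller owns the duplicate */
	return (error);
}
#endif	/* ZFS_SPA_EXAMPLES */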

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add a device to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg, id;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares) != 0)
		nspares = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	    &nl2cache) != 0)
		nl2cache = 0;

	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0 &&
	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * We must validate the spares and l2cache devices after checking the
	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {

		/*
		 * Set the vdev id to the first hole, if one exists.
		 */
		for (id = 0; id < rvd->vdev_children; id++) {
			if (rvd->vdev_child[id]->vdev_ishole) {
				vdev_free(rvd->vdev_child[id]);
				break;
			}
		}
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = id;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
		    ZPOOL_CONFIG_SPARES);
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
		    ZPOOL_CONFIG_L2CACHE);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}
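
/*
 * Illustrative sketch (not part of the original file): spa_vdev_add()
 * expects an nvroot shaped like the one zpool(8) builds -- a root nvlist
 * whose ZPOOL_CONFIG_CHILDREN array describes the new top-level vdevs.
 * A minimal single-disk add might look like this; the device path is a
 * placeholder.  Guarded by the hypothetical ZFS_SPA_EXAMPLES define.
 */
#ifdef ZFS_SPA_EXAMPLES
static int
spa_vdev_add_example(spa_t *spa)
{
	nvlist_t *nvroot, *disk;
	int error;

	VERIFY(nvlist_alloc(&disk, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_DISK) == 0);
	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_PATH,
	    "/dev/da9") == 0);		/* placeholder device */

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &disk, 1) == 0);

	error = spa_vdev_add(spa, nvroot);

	nvlist_free(nvroot);
	nvlist_free(disk);
	return (error);
}
#endif	/* ZFS_SPA_EXAMPLES */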

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, dtl_max_txg;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	char *oldvdpath, *newvdpath;
	int newvd_isspare;
	int error;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ATTACH)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	if (newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Spares can't replace logs
	 */
	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the
		 * root vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    oldvd->vdev_isspare &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise,
		 * we want to create a replacing vdev.  The user is not allowed
		 * to attach to a spared vdev child unless the 'isspare' state
		 * is the same (spare replaces spare, non-spare replaces
		 * non-spare).
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops &&
		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		} else if (pvd->vdev_ops == &vdev_spare_ops &&
		    newvd->vdev_isspare != oldvd->vdev_isspare) {
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		}

		if (newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Make sure the new device is big enough.
	 */
	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/* mark the device being resilvered */
	newvd->vdev_resilver_txg = txg;

	/*
	 * If the parent is not a mirror, or if we're replacing, insert the new
	 * mirror/replacing/spare vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	newvd->vdev_crtxg = oldvd->vdev_crtxg;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
	 * for any dmu_sync-ed blocks.  It will propagate upward when
	 * spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	dtl_max_txg = txg + TXG_CONCURRENT_STATES;

	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
	    dtl_max_txg - TXG_INITIAL);

	if (newvd->vdev_isspare) {
		spa_spare_activate(newvd);
		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
	}

	oldvdpath = spa_strdup(oldvd->vdev_path);
	newvdpath = spa_strdup(newvd->vdev_path);
	newvd_isspare = newvd->vdev_isspare;

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	/*
	 * Schedule the resilver to restart in the future.  We do this to
	 * ensure that dmu_sync-ed blocks have been stitched into the
	 * respective datasets.
	 */
	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);

	/*
	 * Commit the config
	 */
	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);

	spa_history_log_internal(spa, "vdev attach", NULL,
	    "%s vdev=%s %s vdev=%s",
	    replacing && newvd_isspare ? "spare in" :
	    replacing ? "replace" : "attach", newvdpath,
	    replacing ? "for" : "to", oldvdpath);

	spa_strfree(oldvdpath);
	spa_strfree(newvdpath);

	if (spa->spa_bootfs)
		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);

	return (0);
}
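
/*
 * Illustrative sketch (not part of the original file): replacing a failed
 * leaf by guid.  The nvroot must wrap exactly one leaf vdev, matching the
 * layout spa_vdev_attach() checks for above.  Device path and guid are
 * placeholders; guarded by the hypothetical ZFS_SPA_EXAMPLES define.
 */
#ifdef ZFS_SPA_EXAMPLES
static int
spa_vdev_replace_example(spa_t *spa, uint64_t old_guid)
{
	nvlist_t *nvroot, *disk;
	int error;

	VERIFY(nvlist_alloc(&disk, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_DISK) == 0);
	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_PATH,
	    "/dev/da10") == 0);		/* placeholder replacement disk */

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &disk, 1) == 0);

	/* replacing=1 inserts a 'replacing' vdev above the old leaf */
	error = spa_vdev_attach(spa, old_guid, nvroot, 1);

	nvlist_free(nvroot);
	nvlist_free(disk);
	return (error);
}
#endif	/* ZFS_SPA_EXAMPLES */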

/*
 * Detach a device from a mirror or replacing vdev.
 *
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
{
	uint64_t txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid = 0;
	char *vdpath;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If the parent/child relationship is not as expected, don't do it.
	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
	 * vdev that's replacing B with C.  The user's intent in replacing
	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
	 * the replace by detaching C, the expected behavior is to end up
	 * M(A,B).  But suppose that right after deciding to detach C,
	 * the replacement of B completes.  We would have M(A,C), and then
	 * ask to detach C, which would leave us with just A -- not what
	 * the user wanted.  To prevent this, we make sure that the
	 * parent/child relationship hasn't changed -- in this example,
	 * that C's parent is still the replacing vdev R.
	 */
	if (pvd->vdev_guid != pguid && pguid != 0)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Only 'replacing' or 'spare' vdevs can be replaced.
	 */
	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= SPA_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If this device has the only valid copy of some data,
	 * we cannot safely detach it.
	 */
	if (vdev_dtl_required(vd))
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	ASSERT(pvd->vdev_children >= 2);

	/*
	 * If we are detaching the second disk from a replacing vdev, then
	 * check to see if we changed the original vdev's path to have "/old"
	 * at the end in spa_vdev_attach().  If so, undo that change now.
	 */
	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
	    vd->vdev_path != NULL) {
		size_t len = strlen(vd->vdev_path);

		for (int c = 0; c < pvd->vdev_children; c++) {
			cvd = pvd->vdev_child[c];

			if (cvd == vd || cvd->vdev_path == NULL)
				continue;

			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
			    strcmp(cvd->vdev_path + len, "/old") == 0) {
				spa_strfree(cvd->vdev_path);
				cvd->vdev_path = spa_strdup(vd->vdev_path);
				break;
			}
		}
	}

	/*
	 * If we are detaching the original disk from a spare, then it implies
	 * that the spare should become a real disk, and be removed from the
	 * active spare list for the pool.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_id == 0 &&
	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
		unspare = B_TRUE;

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[pvd->vdev_children - 1];

	/*
	 * If we need to remove the remaining child from the list of hot
	 * spares, do it now, marking the vdev as no longer a spare in the
	 * process.  We must do this before vdev_remove_parent(), because that
	 * can change the GUID if it creates a new toplevel GUID.  For a
	 * similar reason, we must remove the spare now, in the same txg as
	 * the detach; otherwise someone could attach a new sibling, change
	 * the GUID, and the subsequent attempt to spa_vdev_remove(unspare_guid)
	 * would fail.
	 */
	if (unspare) {
		ASSERT(cvd->vdev_isspare);
		spa_spare_remove(cvd);
		unspare_guid = cvd->vdev_guid;
		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		cvd->vdev_unspare = B_TRUE;
	}

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1) {
		if (pvd->vdev_ops == &vdev_spare_ops)
			cvd->vdev_unspare = B_FALSE;
		vdev_remove_parent(cvd);
	}

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reevaluate the parent vdev state.
	 */
	vdev_propagate_state(cvd);

	/*
	 * If the 'autoexpand' property is set on the pool then automatically
	 * try to expand the size of the pool.  For example if the device we
	 * just detached was smaller than the others, it may be possible to
	 * add metaslabs (i.e. grow the pool).  We need to reopen the vdev
	 * first so that we can obtain the updated sizes of the leaf vdevs.
	 */
	if (spa->spa_autoexpand) {
		vdev_reopen(tvd);
		vdev_expand(tvd, txg);
	}

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list, to
	 * prevent vd from being accessed after it's freed.
	 */
	vdpath = spa_strdup(vd->vdev_path);
	for (int t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);

	/* hang on to the spa before we release the lock */
	spa_open_ref(spa, FTAG);

	error = spa_vdev_exit(spa, vd, txg, 0);

	spa_history_log_internal(spa, "detach", NULL,
	    "vdev=%s", vdpath);
	spa_strfree(vdpath);

	/*
	 * If this was the removal of the original device in a hot spare vdev,
	 * then we want to go through and remove the device from the hot spare
	 * list of every other pool.
	 */
	if (unspare) {
		spa_t *altspa = NULL;

		mutex_enter(&spa_namespace_lock);
		while ((altspa = spa_next(altspa)) != NULL) {
			if (altspa->spa_state != POOL_STATE_ACTIVE ||
			    altspa == spa)
				continue;

			spa_open_ref(altspa, FTAG);
			mutex_exit(&spa_namespace_lock);
			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
			mutex_enter(&spa_namespace_lock);
			spa_close(altspa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);

		/* search the rest of the vdevs for spares to remove */
		spa_vdev_resilver_done(spa);
	}

	/* all done with the spa; OK to release */
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);
	mutex_exit(&spa_namespace_lock);

	return (error);
}
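
/*
 * Illustrative sketch (not part of the original file): cancelling a
 * replacement by detaching the new child.  Passing the expected parent
 * guid guards against the M(A,R(B,C)) race described above; replace_done
 * is left clear so any mirror-like parent qualifies.  Guarded by the
 * hypothetical ZFS_SPA_EXAMPLES define.
 */
#ifdef ZFS_SPA_EXAMPLES
static int
spa_vdev_detach_example(spa_t *spa, uint64_t child_guid, uint64_t parent_guid)
{
	/* B_FALSE: detach from mirror, replacing, or spare parents alike */
	return (spa_vdev_detach(spa, child_guid, parent_guid, B_FALSE));
}
#endif	/* ZFS_SPA_EXAMPLES */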

/*
 * Split a set of devices from their mirrors, and create a new pool from them.
 */
int
spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
    nvlist_t *props, boolean_t exp)
{
	int error = 0;
	uint64_t txg, *glist;
	spa_t *newspa;
	uint_t c, children, lastlog;
	nvlist_t **child, *nvl, *tmp;
	dmu_tx_t *tx;
	char *altroot = NULL;
	vdev_t *rvd, **vml = NULL;		/* vdev modify list */
	boolean_t activate_slog;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	/* clear the log and flush everything up to now */
	activate_slog = spa_passivate_log(spa);
	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
	error = spa_offline_log(spa);
	txg = spa_vdev_config_enter(spa);

	if (activate_slog)
		spa_activate_log(spa);

	if (error != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	/* check new spa name before going any further */
	if (spa_lookup(newname) != NULL)
		return (spa_vdev_exit(spa, NULL, txg, EEXIST));

	/*
	 * scan through all the children to ensure they're all mirrors
	 */
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* first, check to ensure we've got the right child count */
	rvd = spa->spa_root_vdev;
	lastlog = 0;
	for (c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		/* don't count the holes & logs as children */
		if (vd->vdev_islog || vd->vdev_ishole) {
			if (lastlog == 0)
				lastlog = c;
			continue;
		}

		lastlog = 0;
	}
	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* next, ensure no spare or cache devices are part of the split */
	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);

	/* then, loop over each vdev and validate it */
	for (c = 0; c < children; c++) {
		uint64_t is_hole = 0;

		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
		    &is_hole);

		if (is_hole != 0) {
			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
				continue;
			} else {
				error = SET_ERROR(EINVAL);
				break;
			}
		}

		/* which disk is going to be split? */
		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
		    &glist[c]) != 0) {
			error = SET_ERROR(EINVAL);
			break;
		}

		/* look it up in the spa */
		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
		if (vml[c] == NULL) {
			error = SET_ERROR(ENODEV);
			break;
		}

		/* make sure there's nothing stopping the split */
		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
		    vml[c]->vdev_islog ||
		    vml[c]->vdev_ishole ||
		    vml[c]->vdev_isspare ||
		    vml[c]->vdev_isl2cache ||
		    !vdev_writeable(vml[c]) ||
		    vml[c]->vdev_children != 0 ||
		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
			error = SET_ERROR(EINVAL);
			break;
		}

		if (vdev_dtl_required(vml[c])) {
			error = SET_ERROR(EBUSY);
			break;
		}

		/* we need certain info from the top level */
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
		    vml[c]->vdev_top->vdev_ms_array) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
		    vml[c]->vdev_top->vdev_ms_shift) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
		    vml[c]->vdev_top->vdev_asize) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
		    vml[c]->vdev_top->vdev_ashift) == 0);
	}

	if (error != 0) {
		kmem_free(vml, children * sizeof (vdev_t *));
		kmem_free(glist, children * sizeof (uint64_t));
		return (spa_vdev_exit(spa, NULL, txg, error));
	}

	/* stop writers from using the disks */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL)
			vml[c]->vdev_offline = B_TRUE;
	}
	vdev_reopen(spa->spa_root_vdev);

	/*
	 * Temporarily record the splitting vdevs in the spa config.  This
	 * will disappear once the config is regenerated.
	 */
	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    glist, children) == 0);
	kmem_free(glist, children * sizeof (uint64_t));

	mutex_enter(&spa->spa_props_lock);
	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
	    nvl) == 0);
	mutex_exit(&spa->spa_props_lock);
	spa->spa_config_splitting = nvl;
	vdev_config_dirty(spa->spa_root_vdev);

	/* configure and create the new pool */
	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
	    spa_version(spa)) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    spa->spa_config_txg) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    spa_generate_guid(NULL)) == 0);
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);

	/* add the new pool to the namespace */
	newspa = spa_add(newname, config, altroot);
	newspa->spa_config_txg = spa->spa_config_txg;
	spa_set_log_state(newspa, SPA_LOG_CLEAR);

	/* release the spa config lock, retaining the namespace lock */
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 1);

	spa_activate(newspa, spa_mode_global);
	spa_async_suspend(newspa);

#ifndef sun
	/* mark that we are creating new spa by splitting */
	newspa->spa_splitting_newspa = B_TRUE;
#endif
	/* create the new pool from the disks of the original pool */
	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
#ifndef sun
	newspa->spa_splitting_newspa = B_FALSE;
#endif
	if (error)
		goto out;

	/* if that worked, generate a real config for the new pool */
	if (newspa->spa_root_vdev != NULL) {
		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
		    B_TRUE));
	}

	/* set the props */
	if (props != NULL) {
		spa_configfile_set(newspa, props, B_FALSE);
		error = spa_prop_set(newspa, props);
		if (error)
			goto out;
	}

	/* flush everything */
	txg = spa_vdev_config_enter(newspa);
	vdev_config_dirty(newspa->spa_root_vdev);
	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 2);

	spa_async_resume(newspa);

	/* finally, update the original pool's config */
	txg = spa_vdev_config_enter(spa);
	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0)
		dmu_tx_abort(tx);
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL) {
			vdev_split(vml[c]);
			if (error == 0)
				spa_history_log_internal(spa, "detach", tx,
				    "vdev=%s", vml[c]->vdev_path);
			vdev_free(vml[c]);
		}
	}
	vdev_config_dirty(spa->spa_root_vdev);
	spa->spa_config_splitting = NULL;
	nvlist_free(nvl);
	if (error == 0)
		dmu_tx_commit(tx);
	(void) spa_vdev_exit(spa, NULL, txg, 0);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 3);

	/* split is complete; log a history record */
	spa_history_log_internal(newspa, "split", NULL,
	    "from pool %s", spa_name(spa));

	kmem_free(vml, children * sizeof (vdev_t *));

	/* if we're not going to mount the filesystems in userland, export */
	if (exp)
		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
		    B_FALSE, B_FALSE);

	return (error);

out:
	spa_unload(newspa);
	spa_deactivate(newspa);
	spa_remove(newspa);

	txg = spa_vdev_config_enter(spa);

	/* re-online all offlined disks */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL)
			vml[c]->vdev_offline = B_FALSE;
	}
	vdev_reopen(spa->spa_root_vdev);

	nvlist_free(spa->spa_config_splitting);
	spa->spa_config_splitting = NULL;
	(void) spa_vdev_exit(spa, NULL, txg, error);

	kmem_free(vml, children * sizeof (vdev_t *));
	return (error);
}
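
/*
 * Illustrative sketch (not part of the original file): a caller-side view
 * of spa_vdev_split_mirror().  The config nvlist names one leaf per
 * top-level mirror (by ZPOOL_CONFIG_GUID) under ZPOOL_CONFIG_VDEV_TREE /
 * ZPOOL_CONFIG_CHILDREN; in practice the ioctl path builds it from the
 * zpool(8) arguments.  Guarded by the hypothetical ZFS_SPA_EXAMPLES define.
 */
#ifdef ZFS_SPA_EXAMPLES
static int
spa_split_example(spa_t *spa, char *newname, nvlist_t *split_config)
{
	/* exp = B_TRUE: leave the new pool exported rather than active */
	return (spa_vdev_split_mirror(spa, newname, split_config, NULL,
	    B_TRUE));
}
#endif	/* ZFS_SPA_EXAMPLES */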

static nvlist_t *
spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
{
	for (int i = 0; i < count; i++) {
		uint64_t guid;

		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		if (guid == target_guid)
			return (nvpp[i]);
	}

	return (NULL);
}

static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
    nvlist_t *dev_to_remove)
{
	nvlist_t **newdev = NULL;

	if (count > 1)
		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);

	for (int i = 0, j = 0; i < count; i++) {
		if (dev[i] == dev_to_remove)
			continue;
		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
	}

	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);

	for (int i = 0; i < count - 1; i++)
		nvlist_free(newdev[i]);

	if (count > 1)
		kmem_free(newdev, (count - 1) * sizeof (void *));
}

/*
 * Evacuate the device.
 */
static int
spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
{
	uint64_t txg;
	int error = 0;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
	ASSERT(vd == vd->vdev_top);

	/*
	 * Evacuate the device.  We don't hold the config lock as writer
	 * since we need to do I/O but we do keep the
	 * spa_namespace_lock held.  Once this completes the device
	 * should no longer have any blocks allocated on it.
	 */
	if (vd->vdev_islog) {
		if (vd->vdev_stat.vs_alloc != 0)
			error = spa_offline_log(spa);
	} else {
		error = SET_ERROR(ENOTSUP);
	}

	if (error)
		return (error);

	/*
	 * The evacuation succeeded.  Remove any remaining MOS metadata
	 * associated with this vdev, and wait for these changes to sync.
	 */
	ASSERT0(vd->vdev_stat.vs_alloc);
	txg = spa_vdev_config_enter(spa);
	vd->vdev_removing = B_TRUE;
	vdev_dirty_leaves(vd, VDD_DTL, txg);
	vdev_config_dirty(vd);
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	return (0);
}

/*
 * Complete the removal by cleaning up the namespace.
 */
static void
spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t id = vd->vdev_id;
	boolean_t last_vdev = (id == (rvd->vdev_children - 1));

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(vd == vd->vdev_top);

	/*
	 * Only remove any devices which are empty.
	 */
	if (vd->vdev_stat.vs_alloc != 0)
		return;

	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	if (list_link_active(&vd->vdev_state_dirty_node))
		vdev_state_clean(vd);
	if (list_link_active(&vd->vdev_config_dirty_node))
		vdev_config_clean(vd);

	vdev_free(vd);

	if (last_vdev) {
		vdev_compact_children(rvd);
	} else {
		vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
		vdev_add_child(rvd, vd);
	}
	vdev_config_dirty(rvd);

	/*
	 * Reassess the health of our root vdev.
	 */
	vdev_reopen(rvd);
}

/*
 * Remove a device from the pool -
 *
 * Removing a device from the vdev namespace requires several steps
 * and can take a significant amount of time.  As a result we use
 * the spa_vdev_config_[enter/exit] functions which allow us to
 * grab and release the spa_config_lock while still holding the namespace
 * lock.  During each step the configuration is synced out.
 *
 * Currently, this supports removing only hot spares, slogs, and level 2 ARC
 * devices.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	metaslab_group_t *mg;
	nvlist_t **spares, **l2cache, *nv;
	uint64_t txg = 0;
	uint_t nspares, nl2cache;
	int error = 0;
	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);

	ASSERT(spa_writeable(spa));

	if (!locked)
		txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (spa->spa_spares.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
		/*
		 * Only remove the hot spare if it's not currently in use
		 * in this pool.
		 */
		if (vd == NULL || unspare) {
			spa_vdev_remove_aux(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
			spa_load_spares(spa);
			spa->spa_spares.sav_sync = B_TRUE;
		} else {
			error = SET_ERROR(EBUSY);
		}
	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
		/*
		 * Cache devices can always be removed.
		 */
		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	} else if (vd != NULL && vd->vdev_islog) {
		ASSERT(!locked);
		ASSERT(vd == vd->vdev_top);

		mg = vd->vdev_mg;

		/*
		 * Stop allocating from this vdev.
		 */
		metaslab_group_passivate(mg);

		/*
		 * Wait for the youngest allocations and frees to sync,
		 * and then wait for the deferral of those frees to finish.
		 */
		spa_vdev_config_exit(spa, NULL,
		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);

		/*
		 * Attempt to evacuate the vdev.
		 */
		error = spa_vdev_remove_evacuate(spa, vd);

		txg = spa_vdev_config_enter(spa);

		/*
		 * If we couldn't evacuate the vdev, unwind.
		 */
		if (error) {
			metaslab_group_activate(mg);
			return (spa_vdev_exit(spa, NULL, txg, error));
		}

		/*
		 * Clean up the vdev namespace.
		 */
		spa_vdev_remove_from_namespace(spa, vd);

	} else if (vd != NULL) {
		/*
		 * Normal vdevs cannot be removed (yet).
		 */
		error = SET_ERROR(ENOTSUP);
	} else {
		/*
		 * There is no vdev of any kind with the specified guid.
		 */
		error = SET_ERROR(ENOENT);
	}

	if (!locked)
		return (spa_vdev_exit(spa, NULL, txg, error));

	return (error);
}
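
/*
 * Illustrative sketch (not part of the original file): removing a cache
 * device by guid.  Per the comment above, only hot spares, slogs, and
 * level 2 ARC devices qualify; anything else returns ENOTSUP.  Guarded by
 * the hypothetical ZFS_SPA_EXAMPLES define.
 */
#ifdef ZFS_SPA_EXAMPLES
static int
spa_remove_l2cache_example(spa_t *spa, uint64_t cache_guid)
{
	/* unspare = B_FALSE: we are not detaching an in-use hot spare */
	return (spa_vdev_remove(spa, cache_guid, B_FALSE));
}
#endif	/* ZFS_SPA_EXAMPLES */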

/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;

	for (int c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.  We always consider the first
	 * vdev in the list to be the oldest vdev, and the last one to be
	 * the newest (see spa_vdev_attach() for how that works).  In
	 * the case where the newest vdev is faulted, we will not automatically
	 * remove it after a resilver completes.  This is OK as it will require
	 * user intervention to determine which disk the admin wishes to keep.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops) {
		ASSERT(vd->vdev_children > 1);

		newvd = vd->vdev_child[vd->vdev_children - 1];
		oldvd = vd->vdev_child[0];

		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops) {
		vdev_t *first = vd->vdev_child[0];
		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];

		if (last->vdev_unspare) {
			oldvd = first;
			newvd = last;
		} else if (first->vdev_unspare) {
			oldvd = last;
			newvd = first;
		} else {
			oldvd = NULL;
		}

		if (oldvd != NULL &&
		    vdev_dtl_empty(newvd, DTL_MISSING) &&
		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);

		/*
		 * If there are more than two spares attached to a disk,
		 * and those spares are not required, then we want to
		 * attempt to free them up now so that they can be used
		 * by other pools.  Once we're back down to a single
		 * disk+spare, we stop removing them.
		 */
		if (vd->vdev_children > 2) {
			newvd = vd->vdev_child[1];

			if (newvd->vdev_isspare && last->vdev_isspare &&
			    vdev_dtl_empty(last, DTL_MISSING) &&
			    vdev_dtl_empty(last, DTL_OUTAGE) &&
			    !vdev_dtl_required(newvd))
				return (newvd);
		}
	}

	return (NULL);
}

static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd, *pvd, *ppvd;
	uint64_t guid, sguid, pguid, ppguid;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		pvd = vd->vdev_parent;
		ppvd = pvd->vdev_parent;
		guid = vd->vdev_guid;
		pguid = pvd->vdev_guid;
		ppguid = ppvd->vdev_guid;
		sguid = 0;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
		    ppvd->vdev_children == 2) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			sguid = ppvd->vdev_child[1]->vdev_guid;
		}
		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));

		spa_config_exit(spa, SCL_ALL, FTAG);
		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
			return;
		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Update the stored path or FRU for this vdev.
 */
int
spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
    boolean_t ispath)
{
	vdev_t *vd;
	boolean_t sync = B_FALSE;

	ASSERT(spa_writeable(spa));

	spa_vdev_state_enter(spa, SCL_ALL);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	if (ispath) {
		if (strcmp(value, vd->vdev_path) != 0) {
			spa_strfree(vd->vdev_path);
			vd->vdev_path = spa_strdup(value);
			sync = B_TRUE;
		}
	} else {
		if (vd->vdev_fru == NULL) {
			vd->vdev_fru = spa_strdup(value);
			sync = B_TRUE;
		} else if (strcmp(value, vd->vdev_fru) != 0) {
			spa_strfree(vd->vdev_fru);
			vd->vdev_fru = spa_strdup(value);
			sync = B_TRUE;
		}
	}

	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
}

int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
}

int
spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
{
	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
}

/*
 * ==========================================================================
 * SPA Scanning
 * ==========================================================================
 */

int
spa_scan_stop(spa_t *spa)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
	if (dsl_scan_resilvering(spa->spa_dsl_pool))
		return (SET_ERROR(EBUSY));
	return (dsl_scan_cancel(spa->spa_dsl_pool));
}

int
spa_scan(spa_t *spa, pool_scan_func_t func)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
		return (SET_ERROR(ENOTSUP));

	/*
	 * If a resilver was requested, but there is no DTL on a
	 * writeable leaf device, we have nothing to do.
	 */
	if (func == POOL_SCAN_RESILVER &&
	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		return (0);
	}

	return (dsl_scan(spa->spa_dsl_pool, func));
}
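
/*
 * Illustrative sketch (not part of the original file): kicking off a scrub
 * or cancelling a scan.  POOL_SCAN_SCRUB and POOL_SCAN_RESILVER are the
 * scan functions accepted above; a stop request fails with EBUSY while a
 * resilver is in progress.  Guarded by the hypothetical ZFS_SPA_EXAMPLES
 * define.
 */
#ifdef ZFS_SPA_EXAMPLES
static int
spa_scrub_example(spa_t *spa, boolean_t stop)
{
	if (stop)
		return (spa_scan_stop(spa));
	return (spa_scan(spa, POOL_SCAN_SCRUB));
}
#endif	/* ZFS_SPA_EXAMPLES */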
5820209962Smm */ 5821209962Smm vd->vdev_stat.vs_read_errors = 0; 5822209962Smm vd->vdev_stat.vs_write_errors = 0; 5823209962Smm vd->vdev_stat.vs_checksum_errors = 0; 5824209962Smm 5825185029Spjd vdev_state_dirty(vd->vdev_top); 5826185029Spjd } 5827168404Spjd 5828185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5829185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 5830185029Spjd} 5831168404Spjd 5832185029Spjdstatic void 5833185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 5834185029Spjd{ 5835185029Spjd if (vd->vdev_probe_wanted) { 5836219089Spjd vd->vdev_probe_wanted = B_FALSE; 5837185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 5838168404Spjd } 5839168404Spjd 5840185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5841185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 5842168404Spjd} 5843168404Spjd 5844168404Spjdstatic void 5845219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 5846219089Spjd{ 5847219089Spjd sysevent_id_t eid; 5848219089Spjd nvlist_t *attr; 5849219089Spjd char *physpath; 5850219089Spjd 5851219089Spjd if (!spa->spa_autoexpand) 5852219089Spjd return; 5853219089Spjd 5854219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5855219089Spjd vdev_t *cvd = vd->vdev_child[c]; 5856219089Spjd spa_async_autoexpand(spa, cvd); 5857219089Spjd } 5858219089Spjd 5859219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5860219089Spjd return; 5861219089Spjd 5862219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5863219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5864219089Spjd 5865219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5866219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5867219089Spjd 5868219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5869219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5870219089Spjd 5871219089Spjd nvlist_free(attr); 5872219089Spjd kmem_free(physpath, MAXPATHLEN); 5873219089Spjd} 5874219089Spjd 5875219089Spjdstatic void 5876168404Spjdspa_async_thread(void *arg) 5877168404Spjd{ 5878168404Spjd spa_t *spa = arg; 5879168404Spjd int tasks; 5880168404Spjd 5881168404Spjd ASSERT(spa->spa_sync_on); 5882168404Spjd 5883168404Spjd mutex_enter(&spa->spa_async_lock); 5884168404Spjd tasks = spa->spa_async_tasks; 5885253990Smav spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 5886168404Spjd mutex_exit(&spa->spa_async_lock); 5887168404Spjd 5888168404Spjd /* 5889168404Spjd * See if the config needs to be updated. 5890168404Spjd */ 5891168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5892219089Spjd uint64_t old_space, new_space; 5893219089Spjd 5894168404Spjd mutex_enter(&spa_namespace_lock); 5895219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 5896168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5897219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 5898168404Spjd mutex_exit(&spa_namespace_lock); 5899219089Spjd 5900219089Spjd /* 5901219089Spjd * If the pool grew as a result of the config update, 5902219089Spjd * then log an internal history event. 
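 *
 * For example (values purely illustrative, following the format
 * string below), a pool that grew from 1 TiB to 2 TiB would log:
 *
 *	pool 'tank' size: 2199023255552(+1099511627776)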
5903219089Spjd */ 5904219089Spjd if (new_space != old_space) { 5905248571Smm spa_history_log_internal(spa, "vdev online", NULL, 5906219089Spjd "pool '%s' size: %llu(+%llu)", 5907219089Spjd spa_name(spa), new_space, new_space - old_space); 5908219089Spjd } 5909168404Spjd } 5910168404Spjd 5911219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5912219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5913219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 5914219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 5915219089Spjd } 5916219089Spjd 5917168404Spjd /* 5918185029Spjd * See if any devices need to be probed. 5919168404Spjd */ 5920185029Spjd if (tasks & SPA_ASYNC_PROBE) { 5921219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 5922185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 5923185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 5924185029Spjd } 5925168404Spjd 5926168404Spjd /* 5927185029Spjd * If any devices are done replacing, detach them. 5928168404Spjd */ 5929185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 5930185029Spjd spa_vdev_resilver_done(spa); 5931168404Spjd 5932168404Spjd /* 5933168404Spjd * Kick off a resilver. 5934168404Spjd */ 5935168404Spjd if (tasks & SPA_ASYNC_RESILVER) 5936219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 5937168404Spjd 5938168404Spjd /* 5939168404Spjd * Let the world know that we're done. 5940168404Spjd */ 5941168404Spjd mutex_enter(&spa->spa_async_lock); 5942168404Spjd spa->spa_async_thread = NULL; 5943168404Spjd cv_broadcast(&spa->spa_async_cv); 5944168404Spjd mutex_exit(&spa->spa_async_lock); 5945168404Spjd thread_exit(); 5946168404Spjd} 5947168404Spjd 5948253990Smavstatic void 5949253990Smavspa_async_thread_vd(void *arg) 5950253990Smav{ 5951253990Smav spa_t *spa = arg; 5952253990Smav int tasks; 5953253990Smav 5954253990Smav ASSERT(spa->spa_sync_on); 5955253990Smav 5956253990Smav mutex_enter(&spa->spa_async_lock); 5957253990Smav tasks = spa->spa_async_tasks; 5958253990Smavretry: 5959253990Smav spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 5960253990Smav mutex_exit(&spa->spa_async_lock); 5961253990Smav 5962253990Smav /* 5963253990Smav * See if any devices need to be marked REMOVED. 5964253990Smav */ 5965253990Smav if (tasks & SPA_ASYNC_REMOVE) { 5966253990Smav spa_vdev_state_enter(spa, SCL_NONE); 5967253990Smav spa_async_remove(spa, spa->spa_root_vdev); 5968253990Smav for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5969253990Smav spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5970253990Smav for (int i = 0; i < spa->spa_spares.sav_count; i++) 5971253990Smav spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5972253990Smav (void) spa_vdev_state_exit(spa, NULL, 0); 5973253990Smav } 5974253990Smav 5975253990Smav /* 5976253990Smav * Let the world know that we're done. 
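 *
 * Before exiting, though, re-read spa_async_tasks: if another
 * SPA_ASYNC_REMOVE request raced in via spa_async_request() after
 * the bit was cleared above, the "goto retry" below services it in
 * this thread instead of losing it.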
5977253990Smav */ 5978253990Smav mutex_enter(&spa->spa_async_lock); 5979253990Smav tasks = spa->spa_async_tasks; 5980253990Smav if ((tasks & SPA_ASYNC_REMOVE) != 0) 5981253990Smav goto retry; 5982253990Smav spa->spa_async_thread_vd = NULL; 5983253990Smav cv_broadcast(&spa->spa_async_cv); 5984253990Smav mutex_exit(&spa->spa_async_lock); 5985253990Smav thread_exit(); 5986253990Smav} 5987253990Smav 5988168404Spjdvoid 5989168404Spjdspa_async_suspend(spa_t *spa) 5990168404Spjd{ 5991168404Spjd mutex_enter(&spa->spa_async_lock); 5992168404Spjd spa->spa_async_suspended++; 5993253990Smav while (spa->spa_async_thread != NULL || 5994253990Smav spa->spa_async_thread_vd != NULL) 5995168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5996168404Spjd mutex_exit(&spa->spa_async_lock); 5997168404Spjd} 5998168404Spjd 5999168404Spjdvoid 6000168404Spjdspa_async_resume(spa_t *spa) 6001168404Spjd{ 6002168404Spjd mutex_enter(&spa->spa_async_lock); 6003168404Spjd ASSERT(spa->spa_async_suspended != 0); 6004168404Spjd spa->spa_async_suspended--; 6005168404Spjd mutex_exit(&spa->spa_async_lock); 6006168404Spjd} 6007168404Spjd 6008251636Sdelphijstatic boolean_t 6009251636Sdelphijspa_async_tasks_pending(spa_t *spa) 6010251636Sdelphij{ 6011251636Sdelphij uint_t non_config_tasks; 6012251636Sdelphij uint_t config_task; 6013251636Sdelphij boolean_t config_task_suspended; 6014251636Sdelphij 6015253990Smav non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 6016253990Smav SPA_ASYNC_REMOVE); 6017251636Sdelphij config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 6018251636Sdelphij if (spa->spa_ccw_fail_time == 0) { 6019251636Sdelphij config_task_suspended = B_FALSE; 6020251636Sdelphij } else { 6021251636Sdelphij config_task_suspended = 6022251636Sdelphij (gethrtime() - spa->spa_ccw_fail_time) < 6023251636Sdelphij (zfs_ccw_retry_interval * NANOSEC); 6024251636Sdelphij } 6025251636Sdelphij 6026251636Sdelphij return (non_config_tasks || (config_task && !config_task_suspended)); 6027251636Sdelphij} 6028251636Sdelphij 6029168404Spjdstatic void 6030168404Spjdspa_async_dispatch(spa_t *spa) 6031168404Spjd{ 6032168404Spjd mutex_enter(&spa->spa_async_lock); 6033251636Sdelphij if (spa_async_tasks_pending(spa) && 6034251636Sdelphij !spa->spa_async_suspended && 6035168404Spjd spa->spa_async_thread == NULL && 6036251636Sdelphij rootdir != NULL) 6037168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 6038168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 6039168404Spjd mutex_exit(&spa->spa_async_lock); 6040168404Spjd} 6041168404Spjd 6042253990Smavstatic void 6043253990Smavspa_async_dispatch_vd(spa_t *spa) 6044253990Smav{ 6045253990Smav mutex_enter(&spa->spa_async_lock); 6046253990Smav if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 6047253990Smav !spa->spa_async_suspended && 6048253990Smav spa->spa_async_thread_vd == NULL && 6049253990Smav rootdir != NULL) 6050253990Smav spa->spa_async_thread_vd = thread_create(NULL, 0, 6051253990Smav spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); 6052253990Smav mutex_exit(&spa->spa_async_lock); 6053253990Smav} 6054253990Smav 6055168404Spjdvoid 6056168404Spjdspa_async_request(spa_t *spa, int task) 6057168404Spjd{ 6058219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 6059168404Spjd mutex_enter(&spa->spa_async_lock); 6060168404Spjd spa->spa_async_tasks |= task; 6061168404Spjd mutex_exit(&spa->spa_async_lock); 6062253990Smav spa_async_dispatch_vd(spa); 6063168404Spjd} 6064168404Spjd 6065168404Spjd/* 6066168404Spjd
* ========================================================================== 6067168404Spjd * SPA syncing routines 6068168404Spjd * ========================================================================== 6069168404Spjd */ 6070168404Spjd 6071219089Spjdstatic int 6072219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6073168404Spjd{ 6074219089Spjd bpobj_t *bpo = arg; 6075219089Spjd bpobj_enqueue(bpo, bp, tx); 6076219089Spjd return (0); 6077219089Spjd} 6078168404Spjd 6079219089Spjdstatic int 6080219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6081219089Spjd{ 6082219089Spjd zio_t *zio = arg; 6083168404Spjd 6084219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 6085240868Spjd BP_GET_PSIZE(bp), zio->io_flags)); 6086219089Spjd return (0); 6087168404Spjd} 6088168404Spjd 6089258632Savg/* 6090258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6091258632Savg * amount of time spent syncing frees. 6092258632Savg */ 6093168404Spjdstatic void 6094258632Savgspa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6095258632Savg{ 6096258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6097258632Savg bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6098258632Savg VERIFY(zio_wait(zio) == 0); 6099258632Savg} 6100258632Savg 6101258632Savg/* 6102258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6103258632Savg * amount of time spent syncing deferred frees. 6104258632Savg */ 6105258632Savgstatic void 6106258632Savgspa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6107258632Savg{ 6108258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6109258632Savg VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6110258632Savg spa_free_sync_cb, zio, tx), ==, 0); 6111258632Savg VERIFY0(zio_wait(zio)); 6112258632Savg} 6113258632Savg 6114258632Savg 6115258632Savgstatic void 6116168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6117168404Spjd{ 6118168404Spjd char *packed = NULL; 6119185029Spjd size_t bufsize; 6120168404Spjd size_t nvsize = 0; 6121168404Spjd dmu_buf_t *db; 6122168404Spjd 6123168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6124168404Spjd 6125185029Spjd /* 6126185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6127260150Sdelphij * information. This avoids the dmu_buf_will_dirty() path and 6128185029Spjd * saves us a pre-read to get data we don't actually care about. 
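 *
 * As a worked example (assuming the usual 16K SPA_CONFIG_BLOCKSIZE),
 * a packed nvlist of nvsize = 3000 bytes is padded to a whole block:
 *
 *	bufsize = P2ROUNDUP(3000, 16384);	(== 16384)
 *
 * and the tail past nvsize is zeroed before the dmu_write() below.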
6129185029Spjd */ 6130236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6131185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 6132168404Spjd 6133168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6134168404Spjd KM_SLEEP) == 0); 6135185029Spjd bzero(packed + nvsize, bufsize - nvsize); 6136168404Spjd 6137185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6138168404Spjd 6139185029Spjd kmem_free(packed, bufsize); 6140168404Spjd 6141168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6142168404Spjd dmu_buf_will_dirty(db, tx); 6143168404Spjd *(uint64_t *)db->db_data = nvsize; 6144168404Spjd dmu_buf_rele(db, FTAG); 6145168404Spjd} 6146168404Spjd 6147168404Spjdstatic void 6148185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6149185029Spjd const char *config, const char *entry) 6150168404Spjd{ 6151168404Spjd nvlist_t *nvroot; 6152185029Spjd nvlist_t **list; 6153168404Spjd int i; 6154168404Spjd 6155185029Spjd if (!sav->sav_sync) 6156168404Spjd return; 6157168404Spjd 6158168404Spjd /* 6159185029Spjd * Update the MOS nvlist describing the list of available devices. 6160185029Spjd * spa_validate_aux() will have already made sure this nvlist is 6161185029Spjd * valid and the vdevs are labeled appropriately. 6162168404Spjd */ 6163185029Spjd if (sav->sav_object == 0) { 6164185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6165185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6166185029Spjd sizeof (uint64_t), tx); 6167168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 6168185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6169185029Spjd &sav->sav_object, tx) == 0); 6170168404Spjd } 6171168404Spjd 6172168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6173185029Spjd if (sav->sav_count == 0) { 6174185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6175168404Spjd } else { 6176185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6177185029Spjd for (i = 0; i < sav->sav_count; i++) 6178185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6179219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 6180185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6181185029Spjd sav->sav_count) == 0); 6182185029Spjd for (i = 0; i < sav->sav_count; i++) 6183185029Spjd nvlist_free(list[i]); 6184185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 6185168404Spjd } 6186168404Spjd 6187185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6188168404Spjd nvlist_free(nvroot); 6189168404Spjd 6190185029Spjd sav->sav_sync = B_FALSE; 6191168404Spjd} 6192168404Spjd 6193168404Spjdstatic void 6194168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6195168404Spjd{ 6196168404Spjd nvlist_t *config; 6197168404Spjd 6198185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) 6199168404Spjd return; 6200168404Spjd 6201185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6202168404Spjd 6203185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 6204185029Spjd dmu_tx_get_txg(tx), B_FALSE); 6205185029Spjd 6206243505Smm /* 6207243505Smm * If we're upgrading the spa version then make sure that 6208243505Smm * the config object gets updated with the correct version. 
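 *
 * (spa_sync_version() raises spa_uberblock.ub_version from this
 * txg's sync tasks, while spa_ubsync still holds the previously
 * synced uberblock, so the comparison below catches an upgrade
 * completing in this very txg.)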
6209243505Smm */ 6210243505Smm if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6211243505Smm fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6212243505Smm spa->spa_uberblock.ub_version); 6213243505Smm 6214185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6215185029Spjd 6216168404Spjd if (spa->spa_config_syncing) 6217168404Spjd nvlist_free(spa->spa_config_syncing); 6218168404Spjd spa->spa_config_syncing = config; 6219168404Spjd 6220168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6221168404Spjd} 6222168404Spjd 6223236884Smmstatic void 6224248571Smmspa_sync_version(void *arg, dmu_tx_t *tx) 6225236884Smm{ 6226248571Smm uint64_t *versionp = arg; 6227248571Smm uint64_t version = *versionp; 6228248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6229236884Smm 6230236884Smm /* 6231236884Smm * Setting the version is special cased when first creating the pool. 6232236884Smm */ 6233236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 6234236884Smm 6235247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6236236884Smm ASSERT(version >= spa_version(spa)); 6237236884Smm 6238236884Smm spa->spa_uberblock.ub_version = version; 6239236884Smm vdev_config_dirty(spa->spa_root_vdev); 6240248571Smm spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6241236884Smm} 6242236884Smm 6243185029Spjd/* 6244185029Spjd * Set zpool properties. 6245185029Spjd */ 6246168404Spjdstatic void 6247248571Smmspa_sync_props(void *arg, dmu_tx_t *tx) 6248168404Spjd{ 6249248571Smm nvlist_t *nvp = arg; 6250248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6251185029Spjd objset_t *mos = spa->spa_meta_objset; 6252236884Smm nvpair_t *elem = NULL; 6253168404Spjd 6254168404Spjd mutex_enter(&spa->spa_props_lock); 6255168404Spjd 6256185029Spjd while ((elem = nvlist_next_nvpair(nvp, elem))) { 6257236884Smm uint64_t intval; 6258236884Smm char *strval, *fname; 6259236884Smm zpool_prop_t prop; 6260236884Smm const char *propname; 6261236884Smm zprop_type_t proptype; 6262259813Sdelphij spa_feature_t fid; 6263236884Smm 6264185029Spjd switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6265236884Smm case ZPROP_INVAL: 6266236884Smm /* 6267236884Smm * We checked this earlier in spa_prop_validate(). 6268236884Smm */ 6269236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 6270236884Smm 6271236884Smm fname = strchr(nvpair_name(elem), '@') + 1; 6272259813Sdelphij VERIFY0(zfeature_lookup_name(fname, &fid)); 6273236884Smm 6274259813Sdelphij spa_feature_enable(spa, fid, tx); 6275248571Smm spa_history_log_internal(spa, "set", tx, 6276248571Smm "%s=enabled", nvpair_name(elem)); 6277236884Smm break; 6278236884Smm 6279185029Spjd case ZPOOL_PROP_VERSION: 6280258717Savg intval = fnvpair_value_uint64(elem); 6281185029Spjd /* 6282236884Smm * The version is synced separately before other 6283236884Smm * properties and should be correct by now. 6284185029Spjd */ 6285236884Smm ASSERT3U(spa_version(spa), >=, intval); 6286185029Spjd break; 6287168404Spjd 6288185029Spjd case ZPOOL_PROP_ALTROOT: 6289185029Spjd /* 6290185029Spjd * 'altroot' is a non-persistent property. It should 6291185029Spjd * have been set temporarily at creation or import time. 6292185029Spjd */ 6293185029Spjd ASSERT(spa->spa_root != NULL); 6294185029Spjd break; 6295168404Spjd 6296219089Spjd case ZPOOL_PROP_READONLY: 6297185029Spjd case ZPOOL_PROP_CACHEFILE: 6298185029Spjd /* 6299219089Spjd * 'readonly' and 'cachefile' are also non-persistent 6300219089Spjd * properties.
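 *
 * Like 'altroot' above, they are consumed at open/import time and
 * never written to the MOS, so there is nothing to sync here.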
6301185029Spjd */ 6302168404Spjd break; 6303228103Smm case ZPOOL_PROP_COMMENT: 6304258717Savg strval = fnvpair_value_string(elem); 6305228103Smm if (spa->spa_comment != NULL) 6306228103Smm spa_strfree(spa->spa_comment); 6307228103Smm spa->spa_comment = spa_strdup(strval); 6308228103Smm /* 6309228103Smm * We need to dirty the configuration on all the vdevs 6310228103Smm * so that their labels get updated. It's unnecessary 6311228103Smm * to do this for pool creation since the vdev's 6312228103Smm * configuration has already been dirtied. 6313228103Smm */ 6314228103Smm if (tx->tx_txg != TXG_INITIAL) 6315228103Smm vdev_config_dirty(spa->spa_root_vdev); 6316248571Smm spa_history_log_internal(spa, "set", tx, 6317248571Smm "%s=%s", nvpair_name(elem), strval); 6318228103Smm break; 6319185029Spjd default: 6320185029Spjd /* 6321185029Spjd * Set pool property values in the poolprops mos object. 6322185029Spjd */ 6323185029Spjd if (spa->spa_pool_props_object == 0) { 6324236884Smm spa->spa_pool_props_object = 6325236884Smm zap_create_link(mos, DMU_OT_POOL_PROPS, 6326185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6327236884Smm tx); 6328185029Spjd } 6329185029Spjd 6330185029Spjd /* normalize the property name */ 6331185029Spjd propname = zpool_prop_to_name(prop); 6332185029Spjd proptype = zpool_prop_get_type(prop); 6333185029Spjd 6334185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 6335185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 6336258717Savg strval = fnvpair_value_string(elem); 6337258717Savg VERIFY0(zap_update(mos, 6338185029Spjd spa->spa_pool_props_object, propname, 6339258717Savg 1, strlen(strval) + 1, strval, tx)); 6340248571Smm spa_history_log_internal(spa, "set", tx, 6341248571Smm "%s=%s", nvpair_name(elem), strval); 6342185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6343258717Savg intval = fnvpair_value_uint64(elem); 6344185029Spjd 6345185029Spjd if (proptype == PROP_TYPE_INDEX) { 6346185029Spjd const char *unused; 6347258717Savg VERIFY0(zpool_prop_index_to_string( 6348258717Savg prop, intval, &unused)); 6349185029Spjd } 6350258717Savg VERIFY0(zap_update(mos, 6351185029Spjd spa->spa_pool_props_object, propname, 6352258717Savg 8, 1, &intval, tx)); 6353248571Smm spa_history_log_internal(spa, "set", tx, 6354248571Smm "%s=%lld", nvpair_name(elem), intval); 6355185029Spjd } else { 6356185029Spjd ASSERT(0); /* not allowed */ 6357185029Spjd } 6358185029Spjd 6359185029Spjd switch (prop) { 6360185029Spjd case ZPOOL_PROP_DELEGATION: 6361185029Spjd spa->spa_delegation = intval; 6362185029Spjd break; 6363185029Spjd case ZPOOL_PROP_BOOTFS: 6364185029Spjd spa->spa_bootfs = intval; 6365185029Spjd break; 6366185029Spjd case ZPOOL_PROP_FAILUREMODE: 6367185029Spjd spa->spa_failmode = intval; 6368185029Spjd break; 6369219089Spjd case ZPOOL_PROP_AUTOEXPAND: 6370219089Spjd spa->spa_autoexpand = intval; 6371219089Spjd if (tx->tx_txg != TXG_INITIAL) 6372219089Spjd spa_async_request(spa, 6373219089Spjd SPA_ASYNC_AUTOEXPAND); 6374219089Spjd break; 6375219089Spjd case ZPOOL_PROP_DEDUPDITTO: 6376219089Spjd spa->spa_dedup_ditto = intval; 6377219089Spjd break; 6378185029Spjd default: 6379185029Spjd break; 6380185029Spjd } 6381168404Spjd } 6382185029Spjd 6383168404Spjd } 6384185029Spjd 6385185029Spjd mutex_exit(&spa->spa_props_lock); 6386168404Spjd} 6387168404Spjd 6388168404Spjd/* 6389219089Spjd * Perform one-time upgrade on-disk changes.
spa_version() does not 6390219089Spjd * reflect the new version this txg, so there must be no changes this 6391219089Spjd * txg to anything that the upgrade code depends on after it executes. 6392219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync 6393219089Spjd * tasks. 6394219089Spjd */ 6395219089Spjdstatic void 6396219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6397219089Spjd{ 6398219089Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6399219089Spjd 6400219089Spjd ASSERT(spa->spa_sync_pass == 1); 6401219089Spjd 6402248571Smm rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 6403248571Smm 6404219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6405219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6406219089Spjd dsl_pool_create_origin(dp, tx); 6407219089Spjd 6408219089Spjd /* Keeping the origin open increases spa_minref */ 6409219089Spjd spa->spa_minref += 3; 6410219089Spjd } 6411219089Spjd 6412219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6413219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6414219089Spjd dsl_pool_upgrade_clones(dp, tx); 6415219089Spjd } 6416219089Spjd 6417219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6418219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6419219089Spjd dsl_pool_upgrade_dir_clones(dp, tx); 6420219089Spjd 6421219089Spjd /* Keeping the freedir open increases spa_minref */ 6422219089Spjd spa->spa_minref += 3; 6423219089Spjd } 6424236884Smm 6425236884Smm if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6426236884Smm spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6427236884Smm spa_feature_create_zap_objects(spa, tx); 6428236884Smm } 6429268126Sdelphij 6430268126Sdelphij /* 6431268126Sdelphij * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 6432268126Sdelphij * when the ability to use lz4 compression for metadata was added. 6433268126Sdelphij * Old pools that have this feature enabled must be upgraded to have 6434268126Sdelphij * this feature active. 6435268126Sdelphij */ 6436268126Sdelphij if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6437268126Sdelphij boolean_t lz4_en = spa_feature_is_enabled(spa, 6438268126Sdelphij SPA_FEATURE_LZ4_COMPRESS); 6439268126Sdelphij boolean_t lz4_ac = spa_feature_is_active(spa, 6440268126Sdelphij SPA_FEATURE_LZ4_COMPRESS); 6441268126Sdelphij 6442268126Sdelphij if (lz4_en && !lz4_ac) 6443268126Sdelphij spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 6444268126Sdelphij } 6445248571Smm rrw_exit(&dp->dp_config_rwlock, FTAG); 6446219089Spjd} 6447219089Spjd 6448219089Spjd/* 6449168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 6450168404Spjd * part of the process, so we iterate until it converges. 6451168404Spjd */ 6452168404Spjdvoid 6453168404Spjdspa_sync(spa_t *spa, uint64_t txg) 6454168404Spjd{ 6455168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6456168404Spjd objset_t *mos = spa->spa_meta_objset; 6457219089Spjd bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6458168404Spjd vdev_t *rvd = spa->spa_root_vdev; 6459168404Spjd vdev_t *vd; 6460168404Spjd dmu_tx_t *tx; 6461185029Spjd int error; 6462168404Spjd 6463219089Spjd VERIFY(spa_writeable(spa)); 6464219089Spjd 6465168404Spjd /* 6466168404Spjd * Lock out configuration changes.
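 *
 * Taking SCL_CONFIG as reader is what does the locking here:
 * anything that modifies the configuration (e.g. spa_vdev_add())
 * must take the config locks as writer, and so blocks until this
 * sync completes, while other readers are unaffected.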
6467168404Spjd */ 6468185029Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6469168404Spjd 6470168404Spjd spa->spa_syncing_txg = txg; 6471168404Spjd spa->spa_sync_pass = 0; 6472168404Spjd 6473185029Spjd /* 6474185029Spjd * If there are any pending vdev state changes, convert them 6475185029Spjd * into config changes that go out with this transaction group. 6476185029Spjd */ 6477185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6478209962Smm while (list_head(&spa->spa_state_dirty_list) != NULL) { 6479209962Smm /* 6480209962Smm * We need the write lock here because, for aux vdevs, 6481209962Smm * calling vdev_config_dirty() modifies sav_config. 6482209962Smm * This is ugly and will become unnecessary when we 6483209962Smm * eliminate the aux vdev wart by integrating all vdevs 6484209962Smm * into the root vdev tree. 6485209962Smm */ 6486209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6487209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6488209962Smm while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6489209962Smm vdev_state_clean(vd); 6490209962Smm vdev_config_dirty(vd); 6491209962Smm } 6492209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6493209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6494185029Spjd } 6495185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6496185029Spjd 6497168404Spjd tx = dmu_tx_create_assigned(dp, txg); 6498168404Spjd 6499247265Smm spa->spa_sync_starttime = gethrtime(); 6500247265Smm#ifdef illumos 6501247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6502247265Smm spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6503247265Smm#else /* FreeBSD */ 6504247265Smm#ifdef _KERNEL 6505247265Smm callout_reset(&spa->spa_deadman_cycid, 6506247265Smm hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa); 6507247265Smm#endif 6508247265Smm#endif 6509247265Smm 6510168404Spjd /* 6511185029Spjd * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6512168404Spjd * set spa_deflate if we have no raid-z vdevs. 6513168404Spjd */ 6514185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6515185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6516168404Spjd int i; 6517168404Spjd 6518168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 6519168404Spjd vd = rvd->vdev_child[i]; 6520168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6521168404Spjd break; 6522168404Spjd } 6523168404Spjd if (i == rvd->vdev_children) { 6524168404Spjd spa->spa_deflate = TRUE; 6525168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 6526168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6527168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6528168404Spjd } 6529168404Spjd } 6530168404Spjd 6531168404Spjd /* 6532219089Spjd * If anything has changed in this txg, or if someone is waiting 6533219089Spjd * for this txg to sync (eg, spa_vdev_remove()), push the 6534219089Spjd * deferred frees from the previous txg. If not, leave them 6535219089Spjd * alone so that we don't generate work on an otherwise idle 6536219089Spjd * system. 
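 *
 * (The deferred frees accumulate in spa_deferred_bpobj;
 * spa_sync_deferred_frees() below walks that bpobj with
 * spa_free_sync_cb() and issues the actual frees.)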
6537168404Spjd */ 6538168404Spjd if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 6539168404Spjd !txg_list_empty(&dp->dp_dirty_dirs, txg) || 6540219089Spjd !txg_list_empty(&dp->dp_sync_tasks, txg) || 6541219089Spjd ((dsl_scan_active(dp->dp_scan) || 6542219089Spjd txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { 6543258632Savg spa_sync_deferred_frees(spa, tx); 6544219089Spjd } 6545168404Spjd 6546168404Spjd /* 6547168404Spjd * Iterate to convergence. 6548168404Spjd */ 6549168404Spjd do { 6550219089Spjd int pass = ++spa->spa_sync_pass; 6551168404Spjd 6552168404Spjd spa_sync_config_object(spa, tx); 6553185029Spjd spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6554185029Spjd ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6555185029Spjd spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6556185029Spjd ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6557168404Spjd spa_errlog_sync(spa, txg); 6558168404Spjd dsl_pool_sync(dp, txg); 6559168404Spjd 6560243503Smm if (pass < zfs_sync_pass_deferred_free) { 6561258632Savg spa_sync_frees(spa, free_bpl, tx); 6562219089Spjd } else { 6563219089Spjd bplist_iterate(free_bpl, bpobj_enqueue_cb, 6564258632Savg &spa->spa_deferred_bpobj, tx); 6565168404Spjd } 6566168404Spjd 6567219089Spjd ddt_sync(spa, txg); 6568219089Spjd dsl_scan_sync(dp, tx); 6569168404Spjd 6570219089Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6571219089Spjd vdev_sync(vd, txg); 6572168404Spjd 6573219089Spjd if (pass == 1) 6574219089Spjd spa_sync_upgrades(spa, tx); 6575168404Spjd 6576219089Spjd } while (dmu_objset_is_dirty(mos, txg)); 6577219089Spjd 6578168404Spjd /* 6579168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 6580168404Spjd * to commit the transaction group. 6581168404Spjd * 6582185029Spjd * If there are no dirty vdevs, we sync the uberblock to a few 6583185029Spjd * random top-level vdevs that are known to be visible in the 6584185029Spjd * config cache (see spa_vdev_add() for a complete description). 6585185029Spjd * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6586168404Spjd */ 6587185029Spjd for (;;) { 6588185029Spjd /* 6589185029Spjd * We hold SCL_STATE to prevent vdev open/close/etc. 6590185029Spjd * while we're attempting to write the vdev labels. 
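 *
 * Each vdev_config_sync() below is attempted twice: first normally,
 * then, on failure, with its final boolean argument (a "try harder"
 * flag) set so the label writes are retried more aggressively. If
 * both attempts fail we suspend the pool's I/O and, once resumed,
 * run the whole loop again.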
6591185029Spjd */ 6592185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6593168404Spjd 6594185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) { 6595185029Spjd vdev_t *svd[SPA_DVAS_PER_BP]; 6596185029Spjd int svdcount = 0; 6597185029Spjd int children = rvd->vdev_children; 6598185029Spjd int c0 = spa_get_random(children); 6599185029Spjd 6600219089Spjd for (int c = 0; c < children; c++) { 6601185029Spjd vd = rvd->vdev_child[(c0 + c) % children]; 6602185029Spjd if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6603185029Spjd continue; 6604185029Spjd svd[svdcount++] = vd; 6605185029Spjd if (svdcount == SPA_DVAS_PER_BP) 6606185029Spjd break; 6607185029Spjd } 6608213198Smm error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 6609213198Smm if (error != 0) 6610213198Smm error = vdev_config_sync(svd, svdcount, txg, 6611213198Smm B_TRUE); 6612185029Spjd } else { 6613185029Spjd error = vdev_config_sync(rvd->vdev_child, 6614213198Smm rvd->vdev_children, txg, B_FALSE); 6615213198Smm if (error != 0) 6616213198Smm error = vdev_config_sync(rvd->vdev_child, 6617213198Smm rvd->vdev_children, txg, B_TRUE); 6618168404Spjd } 6619185029Spjd 6620239620Smm if (error == 0) 6621239620Smm spa->spa_last_synced_guid = rvd->vdev_guid; 6622239620Smm 6623185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6624185029Spjd 6625185029Spjd if (error == 0) 6626185029Spjd break; 6627185029Spjd zio_suspend(spa, NULL); 6628185029Spjd zio_resume_wait(spa); 6629168404Spjd } 6630168404Spjd dmu_tx_commit(tx); 6631168404Spjd 6632247265Smm#ifdef illumos 6633247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6634247265Smm#else /* FreeBSD */ 6635247265Smm#ifdef _KERNEL 6636247265Smm callout_drain(&spa->spa_deadman_cycid); 6637247265Smm#endif 6638247265Smm#endif 6639247265Smm 6640168404Spjd /* 6641168404Spjd * Clear the dirty config list. 6642168404Spjd */ 6643185029Spjd while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6644168404Spjd vdev_config_clean(vd); 6645168404Spjd 6646168404Spjd /* 6647168404Spjd * Now that the new config has synced transactionally, 6648168404Spjd * let it become visible to the config cache. 6649168404Spjd */ 6650168404Spjd if (spa->spa_config_syncing != NULL) { 6651168404Spjd spa_config_set(spa, spa->spa_config_syncing); 6652168404Spjd spa->spa_config_txg = txg; 6653168404Spjd spa->spa_config_syncing = NULL; 6654168404Spjd } 6655168404Spjd 6656168404Spjd spa->spa_ubsync = spa->spa_uberblock; 6657168404Spjd 6658219089Spjd dsl_pool_sync_done(dp, txg); 6659168404Spjd 6660168404Spjd /* 6661168404Spjd * Update usable space statistics. 6662168404Spjd */ 6663168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6664168404Spjd vdev_sync_done(vd, txg); 6665168404Spjd 6666219089Spjd spa_update_dspace(spa); 6667219089Spjd 6668168404Spjd /* 6669168404Spjd * It had better be the case that we didn't dirty anything 6670168404Spjd * since vdev_config_sync(). 6671168404Spjd */ 6672168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6673168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6674168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6675168404Spjd 6676219089Spjd spa->spa_sync_pass = 0; 6677219089Spjd 6678185029Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 6679168404Spjd 6680219089Spjd spa_handle_ignored_writes(spa); 6681219089Spjd 6682168404Spjd /* 6683168404Spjd * If any async tasks have been requested, kick them off. 
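 *
 * Both dispatchers are needed: spa_async_dispatch() starts a thread
 * for everything except SPA_ASYNC_REMOVE, which is handed to the
 * dedicated spa_async_thread_vd() by spa_async_dispatch_vd().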
6684168404Spjd */ 6685168404Spjd spa_async_dispatch(spa); 6686253990Smav spa_async_dispatch_vd(spa); 6687168404Spjd} 6688168404Spjd 6689168404Spjd/* 6690168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 6691168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 6692168404Spjd * sync. 6693168404Spjd */ 6694168404Spjdvoid 6695168404Spjdspa_sync_allpools(void) 6696168404Spjd{ 6697168404Spjd spa_t *spa = NULL; 6698168404Spjd mutex_enter(&spa_namespace_lock); 6699168404Spjd while ((spa = spa_next(spa)) != NULL) { 6700219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 6701219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 6702168404Spjd continue; 6703168404Spjd spa_open_ref(spa, FTAG); 6704168404Spjd mutex_exit(&spa_namespace_lock); 6705168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6706168404Spjd mutex_enter(&spa_namespace_lock); 6707168404Spjd spa_close(spa, FTAG); 6708168404Spjd } 6709168404Spjd mutex_exit(&spa_namespace_lock); 6710168404Spjd} 6711168404Spjd 6712168404Spjd/* 6713168404Spjd * ========================================================================== 6714168404Spjd * Miscellaneous routines 6715168404Spjd * ========================================================================== 6716168404Spjd */ 6717168404Spjd 6718168404Spjd/* 6719168404Spjd * Remove all pools in the system. 6720168404Spjd */ 6721168404Spjdvoid 6722168404Spjdspa_evict_all(void) 6723168404Spjd{ 6724168404Spjd spa_t *spa; 6725168404Spjd 6726168404Spjd /* 6727168404Spjd * Remove all cached state. All pools should be closed now, 6728168404Spjd * so every spa in the AVL tree should be unreferenced. 6729168404Spjd */ 6730168404Spjd mutex_enter(&spa_namespace_lock); 6731168404Spjd while ((spa = spa_next(NULL)) != NULL) { 6732168404Spjd /* 6733168404Spjd * Stop async tasks. The async thread may need to detach 6734168404Spjd * a device that's been replaced, which requires grabbing 6735168404Spjd * spa_namespace_lock, so we must drop it here. 
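 *
 * (Waiting in spa_async_suspend() while still holding
 * spa_namespace_lock could deadlock: the async thread may itself be
 * blocked in spa_vdev_detach() -> spa_vdev_enter() waiting for that
 * same lock.)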
6736168404Spjd */ 6737168404Spjd spa_open_ref(spa, FTAG); 6738168404Spjd mutex_exit(&spa_namespace_lock); 6739168404Spjd spa_async_suspend(spa); 6740168404Spjd mutex_enter(&spa_namespace_lock); 6741168404Spjd spa_close(spa, FTAG); 6742168404Spjd 6743168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6744168404Spjd spa_unload(spa); 6745168404Spjd spa_deactivate(spa); 6746168404Spjd } 6747168404Spjd spa_remove(spa); 6748168404Spjd } 6749168404Spjd mutex_exit(&spa_namespace_lock); 6750168404Spjd} 6751168404Spjd 6752168404Spjdvdev_t * 6753209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6754168404Spjd{ 6755185029Spjd vdev_t *vd; 6756185029Spjd int i; 6757185029Spjd 6758185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6759185029Spjd return (vd); 6760185029Spjd 6761209962Smm if (aux) { 6762185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6763185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 6764185029Spjd if (vd->vdev_guid == guid) 6765185029Spjd return (vd); 6766185029Spjd } 6767209962Smm 6768209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 6769209962Smm vd = spa->spa_spares.sav_vdevs[i]; 6770209962Smm if (vd->vdev_guid == guid) 6771209962Smm return (vd); 6772209962Smm } 6773185029Spjd } 6774185029Spjd 6775185029Spjd return (NULL); 6776168404Spjd} 6777168404Spjd 6778168404Spjdvoid 6779185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 6780168404Spjd{ 6781219089Spjd ASSERT(spa_writeable(spa)); 6782219089Spjd 6783185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6784168404Spjd 6785168404Spjd /* 6786168404Spjd * This should only be called for a non-faulted pool, and since a 6787168404Spjd * future version would result in an unopenable pool, this shouldn't be 6788168404Spjd * possible. 6789168404Spjd */ 6790247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 6791268075Sdelphij ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 6792168404Spjd 6793185029Spjd spa->spa_uberblock.ub_version = version; 6794168404Spjd vdev_config_dirty(spa->spa_root_vdev); 6795168404Spjd 6796185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 6797168404Spjd 6798168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6799168404Spjd} 6800168404Spjd 6801168404Spjdboolean_t 6802168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 6803168404Spjd{ 6804168404Spjd int i; 6805168404Spjd uint64_t spareguid; 6806185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6807168404Spjd 6808185029Spjd for (i = 0; i < sav->sav_count; i++) 6809185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 6810168404Spjd return (B_TRUE); 6811168404Spjd 6812185029Spjd for (i = 0; i < sav->sav_npending; i++) { 6813185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6814185029Spjd &spareguid) == 0 && spareguid == guid) 6815168404Spjd return (B_TRUE); 6816168404Spjd } 6817168404Spjd 6818168404Spjd return (B_FALSE); 6819168404Spjd} 6820168404Spjd 6821185029Spjd/* 6822185029Spjd * Check if a pool has an active shared spare device. 
6823185029Spjd * Note: the reference count of an active spare is 2, as a spare and as a replacement 6824185029Spjd */ 6825185029Spjdstatic boolean_t 6826185029Spjdspa_has_active_shared_spare(spa_t *spa) 6827168404Spjd{ 6828185029Spjd int i, refcnt; 6829185029Spjd uint64_t pool; 6830185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6831185029Spjd 6832185029Spjd for (i = 0; i < sav->sav_count; i++) { 6833185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6834185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6835185029Spjd refcnt > 2) 6836185029Spjd return (B_TRUE); 6837185029Spjd } 6838185029Spjd 6839185029Spjd return (B_FALSE); 6840168404Spjd} 6841168404Spjd 6842185029Spjd/* 6843185029Spjd * Post a sysevent corresponding to the given event. The 'name' must be one of 6844185029Spjd * the event definitions in sys/sysevent/eventdefs.h. The payload will be 6845185029Spjd * filled in from the spa and (optionally) the vdev. This doesn't do anything 6846185029Spjd * in the userland libzpool, as we don't want consumers to misinterpret ztest 6847185029Spjd * or zdb as real changes. 6848185029Spjd */ 6849185029Spjdvoid 6850185029Spjdspa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 6851168404Spjd{ 6852185029Spjd#ifdef _KERNEL 6853185029Spjd sysevent_t *ev; 6854185029Spjd sysevent_attr_list_t *attr = NULL; 6855185029Spjd sysevent_value_t value; 6856185029Spjd sysevent_id_t eid; 6857168404Spjd 6858185029Spjd ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 6859185029Spjd SE_SLEEP); 6860168404Spjd 6861185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6862185029Spjd value.value.sv_string = spa_name(spa); 6863185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 6864185029Spjd goto done; 6865168404Spjd 6866185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6867185029Spjd value.value.sv_uint64 = spa_guid(spa); 6868185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 6869185029Spjd goto done; 6870168404Spjd 6871185029Spjd if (vd) { 6872185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6873185029Spjd value.value.sv_uint64 = vd->vdev_guid; 6874185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 6875185029Spjd SE_SLEEP) != 0) 6876185029Spjd goto done; 6877168404Spjd 6878185029Spjd if (vd->vdev_path) { 6879185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6880185029Spjd value.value.sv_string = vd->vdev_path; 6881185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6882185029Spjd &value, SE_SLEEP) != 0) 6883185029Spjd goto done; 6884168404Spjd } 6885168404Spjd } 6886168404Spjd 6887185029Spjd if (sysevent_attach_attributes(ev, attr) != 0) 6888185029Spjd goto done; 6889185029Spjd attr = NULL; 6890168404Spjd 6891185029Spjd (void) log_sysevent(ev, SE_SLEEP, &eid); 6892185029Spjd 6893185029Spjddone: 6894185029Spjd if (attr) 6895185029Spjd sysevent_free_attr(attr); 6896185029Spjd sysevent_free(ev); 6897185029Spjd#endif 6898168404Spjd} 6899