spa.c revision 267992
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"
/* Check hostid on import? */
static int check_hostid = 1;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
    "Check hostid on import?");

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
        ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
        ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
        ZTI_MODE_NULL,                  /* don't create a taskq */
        ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
        zti_modes_t zti_mode;
        uint_t zti_value;
        uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
        "issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
        /* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
        { ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* NULL */
        { ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL },	/* READ */
        { ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) },	/* WRITE */
        { ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* FREE */
        { ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* CLAIM */
        { ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* IOCTL */
};
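
/*
 * Illustration (editor's addition, not part of the original table): with the
 * ZTI_* macros defined above, the READ row expands as follows.  ZTI_N(8) for
 * the issue taskq becomes { ZTI_MODE_FIXED, 8, 1 }, i.e. one taskq with eight
 * threads, while ZTI_P(12, 8) for the interrupt taskq becomes
 * { ZTI_MODE_FIXED, 12, 8 }, i.e. eight discrete taskqs with twelve threads
 * each, among which spa_taskq_dispatch_ent() later picks one at random.
 * The hypothetical example_read_row below is only a worked expansion.
 */
#if 0	/* illustrative only */
static const zio_taskq_info_t example_read_row[ZIO_TASKQ_TYPES] = {
        { ZTI_MODE_FIXED, 8, 1 },	/* ISSUE:      ZTI_N(8) */
        { ZTI_MODE_NULL, 0, 0 },	/* ISSUE_HIGH: ZTI_NULL */
        { ZTI_MODE_FIXED, 12, 8 },	/* INTR:       ZTI_P(12, 8) */
        { ZTI_MODE_NULL, 0, 0 },	/* INTR_HIGH:  ZTI_NULL */
};
#endif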

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
        const char *propname = zpool_prop_to_name(prop);
        nvlist_t *propval;

        VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

        if (strval != NULL)
                VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
        else
                VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

        VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
        nvlist_free(propval);
}
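
/*
 * Illustration (editor's addition, not in the original source):
 * spa_prop_add_list() nests one nvlist per property under the caller's
 * nvlist, keyed by the property name and holding the value source and the
 * value itself.  Adding the "size" property produces roughly:
 *
 *	"size" -> {
 *		"source" = ZPROP_SRC_NONE,
 *		"value"  = <uint64 pool size>
 *	}
 *
 * The sketch below is a hypothetical caller, shown only to make the layout
 * concrete; it is not part of spa.c.
 */
#if 0	/* illustrative only */
static void
example_prop_list(spa_t *spa, nvlist_t *nvp)
{
        /* String-valued property: strval is used, intval is ignored. */
        spa_prop_add_list(nvp, ZPOOL_PROP_NAME, spa_name(spa), 0,
            ZPROP_SRC_NONE);
        /* Numeric property: strval is NULL, intval carries the value. */
        spa_prop_add_list(nvp, ZPOOL_PROP_SIZE, NULL, 1ULL << 30,
            ZPROP_SRC_NONE);
}
#endif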
/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
        vdev_t *rvd = spa->spa_root_vdev;
        dsl_pool_t *pool = spa->spa_dsl_pool;
        uint64_t size;
        uint64_t alloc;
        uint64_t space;
        uint64_t cap, version;
        zprop_source_t src = ZPROP_SRC_NONE;
        spa_config_dirent_t *dp;

        ASSERT(MUTEX_HELD(&spa->spa_props_lock));

        if (rvd != NULL) {
                alloc = metaslab_class_get_alloc(spa_normal_class(spa));
                size = metaslab_class_get_space(spa_normal_class(spa));
                spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
                    size - alloc, src);

                space = 0;
                for (int c = 0; c < rvd->vdev_children; c++) {
                        vdev_t *tvd = rvd->vdev_child[c];
                        space += tvd->vdev_max_asize - tvd->vdev_asize;
                }
                spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
                    src);

                spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
                    (spa_mode(spa) == FREAD), src);

                cap = (size == 0) ? 0 : (alloc * 100 / size);
                spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

                spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
                    ddt_get_pool_dedup_ratio(spa), src);

                spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
                    rvd->vdev_state, src);

                version = spa_version(spa);
                if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
                        src = ZPROP_SRC_DEFAULT;
                else
                        src = ZPROP_SRC_LOCAL;
                spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
        }

        if (pool != NULL) {
                dsl_dir_t *freedir = pool->dp_free_dir;

                /*
                 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
                 * when opening pools created before this version, freedir will
                 * be NULL.
                 */
                if (freedir != NULL) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
                            freedir->dd_phys->dd_used_bytes, src);
                } else {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
                            NULL, 0, src);
                }
        }

        spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

        if (spa->spa_comment != NULL) {
                spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
                    0, ZPROP_SRC_LOCAL);
        }

        if (spa->spa_root != NULL)
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
                    0, ZPROP_SRC_LOCAL);

        if ((dp = list_head(&spa->spa_config_list)) != NULL) {
                if (dp->scd_path == NULL) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
                            "none", 0, ZPROP_SRC_LOCAL);
                } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
                            dp->scd_path, 0, ZPROP_SRC_LOCAL);
                }
        }
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
        objset_t *mos = spa->spa_meta_objset;
        zap_cursor_t zc;
        zap_attribute_t za;
        int err;

        VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

        mutex_enter(&spa->spa_props_lock);

        /*
         * Get properties from the spa config.
         */
        spa_prop_get_config(spa, nvp);

        /* If no pool property object, no more prop to get. */
        if (mos == NULL || spa->spa_pool_props_object == 0) {
                mutex_exit(&spa->spa_props_lock);
                return (0);
        }

        /*
         * Get properties from the MOS pool property object.
         */
        for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
            (err = zap_cursor_retrieve(&zc, &za)) == 0;
            zap_cursor_advance(&zc)) {
                uint64_t intval = 0;
                char *strval = NULL;
                zprop_source_t src = ZPROP_SRC_DEFAULT;
                zpool_prop_t prop;

                if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
                        continue;

                switch (za.za_integer_length) {
                case 8:
                        /* integer property */
                        if (za.za_first_integer !=
                            zpool_prop_default_numeric(prop))
                                src = ZPROP_SRC_LOCAL;

                        if (prop == ZPOOL_PROP_BOOTFS) {
                                dsl_pool_t *dp;
                                dsl_dataset_t *ds = NULL;

                                dp = spa_get_dsl(spa);
                                dsl_pool_config_enter(dp, FTAG);
                                if (err = dsl_dataset_hold_obj(dp,
                                    za.za_first_integer, FTAG, &ds)) {
                                        dsl_pool_config_exit(dp, FTAG);
                                        break;
                                }

                                strval = kmem_alloc(
                                    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
                                    KM_SLEEP);
                                dsl_dataset_name(ds, strval);
                                dsl_dataset_rele(ds, FTAG);
                                dsl_pool_config_exit(dp, FTAG);
                        } else {
                                strval = NULL;
                                intval = za.za_first_integer;
                        }

                        spa_prop_add_list(*nvp, prop, strval, intval, src);

                        if (strval != NULL)
                                kmem_free(strval,
                                    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

                        break;

                case 1:
                        /* string property */
                        strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
                        err = zap_lookup(mos, spa->spa_pool_props_object,
                            za.za_name, 1, za.za_num_integers, strval);
                        if (err) {
                                kmem_free(strval, za.za_num_integers);
                                break;
                        }
                        spa_prop_add_list(*nvp, prop, strval, 0, src);
                        kmem_free(strval, za.za_num_integers);
                        break;

                default:
                        break;
                }
        }
        zap_cursor_fini(&zc);
        mutex_exit(&spa->spa_props_lock);
out:
        if (err && err != ENOENT) {
                nvlist_free(*nvp);
                *nvp = NULL;
                return (err);
        }

        return (0);
}

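/*
 * Illustration (editor's addition, not in the original source): a
 * hypothetical consumer of spa_prop_get(), walking the nvlist-of-nvlists
 * built by spa_prop_add_list() above.  Error handling is minimal; this is a
 * sketch only.
 */
#if 0	/* illustrative only */
static void
example_dump_props(spa_t *spa)
{
        nvlist_t *props = NULL;
        nvpair_t *elem = NULL;

        if (spa_prop_get(spa, &props) != 0)
                return;
        while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
                nvlist_t *propval;
                uint64_t intval;

                /* Each pair's value is an nvlist with "source"/"value". */
                VERIFY(nvpair_value_nvlist(elem, &propval) == 0);
                if (nvlist_lookup_uint64(propval, ZPROP_VALUE, &intval) == 0)
                        zfs_dbgmsg("%s=%llu", nvpair_name(elem),
                            (u_longlong_t)intval);
        }
        nvlist_free(props);
}
#endif
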
/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
        nvpair_t *elem;
        int error = 0, reset_bootfs = 0;
        uint64_t objnum = 0;
        boolean_t has_feature = B_FALSE;

        elem = NULL;
        while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
                uint64_t intval;
                char *strval, *slash, *check, *fname;
                const char *propname = nvpair_name(elem);
                zpool_prop_t prop = zpool_name_to_prop(propname);

                switch (prop) {
                case ZPROP_INVAL:
                        if (!zpool_prop_feature(propname)) {
                                error = SET_ERROR(EINVAL);
                                break;
                        }

                        /*
                         * Sanitize the input.
                         */
                        if (nvpair_type(elem) != DATA_TYPE_UINT64) {
                                error = SET_ERROR(EINVAL);
                                break;
                        }

                        if (nvpair_value_uint64(elem, &intval) != 0) {
                                error = SET_ERROR(EINVAL);
                                break;
                        }

                        if (intval != 0) {
                                error = SET_ERROR(EINVAL);
                                break;
                        }

                        fname = strchr(propname, '@') + 1;
                        if (zfeature_lookup_name(fname, NULL) != 0) {
                                error = SET_ERROR(EINVAL);
                                break;
                        }

                        has_feature = B_TRUE;
                        break;

                case ZPOOL_PROP_VERSION:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error &&
                            (intval < spa_version(spa) ||
                            intval > SPA_VERSION_BEFORE_FEATURES ||
                            has_feature))
                                error = SET_ERROR(EINVAL);
                        break;

                case ZPOOL_PROP_DELEGATION:
                case ZPOOL_PROP_AUTOREPLACE:
                case ZPOOL_PROP_LISTSNAPS:
                case ZPOOL_PROP_AUTOEXPAND:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error && intval > 1)
                                error = SET_ERROR(EINVAL);
                        break;

                case ZPOOL_PROP_BOOTFS:
                        /*
                         * If the pool version is less than SPA_VERSION_BOOTFS,
                         * or the pool is still being created (version == 0),
                         * the bootfs property cannot be set.
                         */
                        if (spa_version(spa) < SPA_VERSION_BOOTFS) {
                                error = SET_ERROR(ENOTSUP);
                                break;
                        }

                        /*
                         * Make sure the vdev config is bootable
                         */
                        if (!vdev_is_bootable(spa->spa_root_vdev)) {
                                error = SET_ERROR(ENOTSUP);
                                break;
                        }

                        reset_bootfs = 1;

                        error = nvpair_value_string(elem, &strval);

                        if (!error) {
                                objset_t *os;
                                uint64_t compress;

                                if (strval == NULL || strval[0] == '\0') {
                                        objnum = zpool_prop_default_numeric(
                                            ZPOOL_PROP_BOOTFS);
                                        break;
                                }

                                if (error = dmu_objset_hold(strval, FTAG, &os))
                                        break;

                                /* Must be ZPL and not gzip compressed. */

                                if (dmu_objset_type(os) != DMU_OST_ZFS) {
                                        error = SET_ERROR(ENOTSUP);
                                } else if ((error =
                                    dsl_prop_get_int_ds(dmu_objset_ds(os),
                                    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
                                    &compress)) == 0 &&
                                    !BOOTFS_COMPRESS_VALID(compress)) {
                                        error = SET_ERROR(ENOTSUP);
                                } else {
                                        objnum = dmu_objset_id(os);
                                }
                                dmu_objset_rele(os, FTAG);
                        }
                        break;

                case ZPOOL_PROP_FAILUREMODE:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
                            intval > ZIO_FAILURE_MODE_PANIC))
                                error = SET_ERROR(EINVAL);

                        /*
                         * This is a special case which only occurs when
                         * the pool has completely failed. This allows
                         * the user to change the in-core failmode property
                         * without syncing it out to disk (I/Os might
                         * currently be blocked). We do this by returning
                         * EIO to the caller (spa_prop_set) to trick it
                         * into thinking we encountered a property validation
                         * error.
                         */
                        if (!error && spa_suspended(spa)) {
                                spa->spa_failmode = intval;
                                error = SET_ERROR(EIO);
                        }
                        break;

                case ZPOOL_PROP_CACHEFILE:
                        if ((error = nvpair_value_string(elem, &strval)) != 0)
                                break;

                        if (strval[0] == '\0')
                                break;

                        if (strcmp(strval, "none") == 0)
                                break;

                        if (strval[0] != '/') {
                                error = SET_ERROR(EINVAL);
                                break;
                        }

                        slash = strrchr(strval, '/');
                        ASSERT(slash != NULL);

                        if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
                            strcmp(slash, "/..") == 0)
                                error = SET_ERROR(EINVAL);
                        break;

                case ZPOOL_PROP_COMMENT:
                        if ((error = nvpair_value_string(elem, &strval)) != 0)
                                break;
                        for (check = strval; *check != '\0'; check++) {
                                /*
                                 * The kernel doesn't have an easy isprint()
                                 * check.  For this kernel check, we merely
                                 * check ASCII apart from DEL.  Fix this if
                                 * there is an easy-to-use kernel isprint().
582228103Smm */ 583228103Smm if (*check >= 0x7f) { 584249195Smm error = SET_ERROR(EINVAL); 585228103Smm break; 586228103Smm } 587228103Smm check++; 588228103Smm } 589228103Smm if (strlen(strval) > ZPROP_MAX_COMMENT) 590228103Smm error = E2BIG; 591228103Smm break; 592228103Smm 593219089Spjd case ZPOOL_PROP_DEDUPDITTO: 594219089Spjd if (spa_version(spa) < SPA_VERSION_DEDUP) 595249195Smm error = SET_ERROR(ENOTSUP); 596219089Spjd else 597219089Spjd error = nvpair_value_uint64(elem, &intval); 598219089Spjd if (error == 0 && 599219089Spjd intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 600249195Smm error = SET_ERROR(EINVAL); 601219089Spjd break; 602185029Spjd } 603185029Spjd 604185029Spjd if (error) 605185029Spjd break; 606185029Spjd } 607185029Spjd 608185029Spjd if (!error && reset_bootfs) { 609185029Spjd error = nvlist_remove(props, 610185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 611185029Spjd 612185029Spjd if (!error) { 613185029Spjd error = nvlist_add_uint64(props, 614185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 615185029Spjd } 616185029Spjd } 617185029Spjd 618185029Spjd return (error); 619185029Spjd} 620185029Spjd 621209962Smmvoid 622209962Smmspa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 623209962Smm{ 624209962Smm char *cachefile; 625209962Smm spa_config_dirent_t *dp; 626209962Smm 627209962Smm if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 628209962Smm &cachefile) != 0) 629209962Smm return; 630209962Smm 631209962Smm dp = kmem_alloc(sizeof (spa_config_dirent_t), 632209962Smm KM_SLEEP); 633209962Smm 634209962Smm if (cachefile[0] == '\0') 635209962Smm dp->scd_path = spa_strdup(spa_config_path); 636209962Smm else if (strcmp(cachefile, "none") == 0) 637209962Smm dp->scd_path = NULL; 638209962Smm else 639209962Smm dp->scd_path = spa_strdup(cachefile); 640209962Smm 641209962Smm list_insert_head(&spa->spa_config_list, dp); 642209962Smm if (need_sync) 643209962Smm spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 644209962Smm} 645209962Smm 646185029Spjdint 647185029Spjdspa_prop_set(spa_t *spa, nvlist_t *nvp) 648185029Spjd{ 649185029Spjd int error; 650236884Smm nvpair_t *elem = NULL; 651209962Smm boolean_t need_sync = B_FALSE; 652185029Spjd 653185029Spjd if ((error = spa_prop_validate(spa, nvp)) != 0) 654185029Spjd return (error); 655185029Spjd 656209962Smm while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 657236884Smm zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 658209962Smm 659219089Spjd if (prop == ZPOOL_PROP_CACHEFILE || 660219089Spjd prop == ZPOOL_PROP_ALTROOT || 661219089Spjd prop == ZPOOL_PROP_READONLY) 662209962Smm continue; 663209962Smm 664236884Smm if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { 665236884Smm uint64_t ver; 666236884Smm 667236884Smm if (prop == ZPOOL_PROP_VERSION) { 668236884Smm VERIFY(nvpair_value_uint64(elem, &ver) == 0); 669236884Smm } else { 670236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 671236884Smm ver = SPA_VERSION_FEATURES; 672236884Smm need_sync = B_TRUE; 673236884Smm } 674236884Smm 675236884Smm /* Save time if the version is already set. */ 676236884Smm if (ver == spa_version(spa)) 677236884Smm continue; 678236884Smm 679236884Smm /* 680236884Smm * In addition to the pool directory object, we might 681236884Smm * create the pool properties object, the features for 682236884Smm * read object, the features for write object, or the 683236884Smm * feature descriptions object. 
int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
        int error;
        nvpair_t *elem = NULL;
        boolean_t need_sync = B_FALSE;

        if ((error = spa_prop_validate(spa, nvp)) != 0)
                return (error);

        while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
                zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

                if (prop == ZPOOL_PROP_CACHEFILE ||
                    prop == ZPOOL_PROP_ALTROOT ||
                    prop == ZPOOL_PROP_READONLY)
                        continue;

                if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
                        uint64_t ver;

                        if (prop == ZPOOL_PROP_VERSION) {
                                VERIFY(nvpair_value_uint64(elem, &ver) == 0);
                        } else {
                                ASSERT(zpool_prop_feature(nvpair_name(elem)));
                                ver = SPA_VERSION_FEATURES;
                                need_sync = B_TRUE;
                        }

                        /* Save time if the version is already set. */
                        if (ver == spa_version(spa))
                                continue;

                        /*
                         * In addition to the pool directory object, we might
                         * create the pool properties object, the features for
                         * read object, the features for write object, or the
                         * feature descriptions object.
                         */
                        error = dsl_sync_task(spa->spa_name, NULL,
                            spa_sync_version, &ver, 6);
                        if (error)
                                return (error);
                        continue;
                }

                need_sync = B_TRUE;
                break;
        }

        if (need_sync) {
                return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
                    nvp, 6));
        }

        return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
        if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
                VERIFY(zap_remove(spa->spa_meta_objset,
                    spa->spa_pool_props_object,
                    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
                spa->spa_bootfs = 0;
        }
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
        uint64_t *newguid = arg;
        spa_t *spa = dmu_tx_pool(tx)->dp_spa;
        vdev_t *rvd = spa->spa_root_vdev;
        uint64_t vdev_state;

        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
        vdev_state = rvd->vdev_state;
        spa_config_exit(spa, SCL_STATE, FTAG);

        if (vdev_state != VDEV_STATE_HEALTHY)
                return (SET_ERROR(ENXIO));

        ASSERT3U(spa_guid(spa), !=, *newguid);

        return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
        uint64_t *newguid = arg;
        spa_t *spa = dmu_tx_pool(tx)->dp_spa;
        uint64_t oldguid;
        vdev_t *rvd = spa->spa_root_vdev;

        oldguid = spa_guid(spa);

        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
        rvd->vdev_guid = *newguid;
        rvd->vdev_guid_sum += (*newguid - oldguid);
        vdev_config_dirty(rvd);
        spa_config_exit(spa, SCL_STATE, FTAG);

        spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
            oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
        int error;
        uint64_t guid;

        mutex_enter(&spa->spa_vdev_top_lock);
        mutex_enter(&spa_namespace_lock);
        guid = spa_generate_guid(NULL);

        error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
            spa_change_guid_sync, &guid, 5);

        if (error == 0) {
                spa_config_sync(spa, B_FALSE, B_TRUE);
                spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
        }

        mutex_exit(&spa_namespace_lock);
        mutex_exit(&spa->spa_vdev_top_lock);

        return (error);
}

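/*
 * Illustration (editor's addition, not in the original source):
 * spa_change_guid() above is an instance of the dsl_sync_task() check/sync
 * pattern used throughout this file.  The check callback runs in open
 * context to validate the request, and the sync callback applies it from
 * syncing context.  The hypothetical pair below only sketches the shape of
 * such a task; the names are made up.
 */
#if 0	/* illustrative only */
static int
example_task_check(void *arg, dmu_tx_t *tx)
{
        spa_t *spa = dmu_tx_pool(tx)->dp_spa;

        /* Refuse the task if preconditions do not hold. */
        if (spa_suspended(spa))
                return (SET_ERROR(EAGAIN));
        return (0);
}

static void
example_task_sync(void *arg, dmu_tx_t *tx)
{
        spa_t *spa = dmu_tx_pool(tx)->dp_spa;

        /* Apply the change; this runs exactly once, in syncing context. */
        spa_history_log_internal(spa, "example change", tx, "example");
}

/*
 * Caller:
 *	error = dsl_sync_task(spa->spa_name, example_task_check,
 *	    example_task_sync, arg, 3);
 */
#endif
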
/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
        spa_error_entry_t *sa = (spa_error_entry_t *)a;
        spa_error_entry_t *sb = (spa_error_entry_t *)b;
        int ret;

        ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
            sizeof (zbookmark_t));

        if (ret < 0)
                return (-1);
        else if (ret > 0)
                return (1);
        else
                return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
        ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

        bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
        bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

        avl_create(&spa->spa_errlist_scrub,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_last,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
        const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
        enum zti_modes mode = ztip->zti_mode;
        uint_t value = ztip->zti_value;
        uint_t count = ztip->zti_count;
        spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
        char name[32];
        uint_t flags = 0;
        boolean_t batch = B_FALSE;

        if (mode == ZTI_MODE_NULL) {
                tqs->stqs_count = 0;
                tqs->stqs_taskq = NULL;
                return;
        }

        ASSERT3U(count, >, 0);

        tqs->stqs_count = count;
        tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

        switch (mode) {
        case ZTI_MODE_FIXED:
                ASSERT3U(value, >=, 1);
                value = MAX(value, 1);
                break;

        case ZTI_MODE_BATCH:
                batch = B_TRUE;
                flags |= TASKQ_THREADS_CPU_PCT;
                value = zio_taskq_batch_pct;
                break;

        default:
                panic("unrecognized mode for %s_%s taskq (%u:%u) in "
                    "spa_activate()",
                    zio_type_name[t], zio_taskq_types[q], mode, value);
                break;
        }

        for (uint_t i = 0; i < count; i++) {
                taskq_t *tq;

                if (count > 1) {
                        (void) snprintf(name, sizeof (name), "%s_%s_%u",
                            zio_type_name[t], zio_taskq_types[q], i);
                } else {
                        (void) snprintf(name, sizeof (name), "%s_%s",
                            zio_type_name[t], zio_taskq_types[q]);
                }

#ifdef SYSDC
                if (zio_taskq_sysdc && spa->spa_proc != &p0) {
                        if (batch)
                                flags |= TASKQ_DC_BATCH;

                        tq = taskq_create_sysdc(name, value, 50, INT_MAX,
                            spa->spa_proc, zio_taskq_basedc, flags);
                } else {
#endif
                        pri_t pri = maxclsyspri;
                        /*
                         * The write issue taskq can be extremely CPU
                         * intensive.  Run it at slightly lower priority
                         * than the other taskqs.
                         */
                        if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
                                pri--;

                        tq = taskq_create_proc(name, value, pri, 50,
                            INT_MAX, spa->spa_proc, flags);
#ifdef SYSDC
                }
#endif

                tqs->stqs_taskq[i] = tq;
        }
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
        spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

        if (tqs->stqs_taskq == NULL) {
                ASSERT0(tqs->stqs_count);
                return;
        }

        for (uint_t i = 0; i < tqs->stqs_count; i++) {
                ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
                taskq_destroy(tqs->stqs_taskq[i]);
        }

        kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
        tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
        spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
        taskq_t *tq;

        ASSERT3P(tqs->stqs_taskq, !=, NULL);
        ASSERT3U(tqs->stqs_count, !=, 0);

        if (tqs->stqs_count == 1) {
                tq = tqs->stqs_taskq[0];
        } else {
#ifdef _KERNEL
                tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
#else
                tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
        }

        taskq_dispatch_ent(tq, func, arg, flags, ent);
}

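/*
 * Illustration (editor's addition, not in the original source): a
 * hypothetical caller of spa_taskq_dispatch_ent().  The taskq_ent_t must be
 * initialized and must remain allocated until the function runs; in practice
 * the ZIO code embeds it in the zio and dispatches its own callback, so the
 * names below are made up.
 */
#if 0	/* illustrative only */
static void
example_task(void *arg)
{
        zio_t *zio = arg;

        zio_interrupt(zio);
}

static void
example_dispatch(spa_t *spa, zio_t *zio)
{
        /* Pick one of the ZIO_TYPE_READ interrupt taskqs and queue work. */
        spa_taskq_dispatch_ent(spa, ZIO_TYPE_READ, ZIO_TASKQ_INTERRUPT,
            example_task, zio, TQ_SLEEP, &zio->io_tqent);
}
#endif
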
static void
spa_create_zio_taskqs(spa_t *spa)
{
        for (int t = 0; t < ZIO_TYPES; t++) {
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
                        spa_taskqs_init(spa, t, q);
                }
        }
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
        callb_cpr_t cprinfo;

        spa_t *spa = arg;
        user_t *pu = PTOU(curproc);

        CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
            spa->spa_name);

        ASSERT(curproc != &p0);
        (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
            "zpool-%s", spa->spa_name);
        (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
        /* bind this thread to the requested psrset */
        if (zio_taskq_psrset_bind != PS_NONE) {
                pool_lock();
                mutex_enter(&cpu_lock);
                mutex_enter(&pidlock);
                mutex_enter(&curproc->p_lock);

                if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
                    0, NULL, NULL) == 0) {
                        curthread->t_bind_pset = zio_taskq_psrset_bind;
                } else {
                        cmn_err(CE_WARN,
                            "Couldn't bind process for zfs pool \"%s\" to "
                            "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
                }

                mutex_exit(&curproc->p_lock);
                mutex_exit(&pidlock);
                mutex_exit(&cpu_lock);
                pool_unlock();
        }
#endif

#ifdef SYSDC
        if (zio_taskq_sysdc) {
                sysdc_thread_enter(curthread, 100, 0);
        }
#endif

        spa->spa_proc = curproc;
        spa->spa_did = curthread->t_did;

        spa_create_zio_taskqs(spa);

        mutex_enter(&spa->spa_proc_lock);
        ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

        spa->spa_proc_state = SPA_PROC_ACTIVE;
        cv_broadcast(&spa->spa_proc_cv);

        CALLB_CPR_SAFE_BEGIN(&cprinfo);
        while (spa->spa_proc_state == SPA_PROC_ACTIVE)
                cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
        CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

        ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
        spa->spa_proc_state = SPA_PROC_GONE;
        spa->spa_proc = &p0;
        cv_broadcast(&spa->spa_proc_cv);
        CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

        mutex_enter(&curproc->p_lock);
        lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

        spa->spa_state = POOL_STATE_ACTIVE;
        spa->spa_mode = mode;

        spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
        spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

        /* Try to create a covering process */
        mutex_enter(&spa->spa_proc_lock);
        ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
        ASSERT(spa->spa_proc == &p0);
        spa->spa_did = 0;

#ifdef SPA_PROCESS
        /* Only create a process if we're going to be around a while. */
        if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
                if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
                    NULL, 0) == 0) {
                        spa->spa_proc_state = SPA_PROC_CREATED;
                        while (spa->spa_proc_state == SPA_PROC_CREATED) {
                                cv_wait(&spa->spa_proc_cv,
                                    &spa->spa_proc_lock);
                        }
                        ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
                        ASSERT(spa->spa_proc != &p0);
                        ASSERT(spa->spa_did != 0);
                } else {
#ifdef _KERNEL
                        cmn_err(CE_WARN,
                            "Couldn't create process for zfs pool \"%s\"\n",
                            spa->spa_name);
#endif
                }
        }
#endif	/* SPA_PROCESS */
        mutex_exit(&spa->spa_proc_lock);

        /* If we didn't create a process, we need to create our taskqs. */
        ASSERT(spa->spa_proc == &p0);
        if (spa->spa_proc == &p0) {
                spa_create_zio_taskqs(spa);
        }

        /*
         * Start TRIM thread.
         */
        trim_thread_create(spa);

        list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_config_dirty_node));
        list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_state_dirty_node));

        txg_list_create(&spa->spa_vdev_txg_list,
            offsetof(struct vdev, vdev_txg_node));

        avl_create(&spa->spa_errlist_scrub,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_last,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
        ASSERT(spa->spa_sync_on == B_FALSE);
        ASSERT(spa->spa_dsl_pool == NULL);
        ASSERT(spa->spa_root_vdev == NULL);
        ASSERT(spa->spa_async_zio_root == NULL);
        ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

        /*
         * Stop TRIM thread in case spa_unload() wasn't called directly
         * before spa_deactivate().
         */
        trim_thread_destroy(spa);

        txg_list_destroy(&spa->spa_vdev_txg_list);

        list_destroy(&spa->spa_config_dirty_list);
        list_destroy(&spa->spa_state_dirty_list);

        for (int t = 0; t < ZIO_TYPES; t++) {
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
                        spa_taskqs_fini(spa, t, q);
                }
        }

        metaslab_class_destroy(spa->spa_normal_class);
        spa->spa_normal_class = NULL;

        metaslab_class_destroy(spa->spa_log_class);
        spa->spa_log_class = NULL;

        /*
         * If this was part of an import or the open otherwise failed, we may
         * still have errors left in the queues.  Empty them just in case.
         */
        spa_errlog_drain(spa);

        avl_destroy(&spa->spa_errlist_scrub);
        avl_destroy(&spa->spa_errlist_last);

        spa->spa_state = POOL_STATE_UNINITIALIZED;

        mutex_enter(&spa->spa_proc_lock);
        if (spa->spa_proc_state != SPA_PROC_NONE) {
                ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
                spa->spa_proc_state = SPA_PROC_DEACTIVATE;
                cv_broadcast(&spa->spa_proc_cv);
                while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
                        ASSERT(spa->spa_proc != &p0);
                        cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
                }
                ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
                spa->spa_proc_state = SPA_PROC_NONE;
        }
        ASSERT(spa->spa_proc == &p0);
        mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
        /*
         * We want to make sure spa_thread() has actually exited the ZFS
         * module, so that the module can't be unloaded out from underneath
         * it.
         */
        if (spa->spa_did != 0) {
                thread_join(spa->spa_did);
                spa->spa_did = 0;
        }
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
        nvlist_t **child;
        uint_t children;
        int error;

        if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
                return (error);

        if ((*vdp)->vdev_ops->vdev_op_leaf)
                return (0);

        error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
            &child, &children);

        if (error == ENOENT)
                return (0);

        if (error) {
                vdev_free(*vdp);
                *vdp = NULL;
                return (SET_ERROR(EINVAL));
        }

        for (int c = 0; c < children; c++) {
                vdev_t *vd;
                if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
                    atype)) != 0) {
                        vdev_free(*vdp);
                        *vdp = NULL;
                        return (error);
                }
        }

        ASSERT(*vdp != NULL);

        return (0);
}

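/*
 * Illustration (editor's addition, not in the original source):
 * spa_config_parse() recurses over an nvlist tree whose interior nodes carry
 * a ZPOOL_CONFIG_CHILDREN array.  For a two-disk mirror the config passed in
 * looks roughly like:
 *
 *	vdev_tree -> {
 *		type     = "root"
 *		children = [
 *			{
 *				type     = "mirror"
 *				children = [
 *					{ type = "disk", path = "/dev/..." },
 *					{ type = "disk", path = "/dev/..." }
 *				]
 *			}
 *		]
 *	}
 *
 * Each node becomes one vdev_t, allocated by vdev_alloc() and left in the
 * CLOSED state until open/creation/import proceeds.
 */
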
/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
        int i;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        /*
         * Stop TRIM thread.
         */
        trim_thread_destroy(spa);

        /*
         * Stop async tasks.
         */
        spa_async_suspend(spa);

        /*
         * Stop syncing.
         */
        if (spa->spa_sync_on) {
                txg_sync_stop(spa->spa_dsl_pool);
                spa->spa_sync_on = B_FALSE;
        }

        /*
         * Wait for any outstanding async I/O to complete.
         */
        if (spa->spa_async_zio_root != NULL) {
                (void) zio_wait(spa->spa_async_zio_root);
                spa->spa_async_zio_root = NULL;
        }

        bpobj_close(&spa->spa_deferred_bpobj);

        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

        /*
         * Close all vdevs.
         */
        if (spa->spa_root_vdev)
                vdev_free(spa->spa_root_vdev);
        ASSERT(spa->spa_root_vdev == NULL);

        /*
         * Close the dsl pool.
         */
        if (spa->spa_dsl_pool) {
                dsl_pool_close(spa->spa_dsl_pool);
                spa->spa_dsl_pool = NULL;
                spa->spa_meta_objset = NULL;
        }

        ddt_unload(spa);

        /*
         * Drop and purge level 2 cache
         */
        spa_l2cache_drop(spa);

        for (i = 0; i < spa->spa_spares.sav_count; i++)
                vdev_free(spa->spa_spares.sav_vdevs[i]);
        if (spa->spa_spares.sav_vdevs) {
                kmem_free(spa->spa_spares.sav_vdevs,
                    spa->spa_spares.sav_count * sizeof (void *));
                spa->spa_spares.sav_vdevs = NULL;
        }
        if (spa->spa_spares.sav_config) {
                nvlist_free(spa->spa_spares.sav_config);
                spa->spa_spares.sav_config = NULL;
        }
        spa->spa_spares.sav_count = 0;

        for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
                vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
                vdev_free(spa->spa_l2cache.sav_vdevs[i]);
        }
        if (spa->spa_l2cache.sav_vdevs) {
                kmem_free(spa->spa_l2cache.sav_vdevs,
                    spa->spa_l2cache.sav_count * sizeof (void *));
                spa->spa_l2cache.sav_vdevs = NULL;
        }
        if (spa->spa_l2cache.sav_config) {
                nvlist_free(spa->spa_l2cache.sav_config);
                spa->spa_l2cache.sav_config = NULL;
        }
        spa->spa_l2cache.sav_count = 0;

        spa->spa_async_suspended = 0;

        if (spa->spa_comment != NULL) {
                spa_strfree(spa->spa_comment);
                spa->spa_comment = NULL;
        }

        spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
        nvlist_t **spares;
        uint_t nspares;
        int i;
        vdev_t *vd, *tvd;

        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

        /*
         * First, close and free any existing spare vdevs.
         */
        for (i = 0; i < spa->spa_spares.sav_count; i++) {
                vd = spa->spa_spares.sav_vdevs[i];

                /* Undo the call to spa_activate() below */
                if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
                    B_FALSE)) != NULL && tvd->vdev_isspare)
                        spa_spare_remove(tvd);
                vdev_close(vd);
                vdev_free(vd);
        }

        if (spa->spa_spares.sav_vdevs)
                kmem_free(spa->spa_spares.sav_vdevs,
                    spa->spa_spares.sav_count * sizeof (void *));

        if (spa->spa_spares.sav_config == NULL)
                nspares = 0;
        else
                VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
                    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

        spa->spa_spares.sav_count = (int)nspares;
        spa->spa_spares.sav_vdevs = NULL;

        if (nspares == 0)
                return;

        /*
         * Construct the array of vdevs, opening them to get status in the
         * process.  For each spare, there are potentially two different vdev_t
         * structures associated with it: one in the list of spares (used only
         * for basic validation purposes) and one in the active vdev
         * configuration (if it's spared in).  During this phase we open and
         * validate each vdev on the spare list.  If the vdev also exists in
         * the active configuration, then we also mark this vdev as an active
         * spare.
         */
        spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
            KM_SLEEP);
        for (i = 0; i < spa->spa_spares.sav_count; i++) {
                VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
                    VDEV_ALLOC_SPARE) == 0);
                ASSERT(vd != NULL);

                spa->spa_spares.sav_vdevs[i] = vd;

                if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
                    B_FALSE)) != NULL) {
                        if (!tvd->vdev_isspare)
                                spa_spare_add(tvd);

                        /*
                         * We only mark the spare active if we were successfully
                         * able to load the vdev.  Otherwise, importing a pool
                         * with a bad active spare would result in strange
                         * behavior, because multiple pools would think the
                         * spare is actively in use.
                         *
                         * There is a vulnerability here to an equally bizarre
                         * circumstance, where a dead active spare is later
                         * brought back to life (onlined or otherwise).  Given
                         * the rarity of this scenario, and the extra complexity
                         * it adds, we ignore the possibility.
                         */
                        if (!vdev_is_dead(tvd))
                                spa_spare_activate(tvd);
                }

                vd->vdev_top = vd;
                vd->vdev_aux = &spa->spa_spares;

                if (vdev_open(vd) != 0)
                        continue;

                if (vdev_validate_aux(vd) == 0)
                        spa_spare_add(vd);
        }

        /*
         * Recompute the stashed list of spares, with status information
         * this time.
         */
        VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
            DATA_TYPE_NVLIST_ARRAY) == 0);

        spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
            KM_SLEEP);
        for (i = 0; i < spa->spa_spares.sav_count; i++)
                spares[i] = vdev_config_generate(spa,
                    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
        VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
            ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
        for (i = 0; i < spa->spa_spares.sav_count; i++)
                nvlist_free(spares[i]);
        kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

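/*
 * Illustration (editor's addition, not in the original source): after
 * spa_load_spares() runs, spa_spares.sav_config holds a regenerated array of
 * per-spare nvlists under ZPOOL_CONFIG_SPARES, each produced by
 * vdev_config_generate() and therefore carrying current status, roughly:
 *
 *	spares = [
 *		{ type = "disk", path = "/dev/...", guid = <uint64>, ... },
 *		...
 *	]
 *
 * The older entries (without status) are removed first, which is why the
 * function calls nvlist_remove() before nvlist_add_nvlist_array().
 */
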
/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
        nvlist_t **l2cache;
        uint_t nl2cache;
        int i, j, oldnvdevs;
        uint64_t guid;
        vdev_t *vd, **oldvdevs, **newvdevs;
        spa_aux_vdev_t *sav = &spa->spa_l2cache;

        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

        if (sav->sav_config != NULL) {
                VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
                    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
                newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
        } else {
                nl2cache = 0;
                newvdevs = NULL;
        }

        oldvdevs = sav->sav_vdevs;
        oldnvdevs = sav->sav_count;
        sav->sav_vdevs = NULL;
        sav->sav_count = 0;

        /*
         * Process new nvlist of vdevs.
         */
        for (i = 0; i < nl2cache; i++) {
                VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
                    &guid) == 0);

                newvdevs[i] = NULL;
                for (j = 0; j < oldnvdevs; j++) {
                        vd = oldvdevs[j];
                        if (vd != NULL && guid == vd->vdev_guid) {
                                /*
                                 * Retain previous vdev for add/remove ops.
1507185029Spjd */ 1508185029Spjd newvdevs[i] = vd; 1509185029Spjd oldvdevs[j] = NULL; 1510185029Spjd break; 1511185029Spjd } 1512185029Spjd } 1513185029Spjd 1514185029Spjd if (newvdevs[i] == NULL) { 1515185029Spjd /* 1516185029Spjd * Create new vdev 1517185029Spjd */ 1518185029Spjd VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1519185029Spjd VDEV_ALLOC_L2CACHE) == 0); 1520185029Spjd ASSERT(vd != NULL); 1521185029Spjd newvdevs[i] = vd; 1522185029Spjd 1523185029Spjd /* 1524185029Spjd * Commit this vdev as an l2cache device, 1525185029Spjd * even if it fails to open. 1526185029Spjd */ 1527185029Spjd spa_l2cache_add(vd); 1528185029Spjd 1529185029Spjd vd->vdev_top = vd; 1530185029Spjd vd->vdev_aux = sav; 1531185029Spjd 1532185029Spjd spa_l2cache_activate(vd); 1533185029Spjd 1534185029Spjd if (vdev_open(vd) != 0) 1535185029Spjd continue; 1536185029Spjd 1537185029Spjd (void) vdev_validate_aux(vd); 1538185029Spjd 1539219089Spjd if (!vdev_is_dead(vd)) 1540219089Spjd l2arc_add_vdev(spa, vd); 1541185029Spjd } 1542185029Spjd } 1543185029Spjd 1544185029Spjd /* 1545185029Spjd * Purge vdevs that were dropped 1546185029Spjd */ 1547185029Spjd for (i = 0; i < oldnvdevs; i++) { 1548185029Spjd uint64_t pool; 1549185029Spjd 1550185029Spjd vd = oldvdevs[i]; 1551185029Spjd if (vd != NULL) { 1552230514Smm ASSERT(vd->vdev_isl2cache); 1553230514Smm 1554209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1555209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1556185029Spjd l2arc_remove_vdev(vd); 1557230514Smm vdev_clear_stats(vd); 1558230514Smm vdev_free(vd); 1559185029Spjd } 1560185029Spjd } 1561185029Spjd 1562185029Spjd if (oldvdevs) 1563185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1564185029Spjd 1565185029Spjd if (sav->sav_config == NULL) 1566185029Spjd goto out; 1567185029Spjd 1568185029Spjd sav->sav_vdevs = newvdevs; 1569185029Spjd sav->sav_count = (int)nl2cache; 1570185029Spjd 1571185029Spjd /* 1572185029Spjd * Recompute the stashed list of l2cache devices, with status 1573185029Spjd * information this time. 
1574185029Spjd */ 1575185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1576185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1577185029Spjd 1578185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1579185029Spjd for (i = 0; i < sav->sav_count; i++) 1580185029Spjd l2cache[i] = vdev_config_generate(spa, 1581219089Spjd sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1582185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1583185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1584185029Spjdout: 1585185029Spjd for (i = 0; i < sav->sav_count; i++) 1586185029Spjd nvlist_free(l2cache[i]); 1587185029Spjd if (sav->sav_count) 1588185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1589185029Spjd} 1590185029Spjd 1591168404Spjdstatic int 1592168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1593168404Spjd{ 1594168404Spjd dmu_buf_t *db; 1595168404Spjd char *packed = NULL; 1596168404Spjd size_t nvsize = 0; 1597168404Spjd int error; 1598168404Spjd *value = NULL; 1599168404Spjd 1600262676Sdelphij error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 1601262676Sdelphij if (error != 0) 1602262676Sdelphij return (error); 1603168404Spjd nvsize = *(uint64_t *)db->db_data; 1604168404Spjd dmu_buf_rele(db, FTAG); 1605168404Spjd 1606168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1607209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1608209962Smm DMU_READ_PREFETCH); 1609168404Spjd if (error == 0) 1610168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1611168404Spjd kmem_free(packed, nvsize); 1612168404Spjd 1613168404Spjd return (error); 1614168404Spjd} 1615168404Spjd 1616168404Spjd/* 1617185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1618185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 1619185029Spjd */ 1620185029Spjdstatic void 1621185029Spjdspa_check_removed(vdev_t *vd) 1622185029Spjd{ 1623219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1624185029Spjd spa_check_removed(vd->vdev_child[c]); 1625185029Spjd 1626249188Smm if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 1627249188Smm !vd->vdev_ishole) { 1628185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1629185029Spjd spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1630185029Spjd } 1631185029Spjd} 1632185029Spjd 1633185029Spjd/* 1634219089Spjd * Validate the current config against the MOS config 1635213197Smm */ 1636219089Spjdstatic boolean_t 1637219089Spjdspa_config_valid(spa_t *spa, nvlist_t *config) 1638213197Smm{ 1639219089Spjd vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1640219089Spjd nvlist_t *nv; 1641213197Smm 1642219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1643213197Smm 1644219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1645219089Spjd VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1646219089Spjd 1647219089Spjd ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1648219089Spjd 1649219089Spjd /* 1650219089Spjd * If we're doing a normal import, then build up any additional 1651219089Spjd * diagnostic information about missing devices in this config. 1652219089Spjd * We'll pass this up to the user for further processing. 
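 *
 * A minimal consumer-side sketch of how that diagnostic information
 * can be read back out of the load-info nvlist; the local names
 * load_info, missing, child and nchild are illustrative only:
 *
 *	nvlist_t *missing, **child;
 *	uint_t nchild;
 *
 *	if (nvlist_lookup_nvlist(load_info,
 *	    ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0 &&
 *	    nvlist_lookup_nvlist_array(missing,
 *	    ZPOOL_CONFIG_CHILDREN, &child, &nchild) == 0) {
 *		... each child[i] is a vdev config generated by
 *		vdev_config_generate() for a missing log device ...
 *	}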
1653219089Spjd */ 1654219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1655219089Spjd nvlist_t **child, *nv; 1656219089Spjd uint64_t idx = 0; 1657219089Spjd 1658219089Spjd child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1659219089Spjd KM_SLEEP); 1660219089Spjd VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1661219089Spjd 1662219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1663219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1664219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1665219089Spjd 1666219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1667219089Spjd mtvd->vdev_ops != &vdev_missing_ops && 1668219089Spjd mtvd->vdev_islog) 1669219089Spjd child[idx++] = vdev_config_generate(spa, mtvd, 1670219089Spjd B_FALSE, 0); 1671219089Spjd } 1672219089Spjd 1673219089Spjd if (idx) { 1674219089Spjd VERIFY(nvlist_add_nvlist_array(nv, 1675219089Spjd ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1676219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1677219089Spjd ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1678219089Spjd 1679219089Spjd for (int i = 0; i < idx; i++) 1680219089Spjd nvlist_free(child[i]); 1681219089Spjd } 1682219089Spjd nvlist_free(nv); 1683219089Spjd kmem_free(child, rvd->vdev_children * sizeof (char **)); 1684219089Spjd } 1685219089Spjd 1686219089Spjd /* 1687219089Spjd * Compare the root vdev tree with the information we have 1688219089Spjd * from the MOS config (mrvd). Check each top-level vdev 1689219089Spjd * with the corresponding MOS config top-level (mtvd). 1690219089Spjd */ 1691219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1692213197Smm vdev_t *tvd = rvd->vdev_child[c]; 1693219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1694213197Smm 1695219089Spjd /* 1696219089Spjd * Resolve any "missing" vdevs in the current configuration. 1697219089Spjd * If we find that the MOS config has more accurate information 1698219089Spjd * about the top-level vdev then use that vdev instead. 1699219089Spjd */ 1700219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1701219089Spjd mtvd->vdev_ops != &vdev_missing_ops) { 1702219089Spjd 1703219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1704219089Spjd continue; 1705219089Spjd 1706219089Spjd /* 1707219089Spjd * Device specific actions. 1708219089Spjd */ 1709219089Spjd if (mtvd->vdev_islog) { 1710219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 1711219089Spjd } else { 1712219089Spjd /* 1713219089Spjd * XXX - once we have 'readonly' pool 1714219089Spjd * support we should be able to handle 1715219089Spjd * missing data devices by transitioning 1716219089Spjd * the pool to readonly. 1717219089Spjd */ 1718219089Spjd continue; 1719219089Spjd } 1720219089Spjd 1721219089Spjd /* 1722219089Spjd * Swap the missing vdev with the data we were 1723219089Spjd * able to obtain from the MOS config. 1724219089Spjd */ 1725219089Spjd vdev_remove_child(rvd, tvd); 1726219089Spjd vdev_remove_child(mrvd, mtvd); 1727219089Spjd 1728219089Spjd vdev_add_child(rvd, mtvd); 1729219089Spjd vdev_add_child(mrvd, tvd); 1730219089Spjd 1731219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1732219089Spjd vdev_load(mtvd); 1733219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1734219089Spjd 1735219089Spjd vdev_reopen(rvd); 1736219089Spjd } else if (mtvd->vdev_islog) { 1737219089Spjd /* 1738219089Spjd * Load the slog device's state from the MOS config 1739219089Spjd * since it's possible that the label does not 1740219089Spjd * contain the most up-to-date information. 
1741219089Spjd */ 1742219089Spjd vdev_load_log_state(tvd, mtvd); 1743219089Spjd vdev_reopen(tvd); 1744219089Spjd } 1745213197Smm } 1746219089Spjd vdev_free(mrvd); 1747219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1748219089Spjd 1749219089Spjd /* 1750219089Spjd * Ensure we were able to validate the config. 1751219089Spjd */ 1752219089Spjd return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1753213197Smm} 1754213197Smm 1755213197Smm/* 1756185029Spjd * Check for missing log devices 1757185029Spjd */ 1758248571Smmstatic boolean_t 1759185029Spjdspa_check_logs(spa_t *spa) 1760185029Spjd{ 1761248571Smm boolean_t rv = B_FALSE; 1762248571Smm 1763185029Spjd switch (spa->spa_log_state) { 1764185029Spjd case SPA_LOG_MISSING: 1765185029Spjd /* need to recheck in case slog has been restored */ 1766185029Spjd case SPA_LOG_UNKNOWN: 1767248571Smm rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain, 1768248571Smm NULL, DS_FIND_CHILDREN) != 0); 1769248571Smm if (rv) 1770219089Spjd spa_set_log_state(spa, SPA_LOG_MISSING); 1771185029Spjd break; 1772185029Spjd } 1773248571Smm return (rv); 1774185029Spjd} 1775185029Spjd 1776219089Spjdstatic boolean_t 1777219089Spjdspa_passivate_log(spa_t *spa) 1778219089Spjd{ 1779219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1780219089Spjd boolean_t slog_found = B_FALSE; 1781219089Spjd 1782219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1783219089Spjd 1784219089Spjd if (!spa_has_slogs(spa)) 1785219089Spjd return (B_FALSE); 1786219089Spjd 1787219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1788219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1789219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1790219089Spjd 1791219089Spjd if (tvd->vdev_islog) { 1792219089Spjd metaslab_group_passivate(mg); 1793219089Spjd slog_found = B_TRUE; 1794219089Spjd } 1795219089Spjd } 1796219089Spjd 1797219089Spjd return (slog_found); 1798219089Spjd} 1799219089Spjd 1800219089Spjdstatic void 1801219089Spjdspa_activate_log(spa_t *spa) 1802219089Spjd{ 1803219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1804219089Spjd 1805219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1806219089Spjd 1807219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1808219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1809219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1810219089Spjd 1811219089Spjd if (tvd->vdev_islog) 1812219089Spjd metaslab_group_activate(mg); 1813219089Spjd } 1814219089Spjd} 1815219089Spjd 1816219089Spjdint 1817219089Spjdspa_offline_log(spa_t *spa) 1818219089Spjd{ 1819248571Smm int error; 1820219089Spjd 1821248571Smm error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1822248571Smm NULL, DS_FIND_CHILDREN); 1823248571Smm if (error == 0) { 1824219089Spjd /* 1825219089Spjd * We successfully offlined the log device, sync out the 1826219089Spjd * current txg so that the "stubby" block can be removed 1827219089Spjd * by zil_sync(). 
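 *
 * For context, a sketch of how the three helpers above can be
 * combined by a caller that wants to evacuate the log devices
 * (an outline only, not the actual removal path):
 *
 *	if (spa_passivate_log(spa)) {
 *		error = spa_offline_log(spa);
 *		if (error != 0)
 *			spa_activate_log(spa);
 *	}
 *
 * spa_passivate_log() stops new allocations from the log metaslab
 * groups, spa_offline_log() runs zil_vdev_offline() over every
 * dataset and waits for the resulting txg to sync, and
 * spa_activate_log() re-enables the log devices if that fails.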
1828219089Spjd */ 1829219089Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1830219089Spjd } 1831219089Spjd return (error); 1832219089Spjd} 1833219089Spjd 1834219089Spjdstatic void 1835219089Spjdspa_aux_check_removed(spa_aux_vdev_t *sav) 1836219089Spjd{ 1837219089Spjd int i; 1838219089Spjd 1839219089Spjd for (i = 0; i < sav->sav_count; i++) 1840219089Spjd spa_check_removed(sav->sav_vdevs[i]); 1841219089Spjd} 1842219089Spjd 1843219089Spjdvoid 1844219089Spjdspa_claim_notify(zio_t *zio) 1845219089Spjd{ 1846219089Spjd spa_t *spa = zio->io_spa; 1847219089Spjd 1848219089Spjd if (zio->io_error) 1849219089Spjd return; 1850219089Spjd 1851219089Spjd mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1852219089Spjd if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1853219089Spjd spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1854219089Spjd mutex_exit(&spa->spa_props_lock); 1855219089Spjd} 1856219089Spjd 1857219089Spjdtypedef struct spa_load_error { 1858219089Spjd uint64_t sle_meta_count; 1859219089Spjd uint64_t sle_data_count; 1860219089Spjd} spa_load_error_t; 1861219089Spjd 1862219089Spjdstatic void 1863219089Spjdspa_load_verify_done(zio_t *zio) 1864219089Spjd{ 1865219089Spjd blkptr_t *bp = zio->io_bp; 1866219089Spjd spa_load_error_t *sle = zio->io_private; 1867219089Spjd dmu_object_type_t type = BP_GET_TYPE(bp); 1868219089Spjd int error = zio->io_error; 1869219089Spjd 1870219089Spjd if (error) { 1871236884Smm if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 1872219089Spjd type != DMU_OT_INTENT_LOG) 1873219089Spjd atomic_add_64(&sle->sle_meta_count, 1); 1874219089Spjd else 1875219089Spjd atomic_add_64(&sle->sle_data_count, 1); 1876219089Spjd } 1877219089Spjd zio_data_buf_free(zio->io_data, zio->io_size); 1878219089Spjd} 1879219089Spjd 1880219089Spjd/*ARGSUSED*/ 1881219089Spjdstatic int 1882219089Spjdspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1883246666Smm const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1884219089Spjd{ 1885260150Sdelphij if (!BP_IS_HOLE(bp)) { 1886219089Spjd zio_t *rio = arg; 1887219089Spjd size_t size = BP_GET_PSIZE(bp); 1888219089Spjd void *data = zio_data_buf_alloc(size); 1889219089Spjd 1890219089Spjd zio_nowait(zio_read(rio, spa, bp, data, size, 1891219089Spjd spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1892219089Spjd ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1893219089Spjd ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1894219089Spjd } 1895219089Spjd return (0); 1896219089Spjd} 1897219089Spjd 1898219089Spjdstatic int 1899219089Spjdspa_load_verify(spa_t *spa) 1900219089Spjd{ 1901219089Spjd zio_t *rio; 1902219089Spjd spa_load_error_t sle = { 0 }; 1903219089Spjd zpool_rewind_policy_t policy; 1904219089Spjd boolean_t verify_ok = B_FALSE; 1905219089Spjd int error; 1906219089Spjd 1907219089Spjd zpool_get_rewind_policy(spa->spa_config, &policy); 1908219089Spjd 1909219089Spjd if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1910219089Spjd return (0); 1911219089Spjd 1912219089Spjd rio = zio_root(spa, NULL, &sle, 1913219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1914219089Spjd 1915219089Spjd error = traverse_pool(spa, spa->spa_verify_min_txg, 1916219089Spjd TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1917219089Spjd 1918219089Spjd (void) zio_wait(rio); 1919219089Spjd 1920219089Spjd spa->spa_load_meta_errors = sle.sle_meta_count; 1921219089Spjd spa->spa_load_data_errors = sle.sle_data_count; 1922219089Spjd 1923219089Spjd if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1924219089Spjd sle.sle_data_count <= 
policy.zrp_maxdata) { 1925219089Spjd int64_t loss = 0; 1926219089Spjd 1927219089Spjd verify_ok = B_TRUE; 1928219089Spjd spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1929219089Spjd spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1930219089Spjd 1931219089Spjd loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1932219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1933219089Spjd ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1934219089Spjd VERIFY(nvlist_add_int64(spa->spa_load_info, 1935219089Spjd ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1936219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1937219089Spjd ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1938219089Spjd } else { 1939219089Spjd spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1940219089Spjd } 1941219089Spjd 1942219089Spjd if (error) { 1943219089Spjd if (error != ENXIO && error != EIO) 1944249195Smm error = SET_ERROR(EIO); 1945219089Spjd return (error); 1946219089Spjd } 1947219089Spjd 1948219089Spjd return (verify_ok ? 0 : EIO); 1949219089Spjd} 1950219089Spjd 1951185029Spjd/* 1952219089Spjd * Find a value in the pool props object. 1953168404Spjd */ 1954219089Spjdstatic void 1955219089Spjdspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1956219089Spjd{ 1957219089Spjd (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1958219089Spjd zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1959219089Spjd} 1960219089Spjd 1961219089Spjd/* 1962219089Spjd * Find a value in the pool directory object. 1963219089Spjd */ 1964168404Spjdstatic int 1965219089Spjdspa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1966168404Spjd{ 1967219089Spjd return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1968219089Spjd name, sizeof (uint64_t), 1, val)); 1969219089Spjd} 1970168404Spjd 1971219089Spjdstatic int 1972219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1973219089Spjd{ 1974219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1975219089Spjd return (err); 1976219089Spjd} 1977219089Spjd 1978219089Spjd/* 1979219089Spjd * Fix up config after a partly-completed split. This is done with the 1980219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1981219089Spjd * pool have that entry in their config, but only the splitting one contains 1982219089Spjd * a list of all the guids of the vdevs that are being split off. 1983219089Spjd * 1984219089Spjd * This function determines what to do with that list: either rejoin 1985219089Spjd * all the disks to the pool, or complete the splitting process. To attempt 1986219089Spjd * the rejoin, each disk that is offlined is marked online again, and 1987219089Spjd * we do a reopen() call. If the vdev label for every disk that was 1988219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1989219089Spjd * then we call vdev_split() on each disk, and complete the split. 1990219089Spjd * 1991219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 1992219089Spjd * the original pool. 
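 *
 * The portion of the config that this function consumes looks
 * roughly like the following (a sketch; only the members used
 * here are shown):
 *
 *	ZPOOL_CONFIG_SPLIT			(nvlist)
 *		ZPOOL_CONFIG_SPLIT_LIST		(uint64 array)
 *			one guid per vdev being split off, with 0
 *			standing in for hole vdevs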
1993219089Spjd */ 1994219089Spjdstatic void 1995219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 1996219089Spjd{ 1997219089Spjd uint_t extracted; 1998219089Spjd uint64_t *glist; 1999219089Spjd uint_t i, gcount; 2000219089Spjd nvlist_t *nvl; 2001219089Spjd vdev_t **vd; 2002219089Spjd boolean_t attempt_reopen; 2003219089Spjd 2004219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2005219089Spjd return; 2006219089Spjd 2007219089Spjd /* check that the config is complete */ 2008219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2009219089Spjd &glist, &gcount) != 0) 2010219089Spjd return; 2011219089Spjd 2012219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2013219089Spjd 2014219089Spjd /* attempt to online all the vdevs & validate */ 2015219089Spjd attempt_reopen = B_TRUE; 2016219089Spjd for (i = 0; i < gcount; i++) { 2017219089Spjd if (glist[i] == 0) /* vdev is hole */ 2018219089Spjd continue; 2019219089Spjd 2020219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2021219089Spjd if (vd[i] == NULL) { 2022219089Spjd /* 2023219089Spjd * Don't bother attempting to reopen the disks; 2024219089Spjd * just do the split. 2025219089Spjd */ 2026219089Spjd attempt_reopen = B_FALSE; 2027219089Spjd } else { 2028219089Spjd /* attempt to re-online it */ 2029219089Spjd vd[i]->vdev_offline = B_FALSE; 2030219089Spjd } 2031219089Spjd } 2032219089Spjd 2033219089Spjd if (attempt_reopen) { 2034219089Spjd vdev_reopen(spa->spa_root_vdev); 2035219089Spjd 2036219089Spjd /* check each device to see what state it's in */ 2037219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 2038219089Spjd if (vd[i] != NULL && 2039219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2040219089Spjd break; 2041219089Spjd ++extracted; 2042219089Spjd } 2043219089Spjd } 2044219089Spjd 2045209962Smm /* 2046219089Spjd * If every disk has been moved to the new pool, or if we never 2047219089Spjd * even attempted to look at them, then we split them off for 2048219089Spjd * good. 2049209962Smm */ 2050219089Spjd if (!attempt_reopen || gcount == extracted) { 2051219089Spjd for (i = 0; i < gcount; i++) 2052219089Spjd if (vd[i] != NULL) 2053219089Spjd vdev_split(vd[i]); 2054219089Spjd vdev_reopen(spa->spa_root_vdev); 2055219089Spjd } 2056209962Smm 2057219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 2058219089Spjd} 2059185029Spjd 2060219089Spjdstatic int 2061219089Spjdspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2062219089Spjd boolean_t mosconfig) 2063219089Spjd{ 2064219089Spjd nvlist_t *config = spa->spa_config; 2065219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 2066228103Smm char *comment; 2067219089Spjd int error; 2068219089Spjd uint64_t pool_guid; 2069219089Spjd nvlist_t *nvl; 2070168404Spjd 2071219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2072249195Smm return (SET_ERROR(EINVAL)); 2073168404Spjd 2074228103Smm ASSERT(spa->spa_comment == NULL); 2075228103Smm if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2076228103Smm spa->spa_comment = spa_strdup(comment); 2077228103Smm 2078168404Spjd /* 2079168404Spjd * Versioning wasn't explicitly added to the label until later, so if 2080168404Spjd * it's not present treat it as the initial version. 
2081168404Spjd */ 2082219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2083219089Spjd &spa->spa_ubsync.ub_version) != 0) 2084219089Spjd spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2085168404Spjd 2086168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2087168404Spjd &spa->spa_config_txg); 2088168404Spjd 2089168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2090168404Spjd spa_guid_exists(pool_guid, 0)) { 2091249195Smm error = SET_ERROR(EEXIST); 2092219089Spjd } else { 2093228103Smm spa->spa_config_guid = pool_guid; 2094219089Spjd 2095219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2096219089Spjd &nvl) == 0) { 2097219089Spjd VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2098219089Spjd KM_SLEEP) == 0); 2099219089Spjd } 2100219089Spjd 2101236884Smm nvlist_free(spa->spa_load_info); 2102236884Smm spa->spa_load_info = fnvlist_alloc(); 2103236884Smm 2104219089Spjd gethrestime(&spa->spa_loaded_ts); 2105219089Spjd error = spa_load_impl(spa, pool_guid, config, state, type, 2106219089Spjd mosconfig, &ereport); 2107168404Spjd } 2108168404Spjd 2109219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 2110219089Spjd if (error) { 2111219089Spjd if (error != EEXIST) { 2112219089Spjd spa->spa_loaded_ts.tv_sec = 0; 2113219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 2114219089Spjd } 2115219089Spjd if (error != EBADF) { 2116219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2117219089Spjd } 2118219089Spjd } 2119219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2120219089Spjd spa->spa_ena = 0; 2121168404Spjd 2122219089Spjd return (error); 2123219089Spjd} 2124219089Spjd 2125219089Spjd/* 2126219089Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 2127219089Spjd * source of configuration information. 2128219089Spjd */ 2129219089Spjdstatic int 2130219089Spjdspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2131219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2132219089Spjd char **ereport) 2133219089Spjd{ 2134219089Spjd int error = 0; 2135219089Spjd nvlist_t *nvroot = NULL; 2136236884Smm nvlist_t *label; 2137219089Spjd vdev_t *rvd; 2138219089Spjd uberblock_t *ub = &spa->spa_uberblock; 2139219089Spjd uint64_t children, config_cache_txg = spa->spa_config_txg; 2140219089Spjd int orig_mode = spa->spa_mode; 2141219089Spjd int parse; 2142219089Spjd uint64_t obj; 2143236884Smm boolean_t missing_feat_write = B_FALSE; 2144219089Spjd 2145168404Spjd /* 2146219089Spjd * If this is an untrusted config, access the pool in read-only mode. 2147219089Spjd * This prevents things like resilvering recently removed devices. 2148219089Spjd */ 2149219089Spjd if (!mosconfig) 2150219089Spjd spa->spa_mode = FREAD; 2151219089Spjd 2152219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2153219089Spjd 2154219089Spjd spa->spa_load_state = state; 2155219089Spjd 2156219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2157249195Smm return (SET_ERROR(EINVAL)); 2158219089Spjd 2159219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 2160219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2161219089Spjd 2162219089Spjd /* 2163209962Smm * Create "The Godfather" zio to hold all async IOs 2164209962Smm */ 2165209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2166209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2167209962Smm 2168209962Smm /* 2169168404Spjd * Parse the configuration into a vdev tree. 
We explicitly set the 2170168404Spjd * value that will be returned by spa_version() since parsing the 2171168404Spjd * configuration requires knowing the version number. 2172168404Spjd */ 2173185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2174219089Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2175185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2176168404Spjd 2177168404Spjd if (error != 0) 2178219089Spjd return (error); 2179168404Spjd 2180168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2181168404Spjd 2182219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2183219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2184219089Spjd } 2185219089Spjd 2186168404Spjd /* 2187168404Spjd * Try to open all vdevs, loading each label in the process. 2188168404Spjd */ 2189185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2190168926Spjd error = vdev_open(rvd); 2191185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2192168926Spjd if (error != 0) 2193219089Spjd return (error); 2194168404Spjd 2195168404Spjd /* 2196209962Smm * We need to validate the vdev labels against the configuration that 2197209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 2198209962Smm * mosconfig is true then we're validating the vdev labels based on 2199219089Spjd * that config. Otherwise, we're validating against the cached config 2200209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 2201209962Smm * later we will recursively call spa_load() and validate against 2202209962Smm * the vdev config. 2203219089Spjd * 2204219089Spjd * If we're assembling a new pool that's been split off from an 2205219089Spjd * existing pool, the labels haven't yet been updated so we skip 2206219089Spjd * validation for now. 2207168404Spjd */ 2208219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2209219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2210230514Smm error = vdev_validate(rvd, mosconfig); 2211219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2212168404Spjd 2213219089Spjd if (error != 0) 2214219089Spjd return (error); 2215219089Spjd 2216219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2217249195Smm return (SET_ERROR(ENXIO)); 2218168404Spjd } 2219168404Spjd 2220168404Spjd /* 2221168404Spjd * Find the best uberblock. 2222168404Spjd */ 2223236884Smm vdev_uberblock_load(rvd, ub, &label); 2224168404Spjd 2225168404Spjd /* 2226168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2227168404Spjd */ 2228236884Smm if (ub->ub_txg == 0) { 2229236884Smm nvlist_free(label); 2230219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2231236884Smm } 2232168404Spjd 2233168404Spjd /* 2234236884Smm * If the pool has an unsupported version we can't open it. 2235168404Spjd */ 2236236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2237236884Smm nvlist_free(label); 2238219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2239236884Smm } 2240168404Spjd 2241236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2242236884Smm nvlist_t *features; 2243236884Smm 2244236884Smm /* 2245236884Smm * If we weren't able to find what's necessary for reading the 2246236884Smm * MOS in the label, return failure. 
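 *
 * ZPOOL_CONFIG_FEATURES_FOR_READ in the label is simply an nvlist
 * whose pair names are feature guids, along the lines of (the
 * feature names here are illustrative only):
 *
 *	features_for_read:
 *		com.delphix:hole_birth
 *		com.delphix:embedded_data
 *
 * Each name is checked with zfeature_is_supported() below; any name
 * this code does not recognize is collected into
 * ZPOOL_CONFIG_UNSUP_FEAT and the open fails with ENOTSUP.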
2247236884Smm */ 2248236884Smm if (label == NULL || nvlist_lookup_nvlist(label, 2249236884Smm ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2250236884Smm nvlist_free(label); 2251236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2252236884Smm ENXIO)); 2253236884Smm } 2254236884Smm 2255236884Smm /* 2256236884Smm * Update our in-core representation with the definitive values 2257236884Smm * from the label. 2258236884Smm */ 2259236884Smm nvlist_free(spa->spa_label_features); 2260236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2261236884Smm } 2262236884Smm 2263236884Smm nvlist_free(label); 2264236884Smm 2265168404Spjd /* 2266236884Smm * Look through entries in the label nvlist's features_for_read. If 2267236884Smm * there is a feature listed there which we don't understand then we 2268236884Smm * cannot open a pool. 2269236884Smm */ 2270236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2271236884Smm nvlist_t *unsup_feat; 2272236884Smm 2273236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2274236884Smm 0); 2275236884Smm 2276236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2277236884Smm NULL); nvp != NULL; 2278236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2279236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2280236884Smm VERIFY(nvlist_add_string(unsup_feat, 2281236884Smm nvpair_name(nvp), "") == 0); 2282236884Smm } 2283236884Smm } 2284236884Smm 2285236884Smm if (!nvlist_empty(unsup_feat)) { 2286236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2287236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2288236884Smm nvlist_free(unsup_feat); 2289236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2290236884Smm ENOTSUP)); 2291236884Smm } 2292236884Smm 2293236884Smm nvlist_free(unsup_feat); 2294236884Smm } 2295236884Smm 2296236884Smm /* 2297168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 2298219089Spjd * incomplete configuration. We first check to see if the pool 2299219089Spjd * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2300219089Spjd * If it is, defer the vdev_guid_sum check till later so we 2301219089Spjd * can handle missing vdevs. 2302168404Spjd */ 2303219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2304219089Spjd &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2305219089Spjd rvd->vdev_guid_sum != ub->ub_guid_sum) 2306219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2307219089Spjd 2308219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2309219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2310219089Spjd spa_try_repair(spa, config); 2311219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2312219089Spjd nvlist_free(spa->spa_config_splitting); 2313219089Spjd spa->spa_config_splitting = NULL; 2314168404Spjd } 2315168404Spjd 2316168404Spjd /* 2317168404Spjd * Initialize internal SPA structures. 2318168404Spjd */ 2319168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 2320168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2321219089Spjd spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2322219089Spjd TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2323219089Spjd spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2324219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2325219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2326219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2327219089Spjd 2328236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2329219089Spjd if (error) 2330219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2331168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2332168404Spjd 2333219089Spjd if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2334219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2335168404Spjd 2336236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2337236884Smm boolean_t missing_feat_read = B_FALSE; 2338238926Smm nvlist_t *unsup_feat, *enabled_feat; 2339236884Smm 2340236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2341236884Smm &spa->spa_feat_for_read_obj) != 0) { 2342236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2343236884Smm } 2344236884Smm 2345236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2346236884Smm &spa->spa_feat_for_write_obj) != 0) { 2347236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2348236884Smm } 2349236884Smm 2350236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2351236884Smm &spa->spa_feat_desc_obj) != 0) { 2352236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2353236884Smm } 2354236884Smm 2355238926Smm enabled_feat = fnvlist_alloc(); 2356238926Smm unsup_feat = fnvlist_alloc(); 2357236884Smm 2358259813Sdelphij if (!spa_features_check(spa, B_FALSE, 2359238926Smm unsup_feat, enabled_feat)) 2360236884Smm missing_feat_read = B_TRUE; 2361236884Smm 2362236884Smm if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2363259813Sdelphij if (!spa_features_check(spa, B_TRUE, 2364238926Smm unsup_feat, enabled_feat)) { 2365236884Smm missing_feat_write = B_TRUE; 2366238926Smm } 2367236884Smm } 2368236884Smm 2369238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2370238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2371238926Smm 2372236884Smm if (!nvlist_empty(unsup_feat)) { 2373238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2374238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2375236884Smm } 2376236884Smm 2377238926Smm fnvlist_free(enabled_feat); 2378238926Smm fnvlist_free(unsup_feat); 2379236884Smm 2380236884Smm if (!missing_feat_read) { 2381236884Smm fnvlist_add_boolean(spa->spa_load_info, 2382236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2383236884Smm } 2384236884Smm 2385236884Smm /* 2386236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2387236884Smm * twofold: to determine whether the pool is available for 2388236884Smm * import in read-write mode and (if it is not) whether the 2389236884Smm * pool is available for import in read-only mode. If the pool 2390236884Smm * is available for import in read-write mode, it is displayed 2391236884Smm * as available in userland; if it is not available for import 2392236884Smm * in read-only mode, it is displayed as unavailable in 2393236884Smm * userland. If the pool is available for import in read-only 2394236884Smm * mode but not read-write mode, it is displayed as unavailable 2395236884Smm * in userland with a special note that the pool is actually 2396236884Smm * available for open in read-only mode. 
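 *
 * The cases handled below reduce to the following (a sketch):
 *
 *	missing_feat_read  missing_feat_write  outcome
 *	false              false               importable read-write
 *	false              true                CAN_RDONLY is set; a
 *	                                       read-write import fails
 *	                                       with ENOTSUP
 *	true               either              not importable (ENOTSUP)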
2397236884Smm * 2398236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2399236884Smm * missing a feature for write, we must first determine whether 2400236884Smm * the pool can be opened read-only before returning to 2401236884Smm * userland in order to know whether to display the 2402236884Smm * abovementioned note. 2403236884Smm */ 2404236884Smm if (missing_feat_read || (missing_feat_write && 2405236884Smm spa_writeable(spa))) { 2406236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2407236884Smm ENOTSUP)); 2408236884Smm } 2409260150Sdelphij 2410260150Sdelphij /* 2411260150Sdelphij * Load refcounts for ZFS features from disk into an in-memory 2412260150Sdelphij * cache during SPA initialization. 2413260150Sdelphij */ 2414260150Sdelphij for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2415260150Sdelphij uint64_t refcount; 2416260150Sdelphij 2417260150Sdelphij error = feature_get_refcount_from_disk(spa, 2418260150Sdelphij &spa_feature_table[i], &refcount); 2419260150Sdelphij if (error == 0) { 2420260150Sdelphij spa->spa_feat_refcount_cache[i] = refcount; 2421260150Sdelphij } else if (error == ENOTSUP) { 2422260150Sdelphij spa->spa_feat_refcount_cache[i] = 2423260150Sdelphij SPA_FEATURE_DISABLED; 2424260150Sdelphij } else { 2425260150Sdelphij return (spa_vdev_err(rvd, 2426260150Sdelphij VDEV_AUX_CORRUPT_DATA, EIO)); 2427260150Sdelphij } 2428260150Sdelphij } 2429236884Smm } 2430236884Smm 2431260150Sdelphij if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2432260150Sdelphij if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2433260150Sdelphij &spa->spa_feat_enabled_txg_obj) != 0) { 2434260150Sdelphij return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2435260150Sdelphij } 2436260150Sdelphij } 2437260150Sdelphij 2438236884Smm spa->spa_is_initializing = B_TRUE; 2439236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2440236884Smm spa->spa_is_initializing = B_FALSE; 2441236884Smm if (error != 0) 2442236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2443236884Smm 2444168404Spjd if (!mosconfig) { 2445168498Spjd uint64_t hostid; 2446219089Spjd nvlist_t *policy = NULL, *nvconfig; 2447168404Spjd 2448219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2449219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2450168404Spjd 2451219089Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2452185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2453168498Spjd char *hostname; 2454168498Spjd unsigned long myhostid = 0; 2455168498Spjd 2456219089Spjd VERIFY(nvlist_lookup_string(nvconfig, 2457168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2458168498Spjd 2459219089Spjd#ifdef _KERNEL 2460219089Spjd myhostid = zone_get_hostid(NULL); 2461219089Spjd#else /* _KERNEL */ 2462219089Spjd /* 2463219089Spjd * We're emulating the system's hostid in userland, so 2464219089Spjd * we can't use zone_get_hostid(). 2465219089Spjd */ 2466168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2467219089Spjd#endif /* _KERNEL */ 2468204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 2469219089Spjd hostid != myhostid) { 2470219089Spjd nvlist_free(nvconfig); 2471168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 2472168498Spjd "loaded as it was last accessed by " 2473185029Spjd "another system (host: %s hostid: 0x%lx). 
" 2474236146Smm "See: http://illumos.org/msg/ZFS-8000-EY", 2475185029Spjd spa_name(spa), hostname, 2476168498Spjd (unsigned long)hostid); 2477249195Smm return (SET_ERROR(EBADF)); 2478168498Spjd } 2479168498Spjd } 2480219089Spjd if (nvlist_lookup_nvlist(spa->spa_config, 2481219089Spjd ZPOOL_REWIND_POLICY, &policy) == 0) 2482219089Spjd VERIFY(nvlist_add_nvlist(nvconfig, 2483219089Spjd ZPOOL_REWIND_POLICY, policy) == 0); 2484168498Spjd 2485219089Spjd spa_config_set(spa, nvconfig); 2486168404Spjd spa_unload(spa); 2487168404Spjd spa_deactivate(spa); 2488209962Smm spa_activate(spa, orig_mode); 2489168404Spjd 2490219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2491168404Spjd } 2492168404Spjd 2493219089Spjd if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2494219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2495219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2496219089Spjd if (error != 0) 2497219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2498168404Spjd 2499168404Spjd /* 2500168404Spjd * Load the bit that tells us to use the new accounting function 2501168404Spjd * (raid-z deflation). If we have an older pool, this will not 2502168404Spjd * be present. 2503168404Spjd */ 2504219089Spjd error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2505219089Spjd if (error != 0 && error != ENOENT) 2506219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2507168404Spjd 2508219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2509219089Spjd &spa->spa_creation_version); 2510219089Spjd if (error != 0 && error != ENOENT) 2511219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2512219089Spjd 2513168404Spjd /* 2514168404Spjd * Load the persistent error log. If we have an older pool, this will 2515168404Spjd * not be present. 2516168404Spjd */ 2517219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2518219089Spjd if (error != 0 && error != ENOENT) 2519219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2520168404Spjd 2521219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2522219089Spjd &spa->spa_errlog_scrub); 2523219089Spjd if (error != 0 && error != ENOENT) 2524219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2525168404Spjd 2526168404Spjd /* 2527168404Spjd * Load the history object. If we have an older pool, this 2528168404Spjd * will not be present. 2529168404Spjd */ 2530219089Spjd error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2531219089Spjd if (error != 0 && error != ENOENT) 2532219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2533168404Spjd 2534168404Spjd /* 2535219089Spjd * If we're assembling the pool from the split-off vdevs of 2536219089Spjd * an existing pool, we don't want to attach the spares & cache 2537219089Spjd * devices. 2538219089Spjd */ 2539219089Spjd 2540219089Spjd /* 2541168404Spjd * Load any hot spares for this pool. 
2542168404Spjd */ 2543219089Spjd error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2544219089Spjd if (error != 0 && error != ENOENT) 2545219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2546219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2547185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2548185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 2549219089Spjd &spa->spa_spares.sav_config) != 0) 2550219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2551168404Spjd 2552185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2553168404Spjd spa_load_spares(spa); 2554185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2555219089Spjd } else if (error == 0) { 2556219089Spjd spa->spa_spares.sav_sync = B_TRUE; 2557168404Spjd } 2558168404Spjd 2559185029Spjd /* 2560185029Spjd * Load any level 2 ARC devices for this pool. 2561185029Spjd */ 2562219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2563185029Spjd &spa->spa_l2cache.sav_object); 2564219089Spjd if (error != 0 && error != ENOENT) 2565219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2566219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2567185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2568185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2569219089Spjd &spa->spa_l2cache.sav_config) != 0) 2570219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2571185029Spjd 2572185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2573185029Spjd spa_load_l2cache(spa); 2574185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2575219089Spjd } else if (error == 0) { 2576219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2577185029Spjd } 2578185029Spjd 2579219089Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2580213197Smm 2581219089Spjd error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2582219089Spjd if (error && error != ENOENT) 2583219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2584185029Spjd 2585219089Spjd if (error == 0) { 2586219089Spjd uint64_t autoreplace; 2587185029Spjd 2588219089Spjd spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2589219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2590219089Spjd spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2591219089Spjd spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2592219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2593219089Spjd spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2594219089Spjd &spa->spa_dedup_ditto); 2595185029Spjd 2596219089Spjd spa->spa_autoreplace = (autoreplace != 0); 2597168404Spjd } 2598168404Spjd 2599168404Spjd /* 2600185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 2601185029Spjd * the ZFS DE that it should not issue any faults for unopenable 2602185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 2603185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 2604185029Spjd * over. 2605185029Spjd */ 2606219089Spjd if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2607185029Spjd spa_check_removed(spa->spa_root_vdev); 2608219089Spjd /* 2609219089Spjd * For the import case, this is done in spa_import(), because 2610219089Spjd * at this point we're using the spare definitions from 2611219089Spjd * the MOS config, not necessarily from the userland config. 
2612219089Spjd */ 2613219089Spjd if (state != SPA_LOAD_IMPORT) { 2614219089Spjd spa_aux_check_removed(&spa->spa_spares); 2615219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 2616219089Spjd } 2617219089Spjd } 2618185029Spjd 2619185029Spjd /* 2620168404Spjd * Load the vdev state for all toplevel vdevs. 2621168404Spjd */ 2622168404Spjd vdev_load(rvd); 2623168404Spjd 2624168404Spjd /* 2625168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 2626168404Spjd */ 2627185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2628168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2629185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2630168404Spjd 2631168404Spjd /* 2632219089Spjd * Load the DDTs (dedup tables). 2633168404Spjd */ 2634219089Spjd error = ddt_load(spa); 2635219089Spjd if (error != 0) 2636219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2637219089Spjd 2638219089Spjd spa_update_dspace(spa); 2639219089Spjd 2640219089Spjd /* 2641219089Spjd * Validate the config, using the MOS config to fill in any 2642219089Spjd * information which might be missing. If we fail to validate 2643219089Spjd * the config then declare the pool unfit for use. If we're 2644219089Spjd * assembling a pool from a split, the log is not transferred 2645219089Spjd * over. 2646219089Spjd */ 2647219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2648219089Spjd nvlist_t *nvconfig; 2649219089Spjd 2650219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2651219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2652219089Spjd 2653219089Spjd if (!spa_config_valid(spa, nvconfig)) { 2654219089Spjd nvlist_free(nvconfig); 2655219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2656219089Spjd ENXIO)); 2657219089Spjd } 2658219089Spjd nvlist_free(nvconfig); 2659219089Spjd 2660219089Spjd /* 2661236884Smm * Now that we've validated the config, check the state of the 2662219089Spjd * root vdev. If it can't be opened, it indicates one or 2663219089Spjd * more toplevel vdevs are faulted. 2664219089Spjd */ 2665219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2666249195Smm return (SET_ERROR(ENXIO)); 2667219089Spjd 2668219089Spjd if (spa_check_logs(spa)) { 2669219089Spjd *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2670219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2671219089Spjd } 2672168404Spjd } 2673168404Spjd 2674236884Smm if (missing_feat_write) { 2675236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 2676236884Smm 2677236884Smm /* 2678236884Smm * At this point, we know that we can open the pool in 2679236884Smm * read-only mode but not read-write mode. We now have enough 2680236884Smm * information and can return to userland. 2681236884Smm */ 2682236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2683236884Smm } 2684236884Smm 2685219089Spjd /* 2686219089Spjd * We've successfully opened the pool, verify that we're ready 2687219089Spjd * to start pushing transactions. 
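 *
 * spa_load_verify(), defined earlier in this file, walks the pool
 * with traverse_pool() and counts unreadable metadata and data
 * blocks (the walk is skipped when the rewind policy requests
 * ZPOOL_NEVER_REWIND). In rough terms its verdict is:
 *
 *	verify_ok = (error == 0 &&
 *	    sle_meta_count <= policy.zrp_maxmeta &&
 *	    sle_data_count <= policy.zrp_maxdata);
 *
 * so a sufficiently permissive policy can accept a pool even though
 * some data blocks could not be read back.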
2688219089Spjd */ 2689219089Spjd if (state != SPA_LOAD_TRYIMPORT) { 2690219089Spjd if (error = spa_load_verify(spa)) 2691219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2692219089Spjd error)); 2693219089Spjd } 2694219089Spjd 2695219089Spjd if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2696219089Spjd spa->spa_load_max_txg == UINT64_MAX)) { 2697168404Spjd dmu_tx_t *tx; 2698168404Spjd int need_update = B_FALSE; 2699168404Spjd 2700209962Smm ASSERT(state != SPA_LOAD_TRYIMPORT); 2701209962Smm 2702168404Spjd /* 2703168404Spjd * Claim log blocks that haven't been committed yet. 2704168404Spjd * This must all happen in a single txg. 2705219089Spjd * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2706219089Spjd * invoked from zil_claim_log_block()'s i/o done callback. 2707219089Spjd * Price of rollback is that we abandon the log. 2708168404Spjd */ 2709219089Spjd spa->spa_claiming = B_TRUE; 2710219089Spjd 2711168404Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2712168404Spjd spa_first_txg(spa)); 2713185029Spjd (void) dmu_objset_find(spa_name(spa), 2714168404Spjd zil_claim, tx, DS_FIND_CHILDREN); 2715168404Spjd dmu_tx_commit(tx); 2716168404Spjd 2717219089Spjd spa->spa_claiming = B_FALSE; 2718219089Spjd 2719219089Spjd spa_set_log_state(spa, SPA_LOG_GOOD); 2720168404Spjd spa->spa_sync_on = B_TRUE; 2721168404Spjd txg_sync_start(spa->spa_dsl_pool); 2722168404Spjd 2723168404Spjd /* 2724219089Spjd * Wait for all claims to sync. We sync up to the highest 2725219089Spjd * claimed log block birth time so that claimed log blocks 2726219089Spjd * don't appear to be from the future. spa_claim_max_txg 2727219089Spjd * will have been set for us by either zil_check_log_chain() 2728219089Spjd * (invoked from spa_check_logs()) or zil_claim() above. 2729168404Spjd */ 2730219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2731168404Spjd 2732168404Spjd /* 2733168404Spjd * If the config cache is stale, or we have uninitialized 2734168404Spjd * metaslabs (see spa_vdev_add()), then update the config. 2735209962Smm * 2736219089Spjd * If this is a verbatim import, trust the current 2737209962Smm * in-core spa_config and update the disk labels. 2738168404Spjd */ 2739168404Spjd if (config_cache_txg != spa->spa_config_txg || 2740219089Spjd state == SPA_LOAD_IMPORT || 2741219089Spjd state == SPA_LOAD_RECOVER || 2742219089Spjd (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2743168404Spjd need_update = B_TRUE; 2744168404Spjd 2745209962Smm for (int c = 0; c < rvd->vdev_children; c++) 2746168404Spjd if (rvd->vdev_child[c]->vdev_ms_array == 0) 2747168404Spjd need_update = B_TRUE; 2748168404Spjd 2749168404Spjd /* 2750168404Spjd * Update the config cache asynchronously in case we're the 2751168404Spjd * root pool, in which case the config cache isn't writable yet. 2752168404Spjd */ 2753168404Spjd if (need_update) 2754168404Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2755208683Spjd 2756208683Spjd /* 2757208683Spjd * Check all DTLs to see if anything needs resilvering. 2758208683Spjd */ 2759219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2760219089Spjd vdev_resilver_needed(rvd, NULL, NULL)) 2761208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 2762219089Spjd 2763219089Spjd /* 2764248571Smm * Log the fact that we booted up (so that we can detect if 2765248571Smm * we rebooted in the middle of an operation). 2766248571Smm */ 2767248571Smm spa_history_log_version(spa, "open"); 2768248571Smm 2769248571Smm /* 2770219089Spjd * Delete any inconsistent datasets.
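 *
 * Like the zil_claim() pass above, this is a whole-pool dataset walk
 * through dmu_objset_find(); as a sketch, with my_cb and my_arg as
 * placeholder names, the callback shape is:
 *
 *	int my_cb(const char *dsname, void *my_arg);
 *
 *	(void) dmu_objset_find(spa_name(spa), my_cb, my_arg,
 *	    DS_FIND_CHILDREN);
 *
 * where DS_FIND_CHILDREN makes the walk recurse into child datasets.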
2771219089Spjd */ 2772219089Spjd (void) dmu_objset_find(spa_name(spa), 2773219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2774219089Spjd 2775219089Spjd /* 2776219089Spjd * Clean up any stale temporary dataset userrefs. 2777219089Spjd */ 2778219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2779168404Spjd } 2780168404Spjd 2781219089Spjd return (0); 2782219089Spjd} 2783168404Spjd 2784219089Spjdstatic int 2785219089Spjdspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2786219089Spjd{ 2787219089Spjd int mode = spa->spa_mode; 2788219089Spjd 2789219089Spjd spa_unload(spa); 2790219089Spjd spa_deactivate(spa); 2791219089Spjd 2792219089Spjd spa->spa_load_max_txg--; 2793219089Spjd 2794219089Spjd spa_activate(spa, mode); 2795219089Spjd spa_async_suspend(spa); 2796219089Spjd 2797219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2798168404Spjd} 2799168404Spjd 2800236884Smm/* 2801236884Smm * If spa_load() fails this function will try loading prior txg's. If 2802236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2803236884Smm * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2804236884Smm * function will not rewind the pool and will return the same error as 2805236884Smm * spa_load(). 2806236884Smm */ 2807219089Spjdstatic int 2808219089Spjdspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2809219089Spjd uint64_t max_request, int rewind_flags) 2810219089Spjd{ 2811236884Smm nvlist_t *loadinfo = NULL; 2812219089Spjd nvlist_t *config = NULL; 2813219089Spjd int load_error, rewind_error; 2814219089Spjd uint64_t safe_rewind_txg; 2815219089Spjd uint64_t min_txg; 2816219089Spjd 2817219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2818219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 2819219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2820219089Spjd } else { 2821219089Spjd spa->spa_load_max_txg = max_request; 2822219089Spjd } 2823219089Spjd 2824219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2825219089Spjd mosconfig); 2826219089Spjd if (load_error == 0) 2827219089Spjd return (0); 2828219089Spjd 2829219089Spjd if (spa->spa_root_vdev != NULL) 2830219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2831219089Spjd 2832219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2833219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2834219089Spjd 2835219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 2836219089Spjd nvlist_free(config); 2837219089Spjd return (load_error); 2838219089Spjd } 2839219089Spjd 2840236884Smm if (state == SPA_LOAD_RECOVER) { 2841236884Smm /* Price of rolling back is discarding txgs, including log */ 2842219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2843236884Smm } else { 2844236884Smm /* 2845236884Smm * If we aren't rolling back save the load info from our first 2846236884Smm * import attempt so that we can restore it after attempting 2847236884Smm * to rewind. 2848236884Smm */ 2849236884Smm loadinfo = spa->spa_load_info; 2850236884Smm spa->spa_load_info = fnvlist_alloc(); 2851236884Smm } 2852219089Spjd 2853219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2854219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2855219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
2856219089Spjd TXG_INITIAL : safe_rewind_txg; 2857219089Spjd 2858219089Spjd /* 2859219089Spjd * Continue as long as we're finding errors, we're still within 2860219089Spjd * the acceptable rewind range, and we're still finding uberblocks 2861219089Spjd */ 2862219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2863219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2864219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 2865219089Spjd spa->spa_extreme_rewind = B_TRUE; 2866219089Spjd rewind_error = spa_load_retry(spa, state, mosconfig); 2867219089Spjd } 2868219089Spjd 2869219089Spjd spa->spa_extreme_rewind = B_FALSE; 2870219089Spjd spa->spa_load_max_txg = UINT64_MAX; 2871219089Spjd 2872219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2873219089Spjd spa_config_set(spa, config); 2874219089Spjd 2875236884Smm if (state == SPA_LOAD_RECOVER) { 2876236884Smm ASSERT3P(loadinfo, ==, NULL); 2877236884Smm return (rewind_error); 2878236884Smm } else { 2879236884Smm /* Store the rewind info as part of the initial load info */ 2880236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2881236884Smm spa->spa_load_info); 2882236884Smm 2883236884Smm /* Restore the initial load info */ 2884236884Smm fnvlist_free(spa->spa_load_info); 2885236884Smm spa->spa_load_info = loadinfo; 2886236884Smm 2887236884Smm return (load_error); 2888236884Smm } 2889219089Spjd} 2890219089Spjd 2891168404Spjd/* 2892168404Spjd * Pool Open/Import 2893168404Spjd * 2894168404Spjd * The import case is identical to an open except that the configuration is sent 2895168404Spjd * down from userland, instead of grabbed from the configuration cache. For the 2896168404Spjd * case of an open, the pool configuration will exist in the 2897185029Spjd * POOL_STATE_UNINITIALIZED state. 2898168404Spjd * 2899168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 2900168404Spjd * the same time open the pool, without having to keep around the spa_t in some 2901168404Spjd * ambiguous state. 2902168404Spjd */ 2903168404Spjdstatic int 2904219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2905219089Spjd nvlist_t **config) 2906168404Spjd{ 2907168404Spjd spa_t *spa; 2908219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 2909168404Spjd int error; 2910168404Spjd int locked = B_FALSE; 2911219089Spjd int firstopen = B_FALSE; 2912168404Spjd 2913168404Spjd *spapp = NULL; 2914168404Spjd 2915168404Spjd /* 2916168404Spjd * As disgusting as this is, we need to support recursive calls to this 2917168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 2918168404Spjd * up calling spa_open() again. The real fix is to figure out how to 2919168404Spjd * avoid dsl_dir_open() calling this in the first place. 2920168404Spjd */ 2921168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 2922168404Spjd mutex_enter(&spa_namespace_lock); 2923168404Spjd locked = B_TRUE; 2924168404Spjd } 2925168404Spjd 2926168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 2927168404Spjd if (locked) 2928168404Spjd mutex_exit(&spa_namespace_lock); 2929249195Smm return (SET_ERROR(ENOENT)); 2930168404Spjd } 2931219089Spjd 2932168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2933219089Spjd zpool_rewind_policy_t policy; 2934168404Spjd 2935219089Spjd firstopen = B_TRUE; 2936219089Spjd 2937219089Spjd zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 2938219089Spjd &policy); 2939219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 2940219089Spjd state = SPA_LOAD_RECOVER; 2941219089Spjd 2942209962Smm spa_activate(spa, spa_mode_global); 2943168404Spjd 2944219089Spjd if (state != SPA_LOAD_RECOVER) 2945219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2946168404Spjd 2947219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2948219089Spjd policy.zrp_request); 2949219089Spjd 2950168404Spjd if (error == EBADF) { 2951168404Spjd /* 2952168404Spjd * If vdev_validate() returns failure (indicated by 2953168404Spjd * EBADF), it indicates that one of the vdevs indicates 2954168404Spjd * that the pool has been exported or destroyed. If 2955168404Spjd * this is the case, the config cache is out of sync and 2956168404Spjd * we should remove the pool from the namespace. 2957168404Spjd */ 2958168404Spjd spa_unload(spa); 2959168404Spjd spa_deactivate(spa); 2960185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE); 2961168404Spjd spa_remove(spa); 2962168404Spjd if (locked) 2963168404Spjd mutex_exit(&spa_namespace_lock); 2964249195Smm return (SET_ERROR(ENOENT)); 2965168404Spjd } 2966168404Spjd 2967168404Spjd if (error) { 2968168404Spjd /* 2969168404Spjd * We can't open the pool, but we still have useful 2970168404Spjd * information: the state of each vdev after the 2971168404Spjd * attempted vdev_open(). Return this to the user. 2972168404Spjd */ 2973219089Spjd if (config != NULL && spa->spa_config) { 2974219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 2975219089Spjd KM_SLEEP) == 0); 2976219089Spjd VERIFY(nvlist_add_nvlist(*config, 2977219089Spjd ZPOOL_CONFIG_LOAD_INFO, 2978219089Spjd spa->spa_load_info) == 0); 2979219089Spjd } 2980168404Spjd spa_unload(spa); 2981168404Spjd spa_deactivate(spa); 2982219089Spjd spa->spa_last_open_failed = error; 2983168404Spjd if (locked) 2984168404Spjd mutex_exit(&spa_namespace_lock); 2985168404Spjd *spapp = NULL; 2986168404Spjd return (error); 2987168404Spjd } 2988168404Spjd } 2989168404Spjd 2990168404Spjd spa_open_ref(spa, tag); 2991185029Spjd 2992219089Spjd if (config != NULL) 2993219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2994219089Spjd 2995219089Spjd /* 2996219089Spjd * If we've recovered the pool, pass back any information we 2997219089Spjd * gathered while doing the load. 
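	 * That information is attached to the returned config as the
	 * ZPOOL_CONFIG_LOAD_INFO nvlist.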
2998219089Spjd */ 2999219089Spjd if (state == SPA_LOAD_RECOVER) { 3000219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3001219089Spjd spa->spa_load_info) == 0); 3002219089Spjd } 3003219089Spjd 3004219089Spjd if (locked) { 3005219089Spjd spa->spa_last_open_failed = 0; 3006219089Spjd spa->spa_last_ubsync_txg = 0; 3007219089Spjd spa->spa_load_txg = 0; 3008168404Spjd mutex_exit(&spa_namespace_lock); 3009219089Spjd#ifdef __FreeBSD__ 3010219089Spjd#ifdef _KERNEL 3011219089Spjd if (firstopen) 3012249047Savg zvol_create_minors(spa->spa_name); 3013219089Spjd#endif 3014219089Spjd#endif 3015219089Spjd } 3016168404Spjd 3017168404Spjd *spapp = spa; 3018168404Spjd 3019168404Spjd return (0); 3020168404Spjd} 3021168404Spjd 3022168404Spjdint 3023219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3024219089Spjd nvlist_t **config) 3025219089Spjd{ 3026219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 3027219089Spjd} 3028219089Spjd 3029219089Spjdint 3030168404Spjdspa_open(const char *name, spa_t **spapp, void *tag) 3031168404Spjd{ 3032219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 3033168404Spjd} 3034168404Spjd 3035168404Spjd/* 3036168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 3037168404Spjd * preventing it from being exported or destroyed. 3038168404Spjd */ 3039168404Spjdspa_t * 3040168404Spjdspa_inject_addref(char *name) 3041168404Spjd{ 3042168404Spjd spa_t *spa; 3043168404Spjd 3044168404Spjd mutex_enter(&spa_namespace_lock); 3045168404Spjd if ((spa = spa_lookup(name)) == NULL) { 3046168404Spjd mutex_exit(&spa_namespace_lock); 3047168404Spjd return (NULL); 3048168404Spjd } 3049168404Spjd spa->spa_inject_ref++; 3050168404Spjd mutex_exit(&spa_namespace_lock); 3051168404Spjd 3052168404Spjd return (spa); 3053168404Spjd} 3054168404Spjd 3055168404Spjdvoid 3056168404Spjdspa_inject_delref(spa_t *spa) 3057168404Spjd{ 3058168404Spjd mutex_enter(&spa_namespace_lock); 3059168404Spjd spa->spa_inject_ref--; 3060168404Spjd mutex_exit(&spa_namespace_lock); 3061168404Spjd} 3062168404Spjd 3063185029Spjd/* 3064185029Spjd * Add spares device information to the nvlist. 3065185029Spjd */ 3066168404Spjdstatic void 3067168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 3068168404Spjd{ 3069168404Spjd nvlist_t **spares; 3070168404Spjd uint_t i, nspares; 3071168404Spjd nvlist_t *nvroot; 3072168404Spjd uint64_t guid; 3073168404Spjd vdev_stat_t *vs; 3074168404Spjd uint_t vsc; 3075168404Spjd uint64_t pool; 3076168404Spjd 3077209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3078209962Smm 3079185029Spjd if (spa->spa_spares.sav_count == 0) 3080168404Spjd return; 3081168404Spjd 3082168404Spjd VERIFY(nvlist_lookup_nvlist(config, 3083168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3084185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3085168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3086168404Spjd if (nspares != 0) { 3087168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3088168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3089168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3090168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3091168404Spjd 3092168404Spjd /* 3093168404Spjd * Go through and find any spares which have since been 3094168404Spjd * repurposed as an active spare. If this is the case, update 3095168404Spjd * their status appropriately. 
3096168404Spjd */ 3097168404Spjd for (i = 0; i < nspares; i++) { 3098168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 3099168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3100185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 3101185029Spjd pool != 0ULL) { 3102168404Spjd VERIFY(nvlist_lookup_uint64_array( 3103219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 3104168404Spjd (uint64_t **)&vs, &vsc) == 0); 3105168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 3106168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 3107168404Spjd } 3108168404Spjd } 3109168404Spjd } 3110168404Spjd} 3111168404Spjd 3112185029Spjd/* 3113185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 3114185029Spjd */ 3115185029Spjdstatic void 3116185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 3117185029Spjd{ 3118185029Spjd nvlist_t **l2cache; 3119185029Spjd uint_t i, j, nl2cache; 3120185029Spjd nvlist_t *nvroot; 3121185029Spjd uint64_t guid; 3122185029Spjd vdev_t *vd; 3123185029Spjd vdev_stat_t *vs; 3124185029Spjd uint_t vsc; 3125185029Spjd 3126209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3127209962Smm 3128185029Spjd if (spa->spa_l2cache.sav_count == 0) 3129185029Spjd return; 3130185029Spjd 3131185029Spjd VERIFY(nvlist_lookup_nvlist(config, 3132185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3133185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3134185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3135185029Spjd if (nl2cache != 0) { 3136185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3137185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3138185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3139185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3140185029Spjd 3141185029Spjd /* 3142185029Spjd * Update level 2 cache device stats. 3143185029Spjd */ 3144185029Spjd 3145185029Spjd for (i = 0; i < nl2cache; i++) { 3146185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 3147185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3148185029Spjd 3149185029Spjd vd = NULL; 3150185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3151185029Spjd if (guid == 3152185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3153185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 3154185029Spjd break; 3155185029Spjd } 3156185029Spjd } 3157185029Spjd ASSERT(vd != NULL); 3158185029Spjd 3159185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3160219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3161219089Spjd == 0); 3162185029Spjd vdev_get_stats(vd, vs); 3163185029Spjd } 3164185029Spjd } 3165185029Spjd} 3166185029Spjd 3167236884Smmstatic void 3168236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 3169236884Smm{ 3170236884Smm nvlist_t *features; 3171236884Smm zap_cursor_t zc; 3172236884Smm zap_attribute_t za; 3173236884Smm 3174236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3175236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3176236884Smm 3177253993Smav /* We may be unable to read features if pool is suspended. 
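	 * In that case simply attach an empty feature nvlist (see the "out"
	 * label below).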
*/ 3178253993Smav if (spa_suspended(spa)) 3179253993Smav goto out; 3180253993Smav 3181236884Smm if (spa->spa_feat_for_read_obj != 0) { 3182236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3183236884Smm spa->spa_feat_for_read_obj); 3184236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3185236884Smm zap_cursor_advance(&zc)) { 3186236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3187236884Smm za.za_num_integers == 1); 3188236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3189236884Smm za.za_first_integer)); 3190236884Smm } 3191236884Smm zap_cursor_fini(&zc); 3192236884Smm } 3193236884Smm 3194236884Smm if (spa->spa_feat_for_write_obj != 0) { 3195236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3196236884Smm spa->spa_feat_for_write_obj); 3197236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3198236884Smm zap_cursor_advance(&zc)) { 3199236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3200236884Smm za.za_num_integers == 1); 3201236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3202236884Smm za.za_first_integer)); 3203236884Smm } 3204236884Smm zap_cursor_fini(&zc); 3205236884Smm } 3206236884Smm 3207253993Smavout: 3208236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3209236884Smm features) == 0); 3210236884Smm nvlist_free(features); 3211236884Smm} 3212236884Smm 3213168404Spjdint 3214236884Smmspa_get_stats(const char *name, nvlist_t **config, 3215236884Smm char *altroot, size_t buflen) 3216168404Spjd{ 3217168404Spjd int error; 3218168404Spjd spa_t *spa; 3219168404Spjd 3220168404Spjd *config = NULL; 3221219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 3222168404Spjd 3223209962Smm if (spa != NULL) { 3224209962Smm /* 3225209962Smm * This still leaves a window of inconsistency where the spares 3226209962Smm * or l2cache devices could change and the config would be 3227209962Smm * self-inconsistent. 3228209962Smm */ 3229209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3230168404Spjd 3231209962Smm if (*config != NULL) { 3232219089Spjd uint64_t loadtimes[2]; 3233219089Spjd 3234219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3235219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3236219089Spjd VERIFY(nvlist_add_uint64_array(*config, 3237219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3238219089Spjd 3239185029Spjd VERIFY(nvlist_add_uint64(*config, 3240209962Smm ZPOOL_CONFIG_ERRCOUNT, 3241209962Smm spa_get_errlog_size(spa)) == 0); 3242185029Spjd 3243209962Smm if (spa_suspended(spa)) 3244209962Smm VERIFY(nvlist_add_uint64(*config, 3245209962Smm ZPOOL_CONFIG_SUSPENDED, 3246209962Smm spa->spa_failmode) == 0); 3247209962Smm 3248209962Smm spa_add_spares(spa, *config); 3249209962Smm spa_add_l2cache(spa, *config); 3250236884Smm spa_add_feature_stats(spa, *config); 3251209962Smm } 3252168404Spjd } 3253168404Spjd 3254168404Spjd /* 3255168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 3256168404Spjd * and call spa_lookup() directly. 
3257168404Spjd */ 3258168404Spjd if (altroot) { 3259168404Spjd if (spa == NULL) { 3260168404Spjd mutex_enter(&spa_namespace_lock); 3261168404Spjd spa = spa_lookup(name); 3262168404Spjd if (spa) 3263168404Spjd spa_altroot(spa, altroot, buflen); 3264168404Spjd else 3265168404Spjd altroot[0] = '\0'; 3266168404Spjd spa = NULL; 3267168404Spjd mutex_exit(&spa_namespace_lock); 3268168404Spjd } else { 3269168404Spjd spa_altroot(spa, altroot, buflen); 3270168404Spjd } 3271168404Spjd } 3272168404Spjd 3273209962Smm if (spa != NULL) { 3274209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 3275168404Spjd spa_close(spa, FTAG); 3276209962Smm } 3277168404Spjd 3278168404Spjd return (error); 3279168404Spjd} 3280168404Spjd 3281168404Spjd/* 3282185029Spjd * Validate that the auxiliary device array is well formed. We must have an 3283185029Spjd * array of nvlists, each which describes a valid leaf vdev. If this is an 3284185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3285185029Spjd * specified, as long as they are well-formed. 3286168404Spjd */ 3287168404Spjdstatic int 3288185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3289185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 3290185029Spjd vdev_labeltype_t label) 3291168404Spjd{ 3292185029Spjd nvlist_t **dev; 3293185029Spjd uint_t i, ndev; 3294168404Spjd vdev_t *vd; 3295168404Spjd int error; 3296168404Spjd 3297185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3298185029Spjd 3299168404Spjd /* 3300185029Spjd * It's acceptable to have no devs specified. 3301168404Spjd */ 3302185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3303168404Spjd return (0); 3304168404Spjd 3305185029Spjd if (ndev == 0) 3306249195Smm return (SET_ERROR(EINVAL)); 3307168404Spjd 3308168404Spjd /* 3309185029Spjd * Make sure the pool is formatted with a version that supports this 3310185029Spjd * device type. 3311168404Spjd */ 3312185029Spjd if (spa_version(spa) < version) 3313249195Smm return (SET_ERROR(ENOTSUP)); 3314168404Spjd 3315168404Spjd /* 3316185029Spjd * Set the pending device list so we correctly handle device in-use 3317168404Spjd * checking. 3318168404Spjd */ 3319185029Spjd sav->sav_pending = dev; 3320185029Spjd sav->sav_npending = ndev; 3321168404Spjd 3322185029Spjd for (i = 0; i < ndev; i++) { 3323185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3324168404Spjd mode)) != 0) 3325168404Spjd goto out; 3326168404Spjd 3327168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 3328168404Spjd vdev_free(vd); 3329249195Smm error = SET_ERROR(EINVAL); 3330168404Spjd goto out; 3331168404Spjd } 3332168404Spjd 3333185029Spjd /* 3334185029Spjd * The L2ARC currently only supports disk devices in 3335185029Spjd * kernel context. For user-level testing, we allow it. 
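		 * Other device types are rejected with ENOTBLK below.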
3336185029Spjd */ 3337185029Spjd#ifdef _KERNEL 3338185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3339185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3340249195Smm error = SET_ERROR(ENOTBLK); 3341230514Smm vdev_free(vd); 3342185029Spjd goto out; 3343185029Spjd } 3344185029Spjd#endif 3345168404Spjd vd->vdev_top = vd; 3346168404Spjd 3347168404Spjd if ((error = vdev_open(vd)) == 0 && 3348185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 3349185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3350168404Spjd vd->vdev_guid) == 0); 3351168404Spjd } 3352168404Spjd 3353168404Spjd vdev_free(vd); 3354168404Spjd 3355185029Spjd if (error && 3356185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3357168404Spjd goto out; 3358168404Spjd else 3359168404Spjd error = 0; 3360168404Spjd } 3361168404Spjd 3362168404Spjdout: 3363185029Spjd sav->sav_pending = NULL; 3364185029Spjd sav->sav_npending = 0; 3365168404Spjd return (error); 3366168404Spjd} 3367168404Spjd 3368185029Spjdstatic int 3369185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3370185029Spjd{ 3371185029Spjd int error; 3372185029Spjd 3373185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3374185029Spjd 3375185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3376185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3377185029Spjd VDEV_LABEL_SPARE)) != 0) { 3378185029Spjd return (error); 3379185029Spjd } 3380185029Spjd 3381185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3382185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3383185029Spjd VDEV_LABEL_L2CACHE)); 3384185029Spjd} 3385185029Spjd 3386185029Spjdstatic void 3387185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3388185029Spjd const char *config) 3389185029Spjd{ 3390185029Spjd int i; 3391185029Spjd 3392185029Spjd if (sav->sav_config != NULL) { 3393185029Spjd nvlist_t **olddevs; 3394185029Spjd uint_t oldndevs; 3395185029Spjd nvlist_t **newdevs; 3396185029Spjd 3397185029Spjd /* 3398185029Spjd * Generate new dev list by concatentating with the 3399185029Spjd * current dev list. 3400185029Spjd */ 3401185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3402185029Spjd &olddevs, &oldndevs) == 0); 3403185029Spjd 3404185029Spjd newdevs = kmem_alloc(sizeof (void *) * 3405185029Spjd (ndevs + oldndevs), KM_SLEEP); 3406185029Spjd for (i = 0; i < oldndevs; i++) 3407185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3408185029Spjd KM_SLEEP) == 0); 3409185029Spjd for (i = 0; i < ndevs; i++) 3410185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3411185029Spjd KM_SLEEP) == 0); 3412185029Spjd 3413185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 3414185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 3415185029Spjd 3416185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3417185029Spjd config, newdevs, ndevs + oldndevs) == 0); 3418185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 3419185029Spjd nvlist_free(newdevs[i]); 3420185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3421185029Spjd } else { 3422185029Spjd /* 3423185029Spjd * Generate a new dev list. 
3424185029Spjd */ 3425185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3426185029Spjd KM_SLEEP) == 0); 3427185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3428185029Spjd devs, ndevs) == 0); 3429185029Spjd } 3430185029Spjd} 3431185029Spjd 3432168404Spjd/* 3433185029Spjd * Stop and drop level 2 ARC devices 3434185029Spjd */ 3435185029Spjdvoid 3436185029Spjdspa_l2cache_drop(spa_t *spa) 3437185029Spjd{ 3438185029Spjd vdev_t *vd; 3439185029Spjd int i; 3440185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 3441185029Spjd 3442185029Spjd for (i = 0; i < sav->sav_count; i++) { 3443185029Spjd uint64_t pool; 3444185029Spjd 3445185029Spjd vd = sav->sav_vdevs[i]; 3446185029Spjd ASSERT(vd != NULL); 3447185029Spjd 3448209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3449209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 3450185029Spjd l2arc_remove_vdev(vd); 3451185029Spjd } 3452185029Spjd} 3453185029Spjd 3454185029Spjd/* 3455168404Spjd * Pool Creation 3456168404Spjd */ 3457168404Spjdint 3458185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3459248571Smm nvlist_t *zplprops) 3460168404Spjd{ 3461168404Spjd spa_t *spa; 3462185029Spjd char *altroot = NULL; 3463168404Spjd vdev_t *rvd; 3464168404Spjd dsl_pool_t *dp; 3465168404Spjd dmu_tx_t *tx; 3466219089Spjd int error = 0; 3467168404Spjd uint64_t txg = TXG_INITIAL; 3468185029Spjd nvlist_t **spares, **l2cache; 3469185029Spjd uint_t nspares, nl2cache; 3470219089Spjd uint64_t version, obj; 3471236884Smm boolean_t has_features; 3472168404Spjd 3473168404Spjd /* 3474168404Spjd * If this pool already exists, return failure. 3475168404Spjd */ 3476168404Spjd mutex_enter(&spa_namespace_lock); 3477168404Spjd if (spa_lookup(pool) != NULL) { 3478168404Spjd mutex_exit(&spa_namespace_lock); 3479249195Smm return (SET_ERROR(EEXIST)); 3480168404Spjd } 3481168404Spjd 3482168404Spjd /* 3483168404Spjd * Allocate a new spa_t structure. 3484168404Spjd */ 3485185029Spjd (void) nvlist_lookup_string(props, 3486185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3487219089Spjd spa = spa_add(pool, NULL, altroot); 3488209962Smm spa_activate(spa, spa_mode_global); 3489168404Spjd 3490185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 3491185029Spjd spa_deactivate(spa); 3492185029Spjd spa_remove(spa); 3493185029Spjd mutex_exit(&spa_namespace_lock); 3494185029Spjd return (error); 3495185029Spjd } 3496185029Spjd 3497236884Smm has_features = B_FALSE; 3498236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3499236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3500236884Smm if (zpool_prop_feature(nvpair_name(elem))) 3501236884Smm has_features = B_TRUE; 3502236884Smm } 3503236884Smm 3504236884Smm if (has_features || nvlist_lookup_uint64(props, 3505236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3506185029Spjd version = SPA_VERSION; 3507236884Smm } 3508236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3509219089Spjd 3510219089Spjd spa->spa_first_txg = txg; 3511219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 3512185029Spjd spa->spa_uberblock.ub_version = version; 3513168404Spjd spa->spa_ubsync = spa->spa_uberblock; 3514168404Spjd 3515168404Spjd /* 3516209962Smm * Create "The Godfather" zio to hold all async IOs 3517209962Smm */ 3518209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 3519209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 3520209962Smm 3521209962Smm /* 3522168404Spjd * Create the root vdev. 
3523168404Spjd */ 3524185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3525168404Spjd 3526168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3527168404Spjd 3528168404Spjd ASSERT(error != 0 || rvd != NULL); 3529168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3530168404Spjd 3531185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 3532249195Smm error = SET_ERROR(EINVAL); 3533168404Spjd 3534168404Spjd if (error == 0 && 3535168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3536185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 3537168404Spjd VDEV_ALLOC_ADD)) == 0) { 3538219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 3539254591Sgibbs vdev_ashift_optimize(rvd->vdev_child[c]); 3540219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 3541219089Spjd vdev_expand(rvd->vdev_child[c], txg); 3542219089Spjd } 3543168404Spjd } 3544168404Spjd 3545185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3546168404Spjd 3547168404Spjd if (error != 0) { 3548168404Spjd spa_unload(spa); 3549168404Spjd spa_deactivate(spa); 3550168404Spjd spa_remove(spa); 3551168404Spjd mutex_exit(&spa_namespace_lock); 3552168404Spjd return (error); 3553168404Spjd } 3554168404Spjd 3555168404Spjd /* 3556168404Spjd * Get the list of spares, if specified. 3557168404Spjd */ 3558168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3559168404Spjd &spares, &nspares) == 0) { 3560185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3561168404Spjd KM_SLEEP) == 0); 3562185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3563168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3564185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3565168404Spjd spa_load_spares(spa); 3566185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3567185029Spjd spa->spa_spares.sav_sync = B_TRUE; 3568168404Spjd } 3569168404Spjd 3570185029Spjd /* 3571185029Spjd * Get the list of level 2 cache devices, if specified. 3572185029Spjd */ 3573185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3574185029Spjd &l2cache, &nl2cache) == 0) { 3575185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3576185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 3577185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3578185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3579185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3580185029Spjd spa_load_l2cache(spa); 3581185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3582185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3583185029Spjd } 3584185029Spjd 3585236884Smm spa->spa_is_initializing = B_TRUE; 3586185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3587168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 3588236884Smm spa->spa_is_initializing = B_FALSE; 3589168404Spjd 3590219089Spjd /* 3591219089Spjd * Create DDTs (dedup tables). 3592219089Spjd */ 3593219089Spjd ddt_create(spa); 3594219089Spjd 3595219089Spjd spa_update_dspace(spa); 3596219089Spjd 3597168404Spjd tx = dmu_tx_create_assigned(dp, txg); 3598168404Spjd 3599168404Spjd /* 3600168404Spjd * Create the pool config object. 
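	 * It is a packed nvlist stored in the MOS and referenced from the
	 * pool directory object under DMU_POOL_CONFIG.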
3601168404Spjd */ 3602168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3603185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3604168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3605168404Spjd 3606168404Spjd if (zap_add(spa->spa_meta_objset, 3607168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3608168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3609168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 3610168404Spjd } 3611168404Spjd 3612236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 3613236884Smm spa_feature_create_zap_objects(spa, tx); 3614236884Smm 3615219089Spjd if (zap_add(spa->spa_meta_objset, 3616219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3617219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 3618219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 3619219089Spjd } 3620219089Spjd 3621185029Spjd /* Newly created pools with the right version are always deflated. */ 3622185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3623185029Spjd spa->spa_deflate = TRUE; 3624185029Spjd if (zap_add(spa->spa_meta_objset, 3625185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3626185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3627185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 3628185029Spjd } 3629168404Spjd } 3630168404Spjd 3631168404Spjd /* 3632219089Spjd * Create the deferred-free bpobj. Turn off compression 3633168404Spjd * because sync-to-convergence takes longer if the blocksize 3634168404Spjd * keeps changing. 3635168404Spjd */ 3636219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3637219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj, 3638168404Spjd ZIO_COMPRESS_OFF, tx); 3639168404Spjd if (zap_add(spa->spa_meta_objset, 3640219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3641219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) { 3642219089Spjd cmn_err(CE_PANIC, "failed to add bpobj"); 3643168404Spjd } 3644219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3645219089Spjd spa->spa_meta_objset, obj)); 3646168404Spjd 3647168404Spjd /* 3648168404Spjd * Create the pool's history object. 3649168404Spjd */ 3650185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 3651185029Spjd spa_history_create_obj(spa, tx); 3652168404Spjd 3653185029Spjd /* 3654185029Spjd * Set pool properties. 3655185029Spjd */ 3656185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3657185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3658185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3659219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3660219089Spjd 3661209962Smm if (props != NULL) { 3662209962Smm spa_configfile_set(spa, props, B_FALSE); 3663248571Smm spa_sync_props(props, tx); 3664209962Smm } 3665185029Spjd 3666168404Spjd dmu_tx_commit(tx); 3667168404Spjd 3668168404Spjd spa->spa_sync_on = B_TRUE; 3669168404Spjd txg_sync_start(spa->spa_dsl_pool); 3670168404Spjd 3671168404Spjd /* 3672168404Spjd * We explicitly wait for the first transaction to complete so that our 3673168404Spjd * bean counters are appropriately updated. 
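	 * Only after that sync completes do we write the new pool out to
	 * the cache file.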
3674168404Spjd */ 3675168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 3676168404Spjd 3677185029Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 3678168404Spjd 3679248571Smm spa_history_log_version(spa, "create"); 3680185029Spjd 3681208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 3682208442Smm 3683168404Spjd mutex_exit(&spa_namespace_lock); 3684168404Spjd 3685168404Spjd return (0); 3686168404Spjd} 3687168404Spjd 3688241286Savg#ifdef _KERNEL 3689219089Spjd#if defined(sun) 3690185029Spjd/* 3691219089Spjd * Get the root pool information from the root disk, then import the root pool 3692219089Spjd * during the system boot up time. 3693185029Spjd */ 3694219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3695219089Spjd 3696219089Spjdstatic nvlist_t * 3697219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3698185029Spjd{ 3699219089Spjd nvlist_t *config; 3700185029Spjd nvlist_t *nvtop, *nvroot; 3701185029Spjd uint64_t pgid; 3702185029Spjd 3703219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3704219089Spjd return (NULL); 3705219089Spjd 3706168404Spjd /* 3707185029Spjd * Add this top-level vdev to the child array. 3708168404Spjd */ 3709219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3710219089Spjd &nvtop) == 0); 3711219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3712219089Spjd &pgid) == 0); 3713219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3714168404Spjd 3715185029Spjd /* 3716185029Spjd * Put this pool's top-level vdevs into a root vdev. 3717185029Spjd */ 3718185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3719219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3720219089Spjd VDEV_TYPE_ROOT) == 0); 3721185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3722185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3723185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3724185029Spjd &nvtop, 1) == 0); 3725168404Spjd 3726168404Spjd /* 3727185029Spjd * Replace the existing vdev_tree with the new root vdev in 3728185029Spjd * this pool's configuration (remove the old, add the new). 3729168404Spjd */ 3730185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3731185029Spjd nvlist_free(nvroot); 3732219089Spjd return (config); 3733185029Spjd} 3734168404Spjd 3735185029Spjd/* 3736219089Spjd * Walk the vdev tree and see if we can find a device with "better" 3737219089Spjd * configuration. A configuration is "better" if the label on that 3738219089Spjd * device has a more recent txg. 3739185029Spjd */ 3740219089Spjdstatic void 3741219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3742185029Spjd{ 3743219089Spjd for (int c = 0; c < vd->vdev_children; c++) 3744219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3745185029Spjd 3746219089Spjd if (vd->vdev_ops->vdev_op_leaf) { 3747219089Spjd nvlist_t *label; 3748219089Spjd uint64_t label_txg; 3749185029Spjd 3750219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3751219089Spjd &label) != 0) 3752219089Spjd return; 3753185029Spjd 3754219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3755219089Spjd &label_txg) == 0); 3756168404Spjd 3757219089Spjd /* 3758219089Spjd * Do we have a better boot device? 
3759219089Spjd */ 3760219089Spjd if (label_txg > *txg) { 3761219089Spjd *txg = label_txg; 3762219089Spjd *avd = vd; 3763185029Spjd } 3764219089Spjd nvlist_free(label); 3765185029Spjd } 3766185029Spjd} 3767185029Spjd 3768185029Spjd/* 3769185029Spjd * Import a root pool. 3770185029Spjd * 3771185029Spjd * For x86. devpath_list will consist of devid and/or physpath name of 3772185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3773185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 3774185029Spjd * 3775185029Spjd * For Sparc, devpath_list consists the physpath name of the booting device 3776185029Spjd * no matter the rootpool is a single device pool or a mirrored pool. 3777185029Spjd * e.g. 3778185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 3779185029Spjd */ 3780185029Spjdint 3781185029Spjdspa_import_rootpool(char *devpath, char *devid) 3782185029Spjd{ 3783219089Spjd spa_t *spa; 3784219089Spjd vdev_t *rvd, *bvd, *avd = NULL; 3785219089Spjd nvlist_t *config, *nvtop; 3786219089Spjd uint64_t guid, txg; 3787185029Spjd char *pname; 3788185029Spjd int error; 3789185029Spjd 3790185029Spjd /* 3791219089Spjd * Read the label from the boot device and generate a configuration. 3792185029Spjd */ 3793219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3794219089Spjd#if defined(_OBP) && defined(_KERNEL) 3795219089Spjd if (config == NULL) { 3796219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) { 3797219089Spjd /* iscsi boot */ 3798219089Spjd get_iscsi_bootpath_phy(devpath); 3799219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3800219089Spjd } 3801219089Spjd } 3802219089Spjd#endif 3803219089Spjd if (config == NULL) { 3804236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3805219089Spjd devpath); 3806249195Smm return (SET_ERROR(EIO)); 3807219089Spjd } 3808185029Spjd 3809219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3810219089Spjd &pname) == 0); 3811219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3812185029Spjd 3813209962Smm mutex_enter(&spa_namespace_lock); 3814209962Smm if ((spa = spa_lookup(pname)) != NULL) { 3815209962Smm /* 3816209962Smm * Remove the existing root pool from the namespace so that we 3817209962Smm * can replace it with the correct config we just read in. 3818209962Smm */ 3819209962Smm spa_remove(spa); 3820209962Smm } 3821185029Spjd 3822219089Spjd spa = spa_add(pname, config, NULL); 3823209962Smm spa->spa_is_root = B_TRUE; 3824219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3825209962Smm 3826219089Spjd /* 3827219089Spjd * Build up a vdev tree based on the boot device's label config. 3828219089Spjd */ 3829219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3830219089Spjd &nvtop) == 0); 3831219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3832219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3833219089Spjd VDEV_ALLOC_ROOTPOOL); 3834219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3835219089Spjd if (error) { 3836209962Smm mutex_exit(&spa_namespace_lock); 3837219089Spjd nvlist_free(config); 3838219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3839219089Spjd pname); 3840219089Spjd return (error); 3841209962Smm } 3842209962Smm 3843219089Spjd /* 3844219089Spjd * Get the boot vdev. 
3845219089Spjd */ 3846219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3847219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3848219089Spjd (u_longlong_t)guid); 3849249195Smm error = SET_ERROR(ENOENT); 3850219089Spjd goto out; 3851219089Spjd } 3852209962Smm 3853219089Spjd /* 3854219089Spjd * Determine if there is a better boot device. 3855219089Spjd */ 3856219089Spjd avd = bvd; 3857219089Spjd spa_alt_rootvdev(rvd, &avd, &txg); 3858219089Spjd if (avd != bvd) { 3859219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3860219089Spjd "try booting from '%s'", avd->vdev_path); 3861249195Smm error = SET_ERROR(EINVAL); 3862219089Spjd goto out; 3863219089Spjd } 3864209962Smm 3865219089Spjd /* 3866219089Spjd * If the boot device is part of a spare vdev then ensure that 3867219089Spjd * we're booting off the active spare. 3868219089Spjd */ 3869219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3870219089Spjd !bvd->vdev_isspare) { 3871219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. Please " 3872219089Spjd "try booting from '%s'", 3873219089Spjd bvd->vdev_parent-> 3874219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3875249195Smm error = SET_ERROR(EINVAL); 3876219089Spjd goto out; 3877219089Spjd } 3878209962Smm 3879219089Spjd error = 0; 3880219089Spjdout: 3881219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3882219089Spjd vdev_free(rvd); 3883219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3884209962Smm mutex_exit(&spa_namespace_lock); 3885209962Smm 3886219089Spjd nvlist_free(config); 3887219089Spjd return (error); 3888185029Spjd} 3889185029Spjd 3890241286Savg#else 3891241286Savg 3892243502Savgextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 3893243502Savg uint64_t *count); 3894241286Savg 3895241286Savgstatic nvlist_t * 3896241286Savgspa_generate_rootconf(const char *name) 3897241286Savg{ 3898243502Savg nvlist_t **configs, **tops; 3899241286Savg nvlist_t *config; 3900243502Savg nvlist_t *best_cfg, *nvtop, *nvroot; 3901243502Savg uint64_t *holes; 3902243502Savg uint64_t best_txg; 3903243213Savg uint64_t nchildren; 3904241286Savg uint64_t pgid; 3905243502Savg uint64_t count; 3906243502Savg uint64_t i; 3907243502Savg uint_t nholes; 3908241286Savg 3909243502Savg if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 3910241286Savg return (NULL); 3911241286Savg 3912243502Savg ASSERT3U(count, !=, 0); 3913243502Savg best_txg = 0; 3914243502Savg for (i = 0; i < count; i++) { 3915243502Savg uint64_t txg; 3916243502Savg 3917243502Savg VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 3918243502Savg &txg) == 0); 3919243502Savg if (txg > best_txg) { 3920243502Savg best_txg = txg; 3921243502Savg best_cfg = configs[i]; 3922243502Savg } 3923243502Savg } 3924243502Savg 3925245945Savg nchildren = 1; 3926245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 3927243502Savg holes = NULL; 3928243502Savg nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 3929243502Savg &holes, &nholes); 3930243502Savg 3931244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 3932243502Savg for (i = 0; i < nchildren; i++) { 3933243502Savg if (i >= count) 3934243502Savg break; 3935243502Savg if (configs[i] == NULL) 3936243502Savg continue; 3937243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 3938243502Savg &nvtop) == 0); 3939243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP); 3940243213Savg } 
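	/*
	 * Fill in placeholder top-level vdevs for any children we did not
	 * find: known holes become VDEV_TYPE_HOLE entries and any remaining
	 * slots become VDEV_TYPE_MISSING, preserving the original vdev ids.
	 */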
3941243502Savg for (i = 0; holes != NULL && i < nholes; i++) { 3942243502Savg if (i >= nchildren) 3943243502Savg continue; 3944243502Savg if (tops[holes[i]] != NULL) 3945243502Savg continue; 3946243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 3947243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 3948243502Savg VDEV_TYPE_HOLE) == 0); 3949243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 3950243502Savg holes[i]) == 0); 3951243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 3952243502Savg 0) == 0); 3953243502Savg } 3954243502Savg for (i = 0; i < nchildren; i++) { 3955243502Savg if (tops[i] != NULL) 3956243502Savg continue; 3957243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 3958243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 3959243502Savg VDEV_TYPE_MISSING) == 0); 3960243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 3961243502Savg i) == 0); 3962243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 3963243502Savg 0) == 0); 3964243502Savg } 3965243213Savg 3966243213Savg /* 3967243502Savg * Create pool config based on the best vdev config. 3968241286Savg */ 3969243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 3970241286Savg 3971241286Savg /* 3972241286Savg * Put this pool's top-level vdevs into a root vdev. 3973241286Savg */ 3974243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3975243502Savg &pgid) == 0); 3976241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3977241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3978241286Savg VDEV_TYPE_ROOT) == 0); 3979241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3980241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3981241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3982243502Savg tops, nchildren) == 0); 3983241286Savg 3984241286Savg /* 3985241286Savg * Replace the existing vdev_tree with the new root vdev in 3986241286Savg * this pool's configuration (remove the old, add the new). 3987241286Savg */ 3988241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3989243502Savg 3990243502Savg /* 3991243502Savg * Drop vdev config elements that should not be present at pool level. 3992243502Savg */ 3993243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 3994243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 3995243502Savg 3996243502Savg for (i = 0; i < count; i++) 3997243502Savg nvlist_free(configs[i]); 3998243502Savg kmem_free(configs, count * sizeof(void *)); 3999243502Savg for (i = 0; i < nchildren; i++) 4000243502Savg nvlist_free(tops[i]); 4001243502Savg kmem_free(tops, nchildren * sizeof(void *)); 4002241286Savg nvlist_free(nvroot); 4003241286Savg return (config); 4004241286Savg} 4005241286Savg 4006241286Savgint 4007241286Savgspa_import_rootpool(const char *name) 4008241286Savg{ 4009241286Savg spa_t *spa; 4010241286Savg vdev_t *rvd, *bvd, *avd = NULL; 4011241286Savg nvlist_t *config, *nvtop; 4012241286Savg uint64_t txg; 4013241286Savg char *pname; 4014241286Savg int error; 4015241286Savg 4016241286Savg /* 4017241286Savg * Read the label from the boot device and generate a configuration. 
4018241286Savg */ 4019241286Savg config = spa_generate_rootconf(name); 4020243213Savg 4021243213Savg mutex_enter(&spa_namespace_lock); 4022243213Savg if (config != NULL) { 4023243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4024243213Savg &pname) == 0 && strcmp(name, pname) == 0); 4025243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 4026243213Savg == 0); 4027243213Savg 4028243213Savg if ((spa = spa_lookup(pname)) != NULL) { 4029243213Savg /* 4030243213Savg * Remove the existing root pool from the namespace so 4031243213Savg * that we can replace it with the correct config 4032243213Savg * we just read in. 4033243213Savg */ 4034243213Savg spa_remove(spa); 4035243213Savg } 4036243213Savg spa = spa_add(pname, config, NULL); 4037243501Savg 4038243501Savg /* 4039243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 4040243501Savg * via spa_version(). 4041243501Savg */ 4042243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4043243501Savg &spa->spa_ubsync.ub_version) != 0) 4044243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4045243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 4046241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 4047241286Savg name); 4048241286Savg return (EIO); 4049243213Savg } else { 4050243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 4051241286Savg } 4052241286Savg spa->spa_is_root = B_TRUE; 4053241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4054241286Savg 4055241286Savg /* 4056241286Savg * Build up a vdev tree based on the boot device's label config. 4057241286Savg */ 4058241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4059241286Savg &nvtop) == 0); 4060241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4061241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4062241286Savg VDEV_ALLOC_ROOTPOOL); 4063241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4064241286Savg if (error) { 4065241286Savg mutex_exit(&spa_namespace_lock); 4066241286Savg nvlist_free(config); 4067241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4068241286Savg pname); 4069241286Savg return (error); 4070241286Savg } 4071241286Savg 4072241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4073241286Savg vdev_free(rvd); 4074241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4075241286Savg mutex_exit(&spa_namespace_lock); 4076241286Savg 4077243213Savg nvlist_free(config); 4078243213Savg return (0); 4079241286Savg} 4080241286Savg 4081241286Savg#endif /* sun */ 4082219089Spjd#endif 4083219089Spjd 4084209962Smm/* 4085209962Smm * Import a non-root pool into the system. 4086209962Smm */ 4087185029Spjdint 4088219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4089185029Spjd{ 4090209962Smm spa_t *spa; 4091209962Smm char *altroot = NULL; 4092219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 4093219089Spjd zpool_rewind_policy_t policy; 4094219089Spjd uint64_t mode = spa_mode_global; 4095219089Spjd uint64_t readonly = B_FALSE; 4096209962Smm int error; 4097209962Smm nvlist_t *nvroot; 4098209962Smm nvlist_t **spares, **l2cache; 4099209962Smm uint_t nspares, nl2cache; 4100209962Smm 4101209962Smm /* 4102209962Smm * If a pool with this name exists, return failure. 
4103209962Smm */ 4104209962Smm mutex_enter(&spa_namespace_lock); 4105219089Spjd if (spa_lookup(pool) != NULL) { 4106209962Smm mutex_exit(&spa_namespace_lock); 4107249195Smm return (SET_ERROR(EEXIST)); 4108209962Smm } 4109209962Smm 4110209962Smm /* 4111209962Smm * Create and initialize the spa structure. 4112209962Smm */ 4113209962Smm (void) nvlist_lookup_string(props, 4114209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4115219089Spjd (void) nvlist_lookup_uint64(props, 4116219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4117219089Spjd if (readonly) 4118219089Spjd mode = FREAD; 4119219089Spjd spa = spa_add(pool, config, altroot); 4120219089Spjd spa->spa_import_flags = flags; 4121209962Smm 4122209962Smm /* 4123219089Spjd * Verbatim import - Take a pool and insert it into the namespace 4124219089Spjd * as if it had been loaded at boot. 4125219089Spjd */ 4126219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4127219089Spjd if (props != NULL) 4128219089Spjd spa_configfile_set(spa, props, B_FALSE); 4129219089Spjd 4130219089Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 4131219089Spjd 4132219089Spjd mutex_exit(&spa_namespace_lock); 4133219089Spjd return (0); 4134219089Spjd } 4135219089Spjd 4136219089Spjd spa_activate(spa, mode); 4137219089Spjd 4138219089Spjd /* 4139209962Smm * Don't start async tasks until we know everything is healthy. 4140209962Smm */ 4141209962Smm spa_async_suspend(spa); 4142209962Smm 4143219089Spjd zpool_get_rewind_policy(config, &policy); 4144219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 4145219089Spjd state = SPA_LOAD_RECOVER; 4146219089Spjd 4147209962Smm /* 4148209962Smm * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4149209962Smm * because the user-supplied config is actually the one to trust when 4150209962Smm * doing an import. 4151209962Smm */ 4152219089Spjd if (state != SPA_LOAD_RECOVER) 4153219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4154209962Smm 4155219089Spjd error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4156219089Spjd policy.zrp_request); 4157219089Spjd 4158219089Spjd /* 4159219089Spjd * Propagate anything learned while loading the pool and pass it 4160219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 4161219089Spjd */ 4162219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4163219089Spjd spa->spa_load_info) == 0); 4164219089Spjd 4165209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4166209962Smm /* 4167209962Smm * Toss any existing sparelist, as it doesn't have any validity 4168209962Smm * anymore, and conflicts with spa_has_spare(). 
4169209962Smm */ 4170209962Smm if (spa->spa_spares.sav_config) { 4171209962Smm nvlist_free(spa->spa_spares.sav_config); 4172209962Smm spa->spa_spares.sav_config = NULL; 4173209962Smm spa_load_spares(spa); 4174209962Smm } 4175209962Smm if (spa->spa_l2cache.sav_config) { 4176209962Smm nvlist_free(spa->spa_l2cache.sav_config); 4177209962Smm spa->spa_l2cache.sav_config = NULL; 4178209962Smm spa_load_l2cache(spa); 4179209962Smm } 4180209962Smm 4181209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4182209962Smm &nvroot) == 0); 4183209962Smm if (error == 0) 4184209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4185209962Smm VDEV_ALLOC_SPARE); 4186209962Smm if (error == 0) 4187209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4188209962Smm VDEV_ALLOC_L2CACHE); 4189209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4190209962Smm 4191209962Smm if (props != NULL) 4192209962Smm spa_configfile_set(spa, props, B_FALSE); 4193209962Smm 4194209962Smm if (error != 0 || (props && spa_writeable(spa) && 4195209962Smm (error = spa_prop_set(spa, props)))) { 4196209962Smm spa_unload(spa); 4197209962Smm spa_deactivate(spa); 4198209962Smm spa_remove(spa); 4199209962Smm mutex_exit(&spa_namespace_lock); 4200209962Smm return (error); 4201209962Smm } 4202209962Smm 4203209962Smm spa_async_resume(spa); 4204209962Smm 4205209962Smm /* 4206209962Smm * Override any spares and level 2 cache devices as specified by 4207209962Smm * the user, as these may have correct device names/devids, etc. 4208209962Smm */ 4209209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4210209962Smm &spares, &nspares) == 0) { 4211209962Smm if (spa->spa_spares.sav_config) 4212209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4213209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4214209962Smm else 4215209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4216209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4217209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4218209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4219209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4220209962Smm spa_load_spares(spa); 4221209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4222209962Smm spa->spa_spares.sav_sync = B_TRUE; 4223209962Smm } 4224209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4225209962Smm &l2cache, &nl2cache) == 0) { 4226209962Smm if (spa->spa_l2cache.sav_config) 4227209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4228209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4229209962Smm else 4230209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4231209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4232209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4233209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4234209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4235209962Smm spa_load_l2cache(spa); 4236209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4237209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 4238209962Smm } 4239209962Smm 4240219089Spjd /* 4241219089Spjd * Check for any removed devices. 4242219089Spjd */ 4243219089Spjd if (spa->spa_autoreplace) { 4244219089Spjd spa_aux_check_removed(&spa->spa_spares); 4245219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 4246219089Spjd } 4247219089Spjd 4248209962Smm if (spa_writeable(spa)) { 4249209962Smm /* 4250209962Smm * Update the config cache to include the newly-imported pool. 
4251209962Smm */ 4252209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4253209962Smm } 4254209962Smm 4255219089Spjd /* 4256219089Spjd * It's possible that the pool was expanded while it was exported. 4257219089Spjd * We kick off an async task to handle this for us. 4258219089Spjd */ 4259219089Spjd spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4260219089Spjd 4261209962Smm mutex_exit(&spa_namespace_lock); 4262248571Smm spa_history_log_version(spa, "import"); 4263209962Smm 4264219089Spjd#ifdef __FreeBSD__ 4265219089Spjd#ifdef _KERNEL 4266219089Spjd zvol_create_minors(pool); 4267219089Spjd#endif 4268219089Spjd#endif 4269209962Smm return (0); 4270185029Spjd} 4271185029Spjd 4272168404Spjdnvlist_t * 4273168404Spjdspa_tryimport(nvlist_t *tryconfig) 4274168404Spjd{ 4275168404Spjd nvlist_t *config = NULL; 4276168404Spjd char *poolname; 4277168404Spjd spa_t *spa; 4278168404Spjd uint64_t state; 4279208443Smm int error; 4280168404Spjd 4281168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4282168404Spjd return (NULL); 4283168404Spjd 4284168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4285168404Spjd return (NULL); 4286168404Spjd 4287168404Spjd /* 4288168404Spjd * Create and initialize the spa structure. 4289168404Spjd */ 4290168404Spjd mutex_enter(&spa_namespace_lock); 4291219089Spjd spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4292209962Smm spa_activate(spa, FREAD); 4293168404Spjd 4294168404Spjd /* 4295168404Spjd * Pass off the heavy lifting to spa_load(). 4296168404Spjd * Pass TRUE for mosconfig because the user-supplied config 4297168404Spjd * is actually the one to trust when doing an import. 4298168404Spjd */ 4299219089Spjd error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4300168404Spjd 4301168404Spjd /* 4302168404Spjd * If 'tryconfig' was at least parsable, return the current config. 4303168404Spjd */ 4304168404Spjd if (spa->spa_root_vdev != NULL) { 4305168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4306168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4307168404Spjd poolname) == 0); 4308168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4309168404Spjd state) == 0); 4310168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4311168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 4312236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4313236884Smm spa->spa_load_info) == 0); 4314168404Spjd 4315168404Spjd /* 4316185029Spjd * If the bootfs property exists on this pool then we 4317185029Spjd * copy it out so that external consumers can tell which 4318185029Spjd * pools are bootable. 4319168404Spjd */ 4320208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 4321185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4322185029Spjd 4323185029Spjd /* 4324185029Spjd * We have to play games with the name since the 4325185029Spjd * pool was opened as TRYIMPORT_NAME. 
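		 * Substitute the real pool name back into the bootfs
		 * dataset name before returning it.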
4326185029Spjd */ 4327185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 4328185029Spjd spa->spa_bootfs, tmpname) == 0) { 4329185029Spjd char *cp; 4330185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4331185029Spjd 4332185029Spjd cp = strchr(tmpname, '/'); 4333185029Spjd if (cp == NULL) { 4334185029Spjd (void) strlcpy(dsname, tmpname, 4335185029Spjd MAXPATHLEN); 4336185029Spjd } else { 4337185029Spjd (void) snprintf(dsname, MAXPATHLEN, 4338185029Spjd "%s/%s", poolname, ++cp); 4339185029Spjd } 4340185029Spjd VERIFY(nvlist_add_string(config, 4341185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4342185029Spjd kmem_free(dsname, MAXPATHLEN); 4343185029Spjd } 4344185029Spjd kmem_free(tmpname, MAXPATHLEN); 4345185029Spjd } 4346185029Spjd 4347185029Spjd /* 4348185029Spjd * Add the list of hot spares and level 2 cache devices. 4349185029Spjd */ 4350209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4351168404Spjd spa_add_spares(spa, config); 4352185029Spjd spa_add_l2cache(spa, config); 4353209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 4354168404Spjd } 4355168404Spjd 4356168404Spjd spa_unload(spa); 4357168404Spjd spa_deactivate(spa); 4358168404Spjd spa_remove(spa); 4359168404Spjd mutex_exit(&spa_namespace_lock); 4360168404Spjd 4361168404Spjd return (config); 4362168404Spjd} 4363168404Spjd 4364168404Spjd/* 4365168404Spjd * Pool export/destroy 4366168404Spjd * 4367168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 4368168404Spjd * is no more pending I/O and any references to the pool are gone. Then, we 4369168404Spjd * update the pool state and sync all the labels to disk, removing the 4370207670Smm * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4371207670Smm * we don't sync the labels or remove the configuration cache. 4372168404Spjd */ 4373168404Spjdstatic int 4374185029Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4375207670Smm boolean_t force, boolean_t hardforce) 4376168404Spjd{ 4377168404Spjd spa_t *spa; 4378168404Spjd 4379168404Spjd if (oldconfig) 4380168404Spjd *oldconfig = NULL; 4381168404Spjd 4382209962Smm if (!(spa_mode_global & FWRITE)) 4383249195Smm return (SET_ERROR(EROFS)); 4384168404Spjd 4385168404Spjd mutex_enter(&spa_namespace_lock); 4386168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 4387168404Spjd mutex_exit(&spa_namespace_lock); 4388249195Smm return (SET_ERROR(ENOENT)); 4389168404Spjd } 4390168404Spjd 4391168404Spjd /* 4392168404Spjd * Put a hold on the pool, drop the namespace lock, stop async tasks, 4393168404Spjd * reacquire the namespace lock, and see if we can export. 4394168404Spjd */ 4395168404Spjd spa_open_ref(spa, FTAG); 4396168404Spjd mutex_exit(&spa_namespace_lock); 4397168404Spjd spa_async_suspend(spa); 4398168404Spjd mutex_enter(&spa_namespace_lock); 4399168404Spjd spa_close(spa, FTAG); 4400168404Spjd 4401168404Spjd /* 4402168404Spjd * The pool will be in core if it's openable, 4403168404Spjd * in which case we can modify its state. 4404168404Spjd */ 4405168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4406168404Spjd /* 4407168404Spjd * Objsets may be open only because they're dirty, so we 4408168404Spjd * have to force it to sync before checking spa_refcnt. 4409168404Spjd */ 4410168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 4411168404Spjd 4412168404Spjd /* 4413168404Spjd * A pool cannot be exported or destroyed if there are active 4414168404Spjd * references. 
If we are resetting a pool, allow references by 4415168404Spjd * fault injection handlers. 4416168404Spjd */ 4417168404Spjd if (!spa_refcount_zero(spa) || 4418168404Spjd (spa->spa_inject_ref != 0 && 4419168404Spjd new_state != POOL_STATE_UNINITIALIZED)) { 4420168404Spjd spa_async_resume(spa); 4421168404Spjd mutex_exit(&spa_namespace_lock); 4422249195Smm return (SET_ERROR(EBUSY)); 4423168404Spjd } 4424168404Spjd 4425185029Spjd /* 4426185029Spjd * A pool cannot be exported if it has an active shared spare. 4427185029Spjd * This is to prevent other pools stealing the active spare 4428185029Spjd * from an exported pool. At user's own will, such pool can 4429185029Spjd * be forcedly exported. 4430185029Spjd */ 4431185029Spjd if (!force && new_state == POOL_STATE_EXPORTED && 4432185029Spjd spa_has_active_shared_spare(spa)) { 4433185029Spjd spa_async_resume(spa); 4434185029Spjd mutex_exit(&spa_namespace_lock); 4435249195Smm return (SET_ERROR(EXDEV)); 4436185029Spjd } 4437168404Spjd 4438168404Spjd /* 4439168404Spjd * We want this to be reflected on every label, 4440168404Spjd * so mark them all dirty. spa_unload() will do the 4441168404Spjd * final sync that pushes these changes out. 4442168404Spjd */ 4443207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4444185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4445168404Spjd spa->spa_state = new_state; 4446219089Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 4447219089Spjd TXG_DEFER_SIZE + 1; 4448168404Spjd vdev_config_dirty(spa->spa_root_vdev); 4449185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4450168404Spjd } 4451168404Spjd } 4452168404Spjd 4453185029Spjd spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4454185029Spjd 4455168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4456168404Spjd spa_unload(spa); 4457168404Spjd spa_deactivate(spa); 4458168404Spjd } 4459168404Spjd 4460168404Spjd if (oldconfig && spa->spa_config) 4461168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4462168404Spjd 4463168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 4464207670Smm if (!hardforce) 4465207670Smm spa_config_sync(spa, B_TRUE, B_TRUE); 4466168404Spjd spa_remove(spa); 4467168404Spjd } 4468168404Spjd mutex_exit(&spa_namespace_lock); 4469168404Spjd 4470168404Spjd return (0); 4471168404Spjd} 4472168404Spjd 4473168404Spjd/* 4474168404Spjd * Destroy a storage pool. 4475168404Spjd */ 4476168404Spjdint 4477168404Spjdspa_destroy(char *pool) 4478168404Spjd{ 4479207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4480207670Smm B_FALSE, B_FALSE)); 4481168404Spjd} 4482168404Spjd 4483168404Spjd/* 4484168404Spjd * Export a storage pool. 4485168404Spjd */ 4486168404Spjdint 4487207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4488207670Smm boolean_t hardforce) 4489168404Spjd{ 4490207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4491207670Smm force, hardforce)); 4492168404Spjd} 4493168404Spjd 4494168404Spjd/* 4495168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 4496168404Spjd * from the namespace in any way. 
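 * This is the POOL_STATE_UNINITIALIZED path that spa_export_common()
 * treats specially above: references held by fault injection handlers
 * are tolerated only when resetting, e.g. so fault-injection tooling can
 * knock a pool over and re-load it in place without a full
 * export/import cycle.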
4497168404Spjd */ 4498168404Spjdint 4499168404Spjdspa_reset(char *pool) 4500168404Spjd{ 4501185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4502207670Smm B_FALSE, B_FALSE)); 4503168404Spjd} 4504168404Spjd 4505168404Spjd/* 4506168404Spjd * ========================================================================== 4507168404Spjd * Device manipulation 4508168404Spjd * ========================================================================== 4509168404Spjd */ 4510168404Spjd 4511168404Spjd/* 4512185029Spjd * Add a device to a storage pool. 4513168404Spjd */ 4514168404Spjdint 4515168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4516168404Spjd{ 4517219089Spjd uint64_t txg, id; 4518209962Smm int error; 4519168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4520168404Spjd vdev_t *vd, *tvd; 4521185029Spjd nvlist_t **spares, **l2cache; 4522185029Spjd uint_t nspares, nl2cache; 4523168404Spjd 4524219089Spjd ASSERT(spa_writeable(spa)); 4525219089Spjd 4526168404Spjd txg = spa_vdev_enter(spa); 4527168404Spjd 4528168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4529168404Spjd VDEV_ALLOC_ADD)) != 0) 4530168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 4531168404Spjd 4532185029Spjd spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4533168404Spjd 4534185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4535185029Spjd &nspares) != 0) 4536168404Spjd nspares = 0; 4537168404Spjd 4538185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4539185029Spjd &nl2cache) != 0) 4540185029Spjd nl2cache = 0; 4541185029Spjd 4542185029Spjd if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4543168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4544168404Spjd 4545185029Spjd if (vd->vdev_children != 0 && 4546185029Spjd (error = vdev_create(vd, txg, B_FALSE)) != 0) 4547185029Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4548168404Spjd 4549168404Spjd /* 4550185029Spjd * We must validate the spares and l2cache devices after checking the 4551185029Spjd * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4552168404Spjd */ 4553185029Spjd if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4554168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4555168404Spjd 4556168404Spjd /* 4557168404Spjd * Transfer each new top-level vdev from vd to rvd. 4558168404Spjd */ 4559209962Smm for (int c = 0; c < vd->vdev_children; c++) { 4560219089Spjd 4561219089Spjd /* 4562219089Spjd * Set the vdev id to the first hole, if one exists. 
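 * A "hole" is the placeholder top-level vdev (vdev_hole_ops) left behind
 * when a top-level device such as a slog was removed; reusing its slot
 * keeps the ids of the surviving top-level vdevs stable.  If no hole is
 * found, id ends up equal to rvd->vdev_children and the new vdev is
 * simply appended.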
4563219089Spjd */ 4564219089Spjd for (id = 0; id < rvd->vdev_children; id++) { 4565219089Spjd if (rvd->vdev_child[id]->vdev_ishole) { 4566219089Spjd vdev_free(rvd->vdev_child[id]); 4567219089Spjd break; 4568219089Spjd } 4569219089Spjd } 4570168404Spjd tvd = vd->vdev_child[c]; 4571168404Spjd vdev_remove_child(vd, tvd); 4572219089Spjd tvd->vdev_id = id; 4573168404Spjd vdev_add_child(rvd, tvd); 4574168404Spjd vdev_config_dirty(tvd); 4575168404Spjd } 4576168404Spjd 4577168404Spjd if (nspares != 0) { 4578185029Spjd spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4579185029Spjd ZPOOL_CONFIG_SPARES); 4580168404Spjd spa_load_spares(spa); 4581185029Spjd spa->spa_spares.sav_sync = B_TRUE; 4582168404Spjd } 4583168404Spjd 4584185029Spjd if (nl2cache != 0) { 4585185029Spjd spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4586185029Spjd ZPOOL_CONFIG_L2CACHE); 4587185029Spjd spa_load_l2cache(spa); 4588185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 4589185029Spjd } 4590185029Spjd 4591168404Spjd /* 4592168404Spjd * We have to be careful when adding new vdevs to an existing pool. 4593168404Spjd * If other threads start allocating from these vdevs before we 4594168404Spjd * sync the config cache, and we lose power, then upon reboot we may 4595168404Spjd * fail to open the pool because there are DVAs that the config cache 4596168404Spjd * can't translate. Therefore, we first add the vdevs without 4597168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4598168404Spjd * and then let spa_config_update() initialize the new metaslabs. 4599168404Spjd * 4600168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 4601168404Spjd * if we lose power at any point in this sequence, the remaining 4602168404Spjd * steps will be completed the next time we load the pool. 4603168404Spjd */ 4604168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 4605168404Spjd 4606168404Spjd mutex_enter(&spa_namespace_lock); 4607168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4608168404Spjd mutex_exit(&spa_namespace_lock); 4609168404Spjd 4610168404Spjd return (0); 4611168404Spjd} 4612168404Spjd 4613168404Spjd/* 4614168404Spjd * Attach a device to a mirror. The arguments are the path to any device 4615168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 4616168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 4617168404Spjd * 4618168404Spjd * If 'replacing' is specified, the new device is intended to replace the 4619168404Spjd * existing device; in this case the two devices are made into their own 4620185029Spjd * mirror using the 'replacing' vdev, which is functionally identical to 4621168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 4622168404Spjd * extra rules: you can't attach to it after it's been created, and upon 4623168404Spjd * completion of resilvering, the first disk (the one being replaced) 4624168404Spjd * is automatically detached. 
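 * For example (illustrative device names): attaching c1t1d0 to a pool
 * whose only top-level vdev is the single disk c1t0d0 produces
 * mirror(c1t0d0, c1t1d0) and both disks remain; replacing c1t0d0 with
 * c1t1d0 instead produces replacing(c1t0d0, c1t1d0), and c1t0d0 is
 * detached automatically once c1t1d0 has finished resilvering.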
4625168404Spjd */ 4626168404Spjdint 4627168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4628168404Spjd{ 4629219089Spjd uint64_t txg, dtl_max_txg; 4630168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4631168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4632168404Spjd vdev_ops_t *pvops; 4633185029Spjd char *oldvdpath, *newvdpath; 4634185029Spjd int newvd_isspare; 4635185029Spjd int error; 4636168404Spjd 4637219089Spjd ASSERT(spa_writeable(spa)); 4638219089Spjd 4639168404Spjd txg = spa_vdev_enter(spa); 4640168404Spjd 4641185029Spjd oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4642168404Spjd 4643168404Spjd if (oldvd == NULL) 4644168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4645168404Spjd 4646168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 4647168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4648168404Spjd 4649168404Spjd pvd = oldvd->vdev_parent; 4650168404Spjd 4651168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4652230514Smm VDEV_ALLOC_ATTACH)) != 0) 4653185029Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4654185029Spjd 4655185029Spjd if (newrootvd->vdev_children != 1) 4656168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4657168404Spjd 4658168404Spjd newvd = newrootvd->vdev_child[0]; 4659168404Spjd 4660168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 4661168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4662168404Spjd 4663168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4664168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 4665168404Spjd 4666185029Spjd /* 4667185029Spjd * Spares can't replace logs 4668185029Spjd */ 4669185029Spjd if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4670185029Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4671185029Spjd 4672168404Spjd if (!replacing) { 4673168404Spjd /* 4674168404Spjd * For attach, the only allowable parent is a mirror or the root 4675168404Spjd * vdev. 4676168404Spjd */ 4677168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 4678168404Spjd pvd->vdev_ops != &vdev_root_ops) 4679168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4680168404Spjd 4681168404Spjd pvops = &vdev_mirror_ops; 4682168404Spjd } else { 4683168404Spjd /* 4684168404Spjd * Active hot spares can only be replaced by inactive hot 4685168404Spjd * spares. 4686168404Spjd */ 4687168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4688219089Spjd oldvd->vdev_isspare && 4689168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 4690168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4691168404Spjd 4692168404Spjd /* 4693168404Spjd * If the source is a hot spare, and the parent isn't already a 4694168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 4695168404Spjd * want to create a replacing vdev. The user is not allowed to 4696168404Spjd * attach to a spared vdev child unless the 'isspare' state is 4697168404Spjd * the same (spare replaces spare, non-spare replaces 4698168404Spjd * non-spare). 
4699168404Spjd */ 4700219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && 4701219089Spjd spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4702168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4703219089Spjd } else if (pvd->vdev_ops == &vdev_spare_ops && 4704219089Spjd newvd->vdev_isspare != oldvd->vdev_isspare) { 4705168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4706219089Spjd } 4707219089Spjd 4708219089Spjd if (newvd->vdev_isspare) 4709168404Spjd pvops = &vdev_spare_ops; 4710168404Spjd else 4711168404Spjd pvops = &vdev_replacing_ops; 4712168404Spjd } 4713168404Spjd 4714168404Spjd /* 4715219089Spjd * Make sure the new device is big enough. 4716168404Spjd */ 4717219089Spjd if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4718168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4719168404Spjd 4720168404Spjd /* 4721168404Spjd * The new device cannot have a higher alignment requirement 4722168404Spjd * than the top-level vdev. 4723168404Spjd */ 4724168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4725168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4726168404Spjd 4727168404Spjd /* 4728168404Spjd * If this is an in-place replacement, update oldvd's path and devid 4729168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 4730168404Spjd */ 4731168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4732168404Spjd spa_strfree(oldvd->vdev_path); 4733168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4734168404Spjd KM_SLEEP); 4735168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 4736168404Spjd newvd->vdev_path, "old"); 4737168404Spjd if (oldvd->vdev_devid != NULL) { 4738168404Spjd spa_strfree(oldvd->vdev_devid); 4739168404Spjd oldvd->vdev_devid = NULL; 4740168404Spjd } 4741168404Spjd } 4742168404Spjd 4743219089Spjd /* mark the device being resilvered */ 4744254112Sdelphij newvd->vdev_resilver_txg = txg; 4745219089Spjd 4746168404Spjd /* 4747168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 4748168404Spjd * mirror/replacing/spare vdev above oldvd. 4749168404Spjd */ 4750168404Spjd if (pvd->vdev_ops != pvops) 4751168404Spjd pvd = vdev_add_parent(oldvd, pvops); 4752168404Spjd 4753168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 4754168404Spjd ASSERT(pvd->vdev_ops == pvops); 4755168404Spjd ASSERT(oldvd->vdev_parent == pvd); 4756168404Spjd 4757168404Spjd /* 4758168404Spjd * Extract the new device from its root and add it to pvd. 4759168404Spjd */ 4760168404Spjd vdev_remove_child(newrootvd, newvd); 4761168404Spjd newvd->vdev_id = pvd->vdev_children; 4762219089Spjd newvd->vdev_crtxg = oldvd->vdev_crtxg; 4763168404Spjd vdev_add_child(pvd, newvd); 4764168404Spjd 4765168404Spjd tvd = newvd->vdev_top; 4766168404Spjd ASSERT(pvd->vdev_top == tvd); 4767168404Spjd ASSERT(tvd->vdev_parent == rvd); 4768168404Spjd 4769168404Spjd vdev_config_dirty(tvd); 4770168404Spjd 4771168404Spjd /* 4772219089Spjd * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4773219089Spjd * for any dmu_sync-ed blocks. It will propagate upward when 4774219089Spjd * spa_vdev_exit() calls vdev_dtl_reassess(). 
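 * dtl_max_txg is set just below to txg + TXG_CONCURRENT_STATES, i.e. far
 * enough past the attach txg to cover any still-open txg that may hold
 * dmu_sync-ed writes issued before the attach commits; the resilver
 * scheduled further down then repairs that range on newvd.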
4775168404Spjd */ 4776219089Spjd dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4777168404Spjd 4778219089Spjd vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4779219089Spjd dtl_max_txg - TXG_INITIAL); 4780168404Spjd 4781209962Smm if (newvd->vdev_isspare) { 4782168404Spjd spa_spare_activate(newvd); 4783209962Smm spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4784209962Smm } 4785209962Smm 4786185029Spjd oldvdpath = spa_strdup(oldvd->vdev_path); 4787185029Spjd newvdpath = spa_strdup(newvd->vdev_path); 4788185029Spjd newvd_isspare = newvd->vdev_isspare; 4789168404Spjd 4790168404Spjd /* 4791168404Spjd * Mark newvd's DTL dirty in this txg. 4792168404Spjd */ 4793168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 4794168404Spjd 4795219089Spjd /* 4796258717Savg * Schedule the resilver to restart in the future. We do this to 4797258717Savg * ensure that dmu_sync-ed blocks have been stitched into the 4798258717Savg * respective datasets. 4799219089Spjd */ 4800219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4801168404Spjd 4802219089Spjd /* 4803219089Spjd * Commit the config 4804219089Spjd */ 4805219089Spjd (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4806185029Spjd 4807248571Smm spa_history_log_internal(spa, "vdev attach", NULL, 4808219089Spjd "%s vdev=%s %s vdev=%s", 4809219089Spjd replacing && newvd_isspare ? "spare in" : 4810219089Spjd replacing ? "replace" : "attach", newvdpath, 4811219089Spjd replacing ? "for" : "to", oldvdpath); 4812219089Spjd 4813185029Spjd spa_strfree(oldvdpath); 4814185029Spjd spa_strfree(newvdpath); 4815185029Spjd 4816219089Spjd if (spa->spa_bootfs) 4817219089Spjd spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4818168404Spjd 4819168404Spjd return (0); 4820168404Spjd} 4821168404Spjd 4822168404Spjd/* 4823168404Spjd * Detach a device from a mirror or replacing vdev. 4824251631Sdelphij * 4825168404Spjd * If 'replace_done' is specified, only detach if the parent 4826168404Spjd * is a replacing vdev. 4827168404Spjd */ 4828168404Spjdint 4829209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4830168404Spjd{ 4831168404Spjd uint64_t txg; 4832209962Smm int error; 4833168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4834168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 4835168404Spjd boolean_t unspare = B_FALSE; 4836247187Smm uint64_t unspare_guid = 0; 4837219089Spjd char *vdpath; 4838168404Spjd 4839219089Spjd ASSERT(spa_writeable(spa)); 4840219089Spjd 4841168404Spjd txg = spa_vdev_enter(spa); 4842168404Spjd 4843185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4844168404Spjd 4845168404Spjd if (vd == NULL) 4846168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4847168404Spjd 4848168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 4849168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4850168404Spjd 4851168404Spjd pvd = vd->vdev_parent; 4852168404Spjd 4853168404Spjd /* 4854209962Smm * If the parent/child relationship is not as expected, don't do it. 4855209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4856209962Smm * vdev that's replacing B with C. The user's intent in replacing 4857209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 4858209962Smm * the replace by detaching C, the expected behavior is to end up 4859209962Smm * M(A,B). But suppose that right after deciding to detach C, 4860209962Smm * the replacement of B completes. 
We would have M(A,C), and then 4861209962Smm * ask to detach C, which would leave us with just A -- not what 4862209962Smm * the user wanted. To prevent this, we make sure that the 4863209962Smm * parent/child relationship hasn't changed -- in this example, 4864209962Smm * that C's parent is still the replacing vdev R. 4865209962Smm */ 4866209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 4867209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4868209962Smm 4869209962Smm /* 4870219089Spjd * Only 'replacing' or 'spare' vdevs can be replaced. 4871168404Spjd */ 4872219089Spjd if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4873219089Spjd pvd->vdev_ops != &vdev_spare_ops) 4874219089Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4875168404Spjd 4876168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4877185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 4878168404Spjd 4879168404Spjd /* 4880168404Spjd * Only mirror, replacing, and spare vdevs support detach. 4881168404Spjd */ 4882168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 4883168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 4884168404Spjd pvd->vdev_ops != &vdev_spare_ops) 4885168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4886168404Spjd 4887168404Spjd /* 4888209962Smm * If this device has the only valid copy of some data, 4889209962Smm * we cannot safely detach it. 4890168404Spjd */ 4891209962Smm if (vdev_dtl_required(vd)) 4892168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4893168404Spjd 4894209962Smm ASSERT(pvd->vdev_children >= 2); 4895168404Spjd 4896168404Spjd /* 4897185029Spjd * If we are detaching the second disk from a replacing vdev, then 4898185029Spjd * check to see if we changed the original vdev's path to have "/old" 4899185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 4900168404Spjd */ 4901219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4902219089Spjd vd->vdev_path != NULL) { 4903219089Spjd size_t len = strlen(vd->vdev_path); 4904219089Spjd 4905219089Spjd for (int c = 0; c < pvd->vdev_children; c++) { 4906219089Spjd cvd = pvd->vdev_child[c]; 4907219089Spjd 4908219089Spjd if (cvd == vd || cvd->vdev_path == NULL) 4909219089Spjd continue; 4910219089Spjd 4911219089Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4912219089Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 4913219089Spjd spa_strfree(cvd->vdev_path); 4914219089Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 4915219089Spjd break; 4916219089Spjd } 4917185029Spjd } 4918185029Spjd } 4919168404Spjd 4920168404Spjd /* 4921168404Spjd * If we are detaching the original disk from a spare, then it implies 4922168404Spjd * that the spare should become a real disk, and be removed from the 4923168404Spjd * active spare list for the pool. 4924168404Spjd */ 4925168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4926219089Spjd vd->vdev_id == 0 && 4927219089Spjd pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4928168404Spjd unspare = B_TRUE; 4929168404Spjd 4930168404Spjd /* 4931168404Spjd * Erase the disk labels so the disk can be used for other things. 4932168404Spjd * This must be done after all other error cases are handled, 4933168404Spjd * but before we disembowel vd (so we can still do I/O to it). 4934168404Spjd * But if we can't do it, don't treat the error as fatal -- 4935168404Spjd * it may be that the unwritability of the disk is the reason 4936168404Spjd * it's being detached! 
4937168404Spjd */ 4938168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4939168404Spjd 4940168404Spjd /* 4941168404Spjd * Remove vd from its parent and compact the parent's children. 4942168404Spjd */ 4943168404Spjd vdev_remove_child(pvd, vd); 4944168404Spjd vdev_compact_children(pvd); 4945168404Spjd 4946168404Spjd /* 4947168404Spjd * Remember one of the remaining children so we can get tvd below. 4948168404Spjd */ 4949219089Spjd cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4950168404Spjd 4951168404Spjd /* 4952168404Spjd * If we need to remove the remaining child from the list of hot spares, 4953209962Smm * do it now, marking the vdev as no longer a spare in the process. 4954209962Smm * We must do this before vdev_remove_parent(), because that can 4955209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 4956209962Smm * reason, we must remove the spare now, in the same txg as the detach; 4957209962Smm * otherwise someone could attach a new sibling, change the GUID, and 4958209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4959168404Spjd */ 4960168404Spjd if (unspare) { 4961168404Spjd ASSERT(cvd->vdev_isspare); 4962168404Spjd spa_spare_remove(cvd); 4963168404Spjd unspare_guid = cvd->vdev_guid; 4964209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4965219089Spjd cvd->vdev_unspare = B_TRUE; 4966168404Spjd } 4967168404Spjd 4968168404Spjd /* 4969168404Spjd * If the parent mirror/replacing vdev only has one child, 4970168404Spjd * the parent is no longer needed. Remove it from the tree. 4971168404Spjd */ 4972219089Spjd if (pvd->vdev_children == 1) { 4973219089Spjd if (pvd->vdev_ops == &vdev_spare_ops) 4974219089Spjd cvd->vdev_unspare = B_FALSE; 4975168404Spjd vdev_remove_parent(cvd); 4976219089Spjd } 4977168404Spjd 4978219089Spjd 4979168404Spjd /* 4980168404Spjd * We don't set tvd until now because the parent we just removed 4981168404Spjd * may have been the previous top-level vdev. 4982168404Spjd */ 4983168404Spjd tvd = cvd->vdev_top; 4984168404Spjd ASSERT(tvd->vdev_parent == rvd); 4985168404Spjd 4986168404Spjd /* 4987168404Spjd * Reevaluate the parent vdev state. 4988168404Spjd */ 4989185029Spjd vdev_propagate_state(cvd); 4990168404Spjd 4991168404Spjd /* 4992219089Spjd * If the 'autoexpand' property is set on the pool then automatically 4993219089Spjd * try to expand the size of the pool. For example if the device we 4994219089Spjd * just detached was smaller than the others, it may be possible to 4995219089Spjd * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4996219089Spjd * first so that we can obtain the updated sizes of the leaf vdevs. 4997168404Spjd */ 4998219089Spjd if (spa->spa_autoexpand) { 4999219089Spjd vdev_reopen(tvd); 5000219089Spjd vdev_expand(tvd, txg); 5001219089Spjd } 5002168404Spjd 5003168404Spjd vdev_config_dirty(tvd); 5004168404Spjd 5005168404Spjd /* 5006168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 5007168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 5008168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 5009168404Spjd * prevent vd from being accessed after it's freed. 
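 * The loop below therefore walks every per-txg DTL list (all TXG_SIZE of
 * them) and unlinks vd from each before vdev_dirty() re-queues it for
 * this txg only.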
5010168404Spjd */ 5011219089Spjd vdpath = spa_strdup(vd->vdev_path); 5012209962Smm for (int t = 0; t < TXG_SIZE; t++) 5013168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5014168404Spjd vd->vdev_detached = B_TRUE; 5015168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 5016168404Spjd 5017185029Spjd spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 5018185029Spjd 5019219089Spjd /* hang on to the spa before we release the lock */ 5020219089Spjd spa_open_ref(spa, FTAG); 5021219089Spjd 5022168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 5023168404Spjd 5024248571Smm spa_history_log_internal(spa, "detach", NULL, 5025219089Spjd "vdev=%s", vdpath); 5026219089Spjd spa_strfree(vdpath); 5027219089Spjd 5028168404Spjd /* 5029168404Spjd * If this was the removal of the original device in a hot spare vdev, 5030168404Spjd * then we want to go through and remove the device from the hot spare 5031168404Spjd * list of every other pool. 5032168404Spjd */ 5033168404Spjd if (unspare) { 5034219089Spjd spa_t *altspa = NULL; 5035219089Spjd 5036168404Spjd mutex_enter(&spa_namespace_lock); 5037219089Spjd while ((altspa = spa_next(altspa)) != NULL) { 5038219089Spjd if (altspa->spa_state != POOL_STATE_ACTIVE || 5039219089Spjd altspa == spa) 5040168404Spjd continue; 5041219089Spjd 5042219089Spjd spa_open_ref(altspa, FTAG); 5043185029Spjd mutex_exit(&spa_namespace_lock); 5044219089Spjd (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5045185029Spjd mutex_enter(&spa_namespace_lock); 5046219089Spjd spa_close(altspa, FTAG); 5047168404Spjd } 5048168404Spjd mutex_exit(&spa_namespace_lock); 5049219089Spjd 5050219089Spjd /* search the rest of the vdevs for spares to remove */ 5051219089Spjd spa_vdev_resilver_done(spa); 5052168404Spjd } 5053168404Spjd 5054219089Spjd /* all done with the spa; OK to release */ 5055219089Spjd mutex_enter(&spa_namespace_lock); 5056219089Spjd spa_close(spa, FTAG); 5057219089Spjd mutex_exit(&spa_namespace_lock); 5058219089Spjd 5059168404Spjd return (error); 5060168404Spjd} 5061168404Spjd 5062219089Spjd/* 5063219089Spjd * Split a set of devices from their mirrors, and create a new pool from them. 
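 * For example (illustrative layout): a pool built from two 2-way mirrors,
 * mirror(A,B) and mirror(C,D), can be split by taking B and D into a new
 * single-copy pool while the original keeps running on A and C.  Spares,
 * cache devices, slogs and holes are never part of the split; the checks
 * below enforce that only healthy, writeable mirror children qualify.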
5064219089Spjd */ 5065219089Spjdint 5066219089Spjdspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5067219089Spjd nvlist_t *props, boolean_t exp) 5068219089Spjd{ 5069219089Spjd int error = 0; 5070219089Spjd uint64_t txg, *glist; 5071219089Spjd spa_t *newspa; 5072219089Spjd uint_t c, children, lastlog; 5073219089Spjd nvlist_t **child, *nvl, *tmp; 5074219089Spjd dmu_tx_t *tx; 5075219089Spjd char *altroot = NULL; 5076219089Spjd vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5077219089Spjd boolean_t activate_slog; 5078219089Spjd 5079219089Spjd ASSERT(spa_writeable(spa)); 5080219089Spjd 5081219089Spjd txg = spa_vdev_enter(spa); 5082219089Spjd 5083219089Spjd /* clear the log and flush everything up to now */ 5084219089Spjd activate_slog = spa_passivate_log(spa); 5085219089Spjd (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5086219089Spjd error = spa_offline_log(spa); 5087219089Spjd txg = spa_vdev_config_enter(spa); 5088219089Spjd 5089219089Spjd if (activate_slog) 5090219089Spjd spa_activate_log(spa); 5091219089Spjd 5092219089Spjd if (error != 0) 5093219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5094219089Spjd 5095219089Spjd /* check new spa name before going any further */ 5096219089Spjd if (spa_lookup(newname) != NULL) 5097219089Spjd return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5098219089Spjd 5099219089Spjd /* 5100219089Spjd * scan through all the children to ensure they're all mirrors 5101219089Spjd */ 5102219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5103219089Spjd nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5104219089Spjd &children) != 0) 5105219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5106219089Spjd 5107219089Spjd /* first, check to ensure we've got the right child count */ 5108219089Spjd rvd = spa->spa_root_vdev; 5109219089Spjd lastlog = 0; 5110219089Spjd for (c = 0; c < rvd->vdev_children; c++) { 5111219089Spjd vdev_t *vd = rvd->vdev_child[c]; 5112219089Spjd 5113219089Spjd /* don't count the holes & logs as children */ 5114219089Spjd if (vd->vdev_islog || vd->vdev_ishole) { 5115219089Spjd if (lastlog == 0) 5116219089Spjd lastlog = c; 5117219089Spjd continue; 5118219089Spjd } 5119219089Spjd 5120219089Spjd lastlog = 0; 5121219089Spjd } 5122219089Spjd if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 5123219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5124219089Spjd 5125219089Spjd /* next, ensure no spare or cache devices are part of the split */ 5126219089Spjd if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5127219089Spjd nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5128219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5129219089Spjd 5130219089Spjd vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5131219089Spjd glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5132219089Spjd 5133219089Spjd /* then, loop over each vdev and validate it */ 5134219089Spjd for (c = 0; c < children; c++) { 5135219089Spjd uint64_t is_hole = 0; 5136219089Spjd 5137219089Spjd (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5138219089Spjd &is_hole); 5139219089Spjd 5140219089Spjd if (is_hole != 0) { 5141219089Spjd if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5142219089Spjd spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5143219089Spjd continue; 5144219089Spjd } else { 5145249195Smm error = SET_ERROR(EINVAL); 5146219089Spjd break; 5147219089Spjd } 5148219089Spjd } 5149219089Spjd 5150219089Spjd /* which disk is going to be split? */ 5151219089Spjd if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5152219089Spjd &glist[c]) != 0) { 5153249195Smm error = SET_ERROR(EINVAL); 5154219089Spjd break; 5155219089Spjd } 5156219089Spjd 5157219089Spjd /* look it up in the spa */ 5158219089Spjd vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5159219089Spjd if (vml[c] == NULL) { 5160249195Smm error = SET_ERROR(ENODEV); 5161219089Spjd break; 5162219089Spjd } 5163219089Spjd 5164219089Spjd /* make sure there's nothing stopping the split */ 5165219089Spjd if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5166219089Spjd vml[c]->vdev_islog || 5167219089Spjd vml[c]->vdev_ishole || 5168219089Spjd vml[c]->vdev_isspare || 5169219089Spjd vml[c]->vdev_isl2cache || 5170219089Spjd !vdev_writeable(vml[c]) || 5171219089Spjd vml[c]->vdev_children != 0 || 5172219089Spjd vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5173219089Spjd c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5174249195Smm error = SET_ERROR(EINVAL); 5175219089Spjd break; 5176219089Spjd } 5177219089Spjd 5178219089Spjd if (vdev_dtl_required(vml[c])) { 5179249195Smm error = SET_ERROR(EBUSY); 5180219089Spjd break; 5181219089Spjd } 5182219089Spjd 5183219089Spjd /* we need certain info from the top level */ 5184219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5185219089Spjd vml[c]->vdev_top->vdev_ms_array) == 0); 5186219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5187219089Spjd vml[c]->vdev_top->vdev_ms_shift) == 0); 5188219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5189219089Spjd vml[c]->vdev_top->vdev_asize) == 0); 5190219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5191219089Spjd vml[c]->vdev_top->vdev_ashift) == 0); 5192219089Spjd } 5193219089Spjd 5194219089Spjd if (error != 0) { 5195219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5196219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5197219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5198219089Spjd } 5199219089Spjd 5200219089Spjd /* stop writers from using the disks */ 5201219089Spjd for (c = 0; c < children; c++) { 5202219089Spjd if (vml[c] != NULL) 5203219089Spjd vml[c]->vdev_offline = B_TRUE; 5204219089Spjd } 5205219089Spjd vdev_reopen(spa->spa_root_vdev); 
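	/*
	 * The reopen above is what makes the vdev_offline flags take
	 * effect: each marked child is closed and reported offline, so the
	 * mirrors stop issuing new writes to the halves that are about to
	 * be split away.
	 */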
5206219089Spjd 5207219089Spjd /* 5208219089Spjd * Temporarily record the splitting vdevs in the spa config. This 5209219089Spjd * will disappear once the config is regenerated. 5210219089Spjd */ 5211219089Spjd VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5212219089Spjd VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5213219089Spjd glist, children) == 0); 5214219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5215219089Spjd 5216219089Spjd mutex_enter(&spa->spa_props_lock); 5217219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5218219089Spjd nvl) == 0); 5219219089Spjd mutex_exit(&spa->spa_props_lock); 5220219089Spjd spa->spa_config_splitting = nvl; 5221219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5222219089Spjd 5223219089Spjd /* configure and create the new pool */ 5224219089Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5225219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5226219089Spjd exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5227219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5228219089Spjd spa_version(spa)) == 0); 5229219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5230219089Spjd spa->spa_config_txg) == 0); 5231219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5232219089Spjd spa_generate_guid(NULL)) == 0); 5233219089Spjd (void) nvlist_lookup_string(props, 5234219089Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5235219089Spjd 5236219089Spjd /* add the new pool to the namespace */ 5237219089Spjd newspa = spa_add(newname, config, altroot); 5238219089Spjd newspa->spa_config_txg = spa->spa_config_txg; 5239219089Spjd spa_set_log_state(newspa, SPA_LOG_CLEAR); 5240219089Spjd 5241219089Spjd /* release the spa config lock, retaining the namespace lock */ 5242219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5243219089Spjd 5244219089Spjd if (zio_injection_enabled) 5245219089Spjd zio_handle_panic_injection(spa, FTAG, 1); 5246219089Spjd 5247219089Spjd spa_activate(newspa, spa_mode_global); 5248219089Spjd spa_async_suspend(newspa); 5249219089Spjd 5250219089Spjd#ifndef sun 5251219089Spjd /* mark that we are creating new spa by splitting */ 5252219089Spjd newspa->spa_splitting_newspa = B_TRUE; 5253219089Spjd#endif 5254219089Spjd /* create the new pool from the disks of the original pool */ 5255219089Spjd error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5256219089Spjd#ifndef sun 5257219089Spjd newspa->spa_splitting_newspa = B_FALSE; 5258219089Spjd#endif 5259219089Spjd if (error) 5260219089Spjd goto out; 5261219089Spjd 5262219089Spjd /* if that worked, generate a real config for the new pool */ 5263219089Spjd if (newspa->spa_root_vdev != NULL) { 5264219089Spjd VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5265219089Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 5266219089Spjd VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5267219089Spjd ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5268219089Spjd spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5269219089Spjd B_TRUE)); 5270219089Spjd } 5271219089Spjd 5272219089Spjd /* set the props */ 5273219089Spjd if (props != NULL) { 5274219089Spjd spa_configfile_set(newspa, props, B_FALSE); 5275219089Spjd error = spa_prop_set(newspa, props); 5276219089Spjd if (error) 5277219089Spjd goto out; 5278219089Spjd } 5279219089Spjd 5280219089Spjd /* flush everything */ 5281219089Spjd txg = spa_vdev_config_enter(newspa); 5282219089Spjd 
vdev_config_dirty(newspa->spa_root_vdev); 5283219089Spjd (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5284219089Spjd 5285219089Spjd if (zio_injection_enabled) 5286219089Spjd zio_handle_panic_injection(spa, FTAG, 2); 5287219089Spjd 5288219089Spjd spa_async_resume(newspa); 5289219089Spjd 5290219089Spjd /* finally, update the original pool's config */ 5291219089Spjd txg = spa_vdev_config_enter(spa); 5292219089Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5293219089Spjd error = dmu_tx_assign(tx, TXG_WAIT); 5294219089Spjd if (error != 0) 5295219089Spjd dmu_tx_abort(tx); 5296219089Spjd for (c = 0; c < children; c++) { 5297219089Spjd if (vml[c] != NULL) { 5298219089Spjd vdev_split(vml[c]); 5299219089Spjd if (error == 0) 5300248571Smm spa_history_log_internal(spa, "detach", tx, 5301248571Smm "vdev=%s", vml[c]->vdev_path); 5302219089Spjd vdev_free(vml[c]); 5303219089Spjd } 5304219089Spjd } 5305219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5306219089Spjd spa->spa_config_splitting = NULL; 5307219089Spjd nvlist_free(nvl); 5308219089Spjd if (error == 0) 5309219089Spjd dmu_tx_commit(tx); 5310219089Spjd (void) spa_vdev_exit(spa, NULL, txg, 0); 5311219089Spjd 5312219089Spjd if (zio_injection_enabled) 5313219089Spjd zio_handle_panic_injection(spa, FTAG, 3); 5314219089Spjd 5315219089Spjd /* split is complete; log a history record */ 5316248571Smm spa_history_log_internal(newspa, "split", NULL, 5317248571Smm "from pool %s", spa_name(spa)); 5318219089Spjd 5319219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5320219089Spjd 5321219089Spjd /* if we're not going to mount the filesystems in userland, export */ 5322219089Spjd if (exp) 5323219089Spjd error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5324219089Spjd B_FALSE, B_FALSE); 5325219089Spjd 5326219089Spjd return (error); 5327219089Spjd 5328219089Spjdout: 5329219089Spjd spa_unload(newspa); 5330219089Spjd spa_deactivate(newspa); 5331219089Spjd spa_remove(newspa); 5332219089Spjd 5333219089Spjd txg = spa_vdev_config_enter(spa); 5334219089Spjd 5335219089Spjd /* re-online all offlined disks */ 5336219089Spjd for (c = 0; c < children; c++) { 5337219089Spjd if (vml[c] != NULL) 5338219089Spjd vml[c]->vdev_offline = B_FALSE; 5339219089Spjd } 5340219089Spjd vdev_reopen(spa->spa_root_vdev); 5341219089Spjd 5342219089Spjd nvlist_free(spa->spa_config_splitting); 5343219089Spjd spa->spa_config_splitting = NULL; 5344219089Spjd (void) spa_vdev_exit(spa, NULL, txg, error); 5345219089Spjd 5346219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5347219089Spjd return (error); 5348219089Spjd} 5349219089Spjd 5350185029Spjdstatic nvlist_t * 5351185029Spjdspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5352185029Spjd{ 5353185029Spjd for (int i = 0; i < count; i++) { 5354185029Spjd uint64_t guid; 5355185029Spjd 5356185029Spjd VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5357185029Spjd &guid) == 0); 5358185029Spjd 5359185029Spjd if (guid == target_guid) 5360185029Spjd return (nvpp[i]); 5361185029Spjd } 5362185029Spjd 5363185029Spjd return (NULL); 5364185029Spjd} 5365185029Spjd 5366185029Spjdstatic void 5367185029Spjdspa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5368185029Spjd nvlist_t *dev_to_remove) 5369185029Spjd{ 5370185029Spjd nvlist_t **newdev = NULL; 5371185029Spjd 5372185029Spjd if (count > 1) 5373185029Spjd newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5374185029Spjd 5375185029Spjd for (int i = 0, j = 0; i < count; i++) { 5376185029Spjd if 
(dev[i] == dev_to_remove) 5377185029Spjd continue; 5378185029Spjd VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5379185029Spjd } 5380185029Spjd 5381185029Spjd VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5382185029Spjd VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5383185029Spjd 5384185029Spjd for (int i = 0; i < count - 1; i++) 5385185029Spjd nvlist_free(newdev[i]); 5386185029Spjd 5387185029Spjd if (count > 1) 5388185029Spjd kmem_free(newdev, (count - 1) * sizeof (void *)); 5389185029Spjd} 5390185029Spjd 5391168404Spjd/* 5392219089Spjd * Evacuate the device. 5393219089Spjd */ 5394219089Spjdstatic int 5395219089Spjdspa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5396219089Spjd{ 5397219089Spjd uint64_t txg; 5398219089Spjd int error = 0; 5399219089Spjd 5400219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5401219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5402219089Spjd ASSERT(vd == vd->vdev_top); 5403219089Spjd 5404219089Spjd /* 5405219089Spjd * Evacuate the device. We don't hold the config lock as writer 5406219089Spjd * since we need to do I/O but we do keep the 5407219089Spjd * spa_namespace_lock held. Once this completes the device 5408219089Spjd * should no longer have any blocks allocated on it. 5409219089Spjd */ 5410219089Spjd if (vd->vdev_islog) { 5411219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5412219089Spjd error = spa_offline_log(spa); 5413219089Spjd } else { 5414249195Smm error = SET_ERROR(ENOTSUP); 5415219089Spjd } 5416219089Spjd 5417219089Spjd if (error) 5418219089Spjd return (error); 5419219089Spjd 5420219089Spjd /* 5421219089Spjd * The evacuation succeeded. Remove any remaining MOS metadata 5422219089Spjd * associated with this vdev, and wait for these changes to sync. 5423219089Spjd */ 5424240415Smm ASSERT0(vd->vdev_stat.vs_alloc); 5425219089Spjd txg = spa_vdev_config_enter(spa); 5426219089Spjd vd->vdev_removing = B_TRUE; 5427258717Savg vdev_dirty_leaves(vd, VDD_DTL, txg); 5428219089Spjd vdev_config_dirty(vd); 5429219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5430219089Spjd 5431219089Spjd return (0); 5432219089Spjd} 5433219089Spjd 5434219089Spjd/* 5435219089Spjd * Complete the removal by cleaning up the namespace. 5436219089Spjd */ 5437219089Spjdstatic void 5438219089Spjdspa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5439219089Spjd{ 5440219089Spjd vdev_t *rvd = spa->spa_root_vdev; 5441219089Spjd uint64_t id = vd->vdev_id; 5442219089Spjd boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5443219089Spjd 5444219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5445219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5446219089Spjd ASSERT(vd == vd->vdev_top); 5447219089Spjd 5448219089Spjd /* 5449219089Spjd * Only remove any devices which are empty. 
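 * spa_vdev_remove_evacuate() has already verified that vs_alloc is zero;
 * the check below is a final guard before the vdev is freed and its slot
 * is either compacted away or replaced with a hole.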
5450219089Spjd */ 5451219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5452219089Spjd return; 5453219089Spjd 5454219089Spjd (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5455219089Spjd 5456219089Spjd if (list_link_active(&vd->vdev_state_dirty_node)) 5457219089Spjd vdev_state_clean(vd); 5458219089Spjd if (list_link_active(&vd->vdev_config_dirty_node)) 5459219089Spjd vdev_config_clean(vd); 5460219089Spjd 5461219089Spjd vdev_free(vd); 5462219089Spjd 5463219089Spjd if (last_vdev) { 5464219089Spjd vdev_compact_children(rvd); 5465219089Spjd } else { 5466219089Spjd vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5467219089Spjd vdev_add_child(rvd, vd); 5468219089Spjd } 5469219089Spjd vdev_config_dirty(rvd); 5470219089Spjd 5471219089Spjd /* 5472219089Spjd * Reassess the health of our root vdev. 5473219089Spjd */ 5474219089Spjd vdev_reopen(rvd); 5475219089Spjd} 5476219089Spjd 5477219089Spjd/* 5478219089Spjd * Remove a device from the pool - 5479219089Spjd * 5480219089Spjd * Removing a device from the vdev namespace requires several steps 5481219089Spjd * and can take a significant amount of time. As a result we use 5482219089Spjd * the spa_vdev_config_[enter/exit] functions which allow us to 5483219089Spjd * grab and release the spa_config_lock while still holding the namespace 5484219089Spjd * lock. During each step the configuration is synced out. 5485251631Sdelphij * 5486251631Sdelphij * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5487251631Sdelphij * devices. 5488219089Spjd */ 5489168404Spjdint 5490168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5491168404Spjd{ 5492168404Spjd vdev_t *vd; 5493219089Spjd metaslab_group_t *mg; 5494185029Spjd nvlist_t **spares, **l2cache, *nv; 5495219089Spjd uint64_t txg = 0; 5496185029Spjd uint_t nspares, nl2cache; 5497185029Spjd int error = 0; 5498209962Smm boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5499168404Spjd 5500219089Spjd ASSERT(spa_writeable(spa)); 5501219089Spjd 5502209962Smm if (!locked) 5503209962Smm txg = spa_vdev_enter(spa); 5504168404Spjd 5505185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5506168404Spjd 5507185029Spjd if (spa->spa_spares.sav_vdevs != NULL && 5508185029Spjd nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5509185029Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5510185029Spjd (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5511185029Spjd /* 5512185029Spjd * Only remove the hot spare if it's not currently in use 5513185029Spjd * in this pool. 5514185029Spjd */ 5515185029Spjd if (vd == NULL || unspare) { 5516185029Spjd spa_vdev_remove_aux(spa->spa_spares.sav_config, 5517185029Spjd ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5518185029Spjd spa_load_spares(spa); 5519185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5520185029Spjd } else { 5521249195Smm error = SET_ERROR(EBUSY); 5522168404Spjd } 5523185029Spjd } else if (spa->spa_l2cache.sav_vdevs != NULL && 5524185029Spjd nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5525185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5526185029Spjd (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5527185029Spjd /* 5528185029Spjd * Cache devices can always be removed. 
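 * Unlike a log or normal vdev there is nothing to evacuate: the L2ARC
 * holds only cached copies of blocks whose authoritative copies live in
 * the main pool, so dropping the device loses no data.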
5529185029Spjd */ 5530185029Spjd spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5531185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5532185029Spjd spa_load_l2cache(spa); 5533185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5534219089Spjd } else if (vd != NULL && vd->vdev_islog) { 5535219089Spjd ASSERT(!locked); 5536219089Spjd ASSERT(vd == vd->vdev_top); 5537219089Spjd 5538219089Spjd /* 5539219089Spjd * XXX - Once we have bp-rewrite this should 5540219089Spjd * become the common case. 5541219089Spjd */ 5542219089Spjd 5543219089Spjd mg = vd->vdev_mg; 5544219089Spjd 5545219089Spjd /* 5546219089Spjd * Stop allocating from this vdev. 5547219089Spjd */ 5548219089Spjd metaslab_group_passivate(mg); 5549219089Spjd 5550219089Spjd /* 5551219089Spjd * Wait for the youngest allocations and frees to sync, 5552219089Spjd * and then wait for the deferral of those frees to finish. 5553219089Spjd */ 5554219089Spjd spa_vdev_config_exit(spa, NULL, 5555219089Spjd txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5556219089Spjd 5557219089Spjd /* 5558219089Spjd * Attempt to evacuate the vdev. 5559219089Spjd */ 5560219089Spjd error = spa_vdev_remove_evacuate(spa, vd); 5561219089Spjd 5562219089Spjd txg = spa_vdev_config_enter(spa); 5563219089Spjd 5564219089Spjd /* 5565219089Spjd * If we couldn't evacuate the vdev, unwind. 5566219089Spjd */ 5567219089Spjd if (error) { 5568219089Spjd metaslab_group_activate(mg); 5569219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5570219089Spjd } 5571219089Spjd 5572219089Spjd /* 5573219089Spjd * Clean up the vdev namespace. 5574219089Spjd */ 5575219089Spjd spa_vdev_remove_from_namespace(spa, vd); 5576219089Spjd 5577185029Spjd } else if (vd != NULL) { 5578185029Spjd /* 5579185029Spjd * Normal vdevs cannot be removed (yet). 5580185029Spjd */ 5581249195Smm error = SET_ERROR(ENOTSUP); 5582168404Spjd } else { 5583185029Spjd /* 5584185029Spjd * There is no vdev of any kind with the specified guid. 5585185029Spjd */ 5586249195Smm error = SET_ERROR(ENOENT); 5587168404Spjd } 5588168404Spjd 5589209962Smm if (!locked) 5590209962Smm return (spa_vdev_exit(spa, NULL, txg, error)); 5591209962Smm 5592209962Smm return (error); 5593168404Spjd} 5594168404Spjd 5595168404Spjd/* 5596185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 5597251631Sdelphij * currently spared, so we can detach it. 5598168404Spjd */ 5599168404Spjdstatic vdev_t * 5600185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 5601168404Spjd{ 5602168404Spjd vdev_t *newvd, *oldvd; 5603168404Spjd 5604219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5605185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5606168404Spjd if (oldvd != NULL) 5607168404Spjd return (oldvd); 5608168404Spjd } 5609168404Spjd 5610185029Spjd /* 5611219089Spjd * Check for a completed replacement. We always consider the first 5612219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 5613219089Spjd * the newest (see spa_vdev_attach() for how that works). In 5614219089Spjd * the case where the newest vdev is faulted, we will not automatically 5615219089Spjd * remove it after a resilver completes. This is OK as it will require 5616219089Spjd * user intervention to determine which disk the admin wishes to keep. 
5617185029Spjd */ 5618219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 5619219089Spjd ASSERT(vd->vdev_children > 1); 5620219089Spjd 5621219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 5622168404Spjd oldvd = vd->vdev_child[0]; 5623168404Spjd 5624209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 5625219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5626209962Smm !vdev_dtl_required(oldvd)) 5627168404Spjd return (oldvd); 5628168404Spjd } 5629168404Spjd 5630185029Spjd /* 5631185029Spjd * Check for a completed resilver with the 'unspare' flag set. 5632185029Spjd */ 5633219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 5634219089Spjd vdev_t *first = vd->vdev_child[0]; 5635219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5636185029Spjd 5637219089Spjd if (last->vdev_unspare) { 5638219089Spjd oldvd = first; 5639219089Spjd newvd = last; 5640219089Spjd } else if (first->vdev_unspare) { 5641219089Spjd oldvd = last; 5642219089Spjd newvd = first; 5643219089Spjd } else { 5644219089Spjd oldvd = NULL; 5645219089Spjd } 5646219089Spjd 5647219089Spjd if (oldvd != NULL && 5648209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 5649219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5650219089Spjd !vdev_dtl_required(oldvd)) 5651185029Spjd return (oldvd); 5652219089Spjd 5653219089Spjd /* 5654219089Spjd * If there are more than two spares attached to a disk, 5655219089Spjd * and those spares are not required, then we want to 5656219089Spjd * attempt to free them up now so that they can be used 5657219089Spjd * by other pools. Once we're back down to a single 5658219089Spjd * disk+spare, we stop removing them. 5659219089Spjd */ 5660219089Spjd if (vd->vdev_children > 2) { 5661219089Spjd newvd = vd->vdev_child[1]; 5662219089Spjd 5663219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 5664219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 5665219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 5666219089Spjd !vdev_dtl_required(newvd)) 5667219089Spjd return (newvd); 5668185029Spjd } 5669185029Spjd } 5670185029Spjd 5671168404Spjd return (NULL); 5672168404Spjd} 5673168404Spjd 5674168404Spjdstatic void 5675185029Spjdspa_vdev_resilver_done(spa_t *spa) 5676168404Spjd{ 5677209962Smm vdev_t *vd, *pvd, *ppvd; 5678209962Smm uint64_t guid, sguid, pguid, ppguid; 5679168404Spjd 5680209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5681168404Spjd 5682185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5683209962Smm pvd = vd->vdev_parent; 5684209962Smm ppvd = pvd->vdev_parent; 5685168404Spjd guid = vd->vdev_guid; 5686209962Smm pguid = pvd->vdev_guid; 5687209962Smm ppguid = ppvd->vdev_guid; 5688209962Smm sguid = 0; 5689168404Spjd /* 5690168404Spjd * If we have just finished replacing a hot spared device, then 5691168404Spjd * we need to detach the parent's first child (the original hot 5692168404Spjd * spare) as well. 
5693168404Spjd */ 5694219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5695219089Spjd ppvd->vdev_children == 2) { 5696168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5697209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 5698168404Spjd } 5699254112Sdelphij ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5700254112Sdelphij 5701209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5702209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5703168404Spjd return; 5704209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5705168404Spjd return; 5706209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5707168404Spjd } 5708168404Spjd 5709209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5710168404Spjd} 5711168404Spjd 5712168404Spjd/* 5713219089Spjd * Update the stored path or FRU for this vdev. 5714168404Spjd */ 5715168404Spjdint 5716209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5717209962Smm boolean_t ispath) 5718168404Spjd{ 5719185029Spjd vdev_t *vd; 5720219089Spjd boolean_t sync = B_FALSE; 5721168404Spjd 5722219089Spjd ASSERT(spa_writeable(spa)); 5723168404Spjd 5724219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 5725219089Spjd 5726209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5727219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5728168404Spjd 5729168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 5730219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5731168404Spjd 5732209962Smm if (ispath) { 5733219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 5734219089Spjd spa_strfree(vd->vdev_path); 5735219089Spjd vd->vdev_path = spa_strdup(value); 5736219089Spjd sync = B_TRUE; 5737219089Spjd } 5738209962Smm } else { 5739219089Spjd if (vd->vdev_fru == NULL) { 5740219089Spjd vd->vdev_fru = spa_strdup(value); 5741219089Spjd sync = B_TRUE; 5742219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 5743209962Smm spa_strfree(vd->vdev_fru); 5744219089Spjd vd->vdev_fru = spa_strdup(value); 5745219089Spjd sync = B_TRUE; 5746219089Spjd } 5747209962Smm } 5748168404Spjd 5749219089Spjd return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 5750168404Spjd} 5751168404Spjd 5752209962Smmint 5753209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5754209962Smm{ 5755209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5756209962Smm} 5757209962Smm 5758209962Smmint 5759209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5760209962Smm{ 5761209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5762209962Smm} 5763209962Smm 5764168404Spjd/* 5765168404Spjd * ========================================================================== 5766219089Spjd * SPA Scanning 5767168404Spjd * ========================================================================== 5768168404Spjd */ 5769168404Spjd 5770168404Spjdint 5771219089Spjdspa_scan_stop(spa_t *spa) 5772168404Spjd{ 5773185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5774219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5775249195Smm return (SET_ERROR(EBUSY)); 5776219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 5777219089Spjd} 5778168404Spjd 5779219089Spjdint 5780219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 5781219089Spjd{ 5782219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5783219089Spjd 5784219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5785249195Smm return (SET_ERROR(ENOTSUP)); 5786168404Spjd 5787168404Spjd /* 5788185029Spjd * If a resilver was requested, but there is no DTL on a 5789185029Spjd * writeable leaf device, we have nothing to do. 5790168404Spjd */ 5791219089Spjd if (func == POOL_SCAN_RESILVER && 5792185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5793185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5794168404Spjd return (0); 5795168404Spjd } 5796168404Spjd 5797219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 5798168404Spjd} 5799168404Spjd 5800168404Spjd/* 5801168404Spjd * ========================================================================== 5802168404Spjd * SPA async task processing 5803168404Spjd * ========================================================================== 5804168404Spjd */ 5805168404Spjd 5806168404Spjdstatic void 5807185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 5808168404Spjd{ 5809185029Spjd if (vd->vdev_remove_wanted) { 5810219089Spjd vd->vdev_remove_wanted = B_FALSE; 5811219089Spjd vd->vdev_delayed_close = B_FALSE; 5812185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5813209962Smm 5814209962Smm /* 5815209962Smm * We want to clear the stats, but we don't want to do a full 5816209962Smm * vdev_clear() as that will cause us to throw away 5817209962Smm * degraded/faulted state as well as attempt to reopen the 5818209962Smm * device, all of which is a waste. 
5819209962Smm */ 5820209962Smm vd->vdev_stat.vs_read_errors = 0; 5821209962Smm vd->vdev_stat.vs_write_errors = 0; 5822209962Smm vd->vdev_stat.vs_checksum_errors = 0; 5823209962Smm 5824185029Spjd vdev_state_dirty(vd->vdev_top); 5825185029Spjd } 5826168404Spjd 5827185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5828185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 5829185029Spjd} 5830168404Spjd 5831185029Spjdstatic void 5832185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 5833185029Spjd{ 5834185029Spjd if (vd->vdev_probe_wanted) { 5835219089Spjd vd->vdev_probe_wanted = B_FALSE; 5836185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 5837168404Spjd } 5838168404Spjd 5839185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5840185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 5841168404Spjd} 5842168404Spjd 5843168404Spjdstatic void 5844219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 5845219089Spjd{ 5846219089Spjd sysevent_id_t eid; 5847219089Spjd nvlist_t *attr; 5848219089Spjd char *physpath; 5849219089Spjd 5850219089Spjd if (!spa->spa_autoexpand) 5851219089Spjd return; 5852219089Spjd 5853219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5854219089Spjd vdev_t *cvd = vd->vdev_child[c]; 5855219089Spjd spa_async_autoexpand(spa, cvd); 5856219089Spjd } 5857219089Spjd 5858219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5859219089Spjd return; 5860219089Spjd 5861219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5862219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5863219089Spjd 5864219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5865219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5866219089Spjd 5867219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5868219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5869219089Spjd 5870219089Spjd nvlist_free(attr); 5871219089Spjd kmem_free(physpath, MAXPATHLEN); 5872219089Spjd} 5873219089Spjd 5874219089Spjdstatic void 5875168404Spjdspa_async_thread(void *arg) 5876168404Spjd{ 5877168404Spjd spa_t *spa = arg; 5878168404Spjd int tasks; 5879168404Spjd 5880168404Spjd ASSERT(spa->spa_sync_on); 5881168404Spjd 5882168404Spjd mutex_enter(&spa->spa_async_lock); 5883168404Spjd tasks = spa->spa_async_tasks; 5884253990Smav spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 5885168404Spjd mutex_exit(&spa->spa_async_lock); 5886168404Spjd 5887168404Spjd /* 5888168404Spjd * See if the config needs to be updated. 5889168404Spjd */ 5890168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5891219089Spjd uint64_t old_space, new_space; 5892219089Spjd 5893168404Spjd mutex_enter(&spa_namespace_lock); 5894219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 5895168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5896219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 5897168404Spjd mutex_exit(&spa_namespace_lock); 5898219089Spjd 5899219089Spjd /* 5900219089Spjd * If the pool grew as a result of the config update, 5901219089Spjd * then log an internal history event. 
5902219089Spjd */ 5903219089Spjd if (new_space != old_space) { 5904248571Smm spa_history_log_internal(spa, "vdev online", NULL, 5905219089Spjd "pool '%s' size: %llu(+%llu)", 5906219089Spjd spa_name(spa), new_space, new_space - old_space); 5907219089Spjd } 5908168404Spjd } 5909168404Spjd 5910219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5911219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5912219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 5913219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 5914219089Spjd } 5915219089Spjd 5916168404Spjd /* 5917185029Spjd * See if any devices need to be probed. 5918168404Spjd */ 5919185029Spjd if (tasks & SPA_ASYNC_PROBE) { 5920219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 5921185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 5922185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 5923185029Spjd } 5924168404Spjd 5925168404Spjd /* 5926185029Spjd * If any devices are done replacing, detach them. 5927168404Spjd */ 5928185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 5929185029Spjd spa_vdev_resilver_done(spa); 5930168404Spjd 5931168404Spjd /* 5932168404Spjd * Kick off a resilver. 5933168404Spjd */ 5934168404Spjd if (tasks & SPA_ASYNC_RESILVER) 5935219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 5936168404Spjd 5937168404Spjd /* 5938168404Spjd * Let the world know that we're done. 5939168404Spjd */ 5940168404Spjd mutex_enter(&spa->spa_async_lock); 5941168404Spjd spa->spa_async_thread = NULL; 5942168404Spjd cv_broadcast(&spa->spa_async_cv); 5943168404Spjd mutex_exit(&spa->spa_async_lock); 5944168404Spjd thread_exit(); 5945168404Spjd} 5946168404Spjd 5947253990Smavstatic void 5948253990Smavspa_async_thread_vd(void *arg) 5949253990Smav{ 5950253990Smav spa_t *spa = arg; 5951253990Smav int tasks; 5952253990Smav 5953253990Smav ASSERT(spa->spa_sync_on); 5954253990Smav 5955253990Smav mutex_enter(&spa->spa_async_lock); 5956253990Smav tasks = spa->spa_async_tasks; 5957253990Smavretry: 5958253990Smav spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 5959253990Smav mutex_exit(&spa->spa_async_lock); 5960253990Smav 5961253990Smav /* 5962253990Smav * See if any devices need to be marked REMOVED. 5963253990Smav */ 5964253990Smav if (tasks & SPA_ASYNC_REMOVE) { 5965253990Smav spa_vdev_state_enter(spa, SCL_NONE); 5966253990Smav spa_async_remove(spa, spa->spa_root_vdev); 5967253990Smav for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5968253990Smav spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5969253990Smav for (int i = 0; i < spa->spa_spares.sav_count; i++) 5970253990Smav spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5971253990Smav (void) spa_vdev_state_exit(spa, NULL, 0); 5972253990Smav } 5973253990Smav 5974253990Smav /* 5975253990Smav * Let the world know that we're done. 
5976253990Smav */ 5977253990Smav mutex_enter(&spa->spa_async_lock); 5978253990Smav tasks = spa->spa_async_tasks; 5979253990Smav if ((tasks & SPA_ASYNC_REMOVE) != 0) 5980253990Smav goto retry; 5981253990Smav spa->spa_async_thread_vd = NULL; 5982253990Smav cv_broadcast(&spa->spa_async_cv); 5983253990Smav mutex_exit(&spa->spa_async_lock); 5984253990Smav thread_exit(); 5985253990Smav} 5986253990Smav 5987168404Spjdvoid 5988168404Spjdspa_async_suspend(spa_t *spa) 5989168404Spjd{ 5990168404Spjd mutex_enter(&spa->spa_async_lock); 5991168404Spjd spa->spa_async_suspended++; 5992253990Smav while (spa->spa_async_thread != NULL && 5993253990Smav spa->spa_async_thread_vd != NULL) 5994168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5995168404Spjd mutex_exit(&spa->spa_async_lock); 5996168404Spjd} 5997168404Spjd 5998168404Spjdvoid 5999168404Spjdspa_async_resume(spa_t *spa) 6000168404Spjd{ 6001168404Spjd mutex_enter(&spa->spa_async_lock); 6002168404Spjd ASSERT(spa->spa_async_suspended != 0); 6003168404Spjd spa->spa_async_suspended--; 6004168404Spjd mutex_exit(&spa->spa_async_lock); 6005168404Spjd} 6006168404Spjd 6007251636Sdelphijstatic boolean_t 6008251636Sdelphijspa_async_tasks_pending(spa_t *spa) 6009251636Sdelphij{ 6010251636Sdelphij uint_t non_config_tasks; 6011251636Sdelphij uint_t config_task; 6012251636Sdelphij boolean_t config_task_suspended; 6013251636Sdelphij 6014253990Smav non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 6015253990Smav SPA_ASYNC_REMOVE); 6016251636Sdelphij config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 6017251636Sdelphij if (spa->spa_ccw_fail_time == 0) { 6018251636Sdelphij config_task_suspended = B_FALSE; 6019251636Sdelphij } else { 6020251636Sdelphij config_task_suspended = 6021251636Sdelphij (gethrtime() - spa->spa_ccw_fail_time) < 6022251636Sdelphij (zfs_ccw_retry_interval * NANOSEC); 6023251636Sdelphij } 6024251636Sdelphij 6025251636Sdelphij return (non_config_tasks || (config_task && !config_task_suspended)); 6026251636Sdelphij} 6027251636Sdelphij 6028168404Spjdstatic void 6029168404Spjdspa_async_dispatch(spa_t *spa) 6030168404Spjd{ 6031168404Spjd mutex_enter(&spa->spa_async_lock); 6032251636Sdelphij if (spa_async_tasks_pending(spa) && 6033251636Sdelphij !spa->spa_async_suspended && 6034168404Spjd spa->spa_async_thread == NULL && 6035251636Sdelphij rootdir != NULL) 6036168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 6037168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 6038168404Spjd mutex_exit(&spa->spa_async_lock); 6039168404Spjd} 6040168404Spjd 6041253990Smavstatic void 6042253990Smavspa_async_dispatch_vd(spa_t *spa) 6043253990Smav{ 6044253990Smav mutex_enter(&spa->spa_async_lock); 6045253990Smav if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 6046253990Smav !spa->spa_async_suspended && 6047253990Smav spa->spa_async_thread_vd == NULL && 6048253990Smav rootdir != NULL) 6049253990Smav spa->spa_async_thread_vd = thread_create(NULL, 0, 6050253990Smav spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); 6051253990Smav mutex_exit(&spa->spa_async_lock); 6052253990Smav} 6053253990Smav 6054168404Spjdvoid 6055168404Spjdspa_async_request(spa_t *spa, int task) 6056168404Spjd{ 6057219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 6058168404Spjd mutex_enter(&spa->spa_async_lock); 6059168404Spjd spa->spa_async_tasks |= task; 6060168404Spjd mutex_exit(&spa->spa_async_lock); 6061253990Smav spa_async_dispatch_vd(spa); 6062168404Spjd} 6063168404Spjd 6064168404Spjd/* 6065168404Spjd 
* ========================================================================== 6066168404Spjd * SPA syncing routines 6067168404Spjd * ========================================================================== 6068168404Spjd */ 6069168404Spjd 6070219089Spjdstatic int 6071219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6072168404Spjd{ 6073219089Spjd bpobj_t *bpo = arg; 6074219089Spjd bpobj_enqueue(bpo, bp, tx); 6075219089Spjd return (0); 6076219089Spjd} 6077168404Spjd 6078219089Spjdstatic int 6079219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6080219089Spjd{ 6081219089Spjd zio_t *zio = arg; 6082168404Spjd 6083219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 6084240868Spjd BP_GET_PSIZE(bp), zio->io_flags)); 6085219089Spjd return (0); 6086168404Spjd} 6087168404Spjd 6088258632Savg/* 6089258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6090258632Savg * amount of time spent syncing frees. 6091258632Savg */ 6092168404Spjdstatic void 6093258632Savgspa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6094258632Savg{ 6095258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6096258632Savg bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6097258632Savg VERIFY(zio_wait(zio) == 0); 6098258632Savg} 6099258632Savg 6100258632Savg/* 6101258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6102258632Savg * amount of time spent syncing deferred frees. 6103258632Savg */ 6104258632Savgstatic void 6105258632Savgspa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6106258632Savg{ 6107258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6108258632Savg VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6109258632Savg spa_free_sync_cb, zio, tx), ==, 0); 6110258632Savg VERIFY0(zio_wait(zio)); 6111258632Savg} 6112258632Savg 6113258632Savg 6114258632Savgstatic void 6115168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6116168404Spjd{ 6117168404Spjd char *packed = NULL; 6118185029Spjd size_t bufsize; 6119168404Spjd size_t nvsize = 0; 6120168404Spjd dmu_buf_t *db; 6121168404Spjd 6122168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6123168404Spjd 6124185029Spjd /* 6125185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6126260150Sdelphij * information. This avoids the dmu_buf_will_dirty() path and 6127185029Spjd * saves us a pre-read to get data we don't actually care about. 
6128185029Spjd */ 6129236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6130185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 6131168404Spjd 6132168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6133168404Spjd KM_SLEEP) == 0); 6134185029Spjd bzero(packed + nvsize, bufsize - nvsize); 6135168404Spjd 6136185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6137168404Spjd 6138185029Spjd kmem_free(packed, bufsize); 6139168404Spjd 6140168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6141168404Spjd dmu_buf_will_dirty(db, tx); 6142168404Spjd *(uint64_t *)db->db_data = nvsize; 6143168404Spjd dmu_buf_rele(db, FTAG); 6144168404Spjd} 6145168404Spjd 6146168404Spjdstatic void 6147185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6148185029Spjd const char *config, const char *entry) 6149168404Spjd{ 6150168404Spjd nvlist_t *nvroot; 6151185029Spjd nvlist_t **list; 6152168404Spjd int i; 6153168404Spjd 6154185029Spjd if (!sav->sav_sync) 6155168404Spjd return; 6156168404Spjd 6157168404Spjd /* 6158185029Spjd * Update the MOS nvlist describing the list of available devices. 6159185029Spjd * spa_validate_aux() will have already made sure this nvlist is 6160185029Spjd * valid and the vdevs are labeled appropriately. 6161168404Spjd */ 6162185029Spjd if (sav->sav_object == 0) { 6163185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6164185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6165185029Spjd sizeof (uint64_t), tx); 6166168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 6167185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6168185029Spjd &sav->sav_object, tx) == 0); 6169168404Spjd } 6170168404Spjd 6171168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6172185029Spjd if (sav->sav_count == 0) { 6173185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6174168404Spjd } else { 6175185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6176185029Spjd for (i = 0; i < sav->sav_count; i++) 6177185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6178219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 6179185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6180185029Spjd sav->sav_count) == 0); 6181185029Spjd for (i = 0; i < sav->sav_count; i++) 6182185029Spjd nvlist_free(list[i]); 6183185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 6184168404Spjd } 6185168404Spjd 6186185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6187168404Spjd nvlist_free(nvroot); 6188168404Spjd 6189185029Spjd sav->sav_sync = B_FALSE; 6190168404Spjd} 6191168404Spjd 6192168404Spjdstatic void 6193168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6194168404Spjd{ 6195168404Spjd nvlist_t *config; 6196168404Spjd 6197185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) 6198168404Spjd return; 6199168404Spjd 6200185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6201168404Spjd 6202185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 6203185029Spjd dmu_tx_get_txg(tx), B_FALSE); 6204185029Spjd 6205243505Smm /* 6206243505Smm * If we're upgrading the spa version then make sure that 6207243505Smm * the config object gets updated with the correct version. 
6208243505Smm */ 6209243505Smm if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6210243505Smm fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6211243505Smm spa->spa_uberblock.ub_version); 6212243505Smm 6213185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6214185029Spjd 6215168404Spjd if (spa->spa_config_syncing) 6216168404Spjd nvlist_free(spa->spa_config_syncing); 6217168404Spjd spa->spa_config_syncing = config; 6218168404Spjd 6219168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6220168404Spjd} 6221168404Spjd 6222236884Smmstatic void 6223248571Smmspa_sync_version(void *arg, dmu_tx_t *tx) 6224236884Smm{ 6225248571Smm uint64_t *versionp = arg; 6226248571Smm uint64_t version = *versionp; 6227248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6228236884Smm 6229236884Smm /* 6230236884Smm * Setting the version is special cased when first creating the pool. 6231236884Smm */ 6232236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 6233236884Smm 6234247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6235236884Smm ASSERT(version >= spa_version(spa)); 6236236884Smm 6237236884Smm spa->spa_uberblock.ub_version = version; 6238236884Smm vdev_config_dirty(spa->spa_root_vdev); 6239248571Smm spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6240236884Smm} 6241236884Smm 6242185029Spjd/* 6243185029Spjd * Set zpool properties. 6244185029Spjd */ 6245168404Spjdstatic void 6246248571Smmspa_sync_props(void *arg, dmu_tx_t *tx) 6247168404Spjd{ 6248248571Smm nvlist_t *nvp = arg; 6249248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6250185029Spjd objset_t *mos = spa->spa_meta_objset; 6251236884Smm nvpair_t *elem = NULL; 6252168404Spjd 6253168404Spjd mutex_enter(&spa->spa_props_lock); 6254168404Spjd 6255185029Spjd while ((elem = nvlist_next_nvpair(nvp, elem))) { 6256236884Smm uint64_t intval; 6257236884Smm char *strval, *fname; 6258236884Smm zpool_prop_t prop; 6259236884Smm const char *propname; 6260236884Smm zprop_type_t proptype; 6261259813Sdelphij spa_feature_t fid; 6262236884Smm 6263185029Spjd switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6264236884Smm case ZPROP_INVAL: 6265236884Smm /* 6266236884Smm * We checked this earlier in spa_prop_validate(). 6267236884Smm */ 6268236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 6269236884Smm 6270236884Smm fname = strchr(nvpair_name(elem), '@') + 1; 6271259813Sdelphij VERIFY0(zfeature_lookup_name(fname, &fid)); 6272236884Smm 6273259813Sdelphij spa_feature_enable(spa, fid, tx); 6274248571Smm spa_history_log_internal(spa, "set", tx, 6275248571Smm "%s=enabled", nvpair_name(elem)); 6276236884Smm break; 6277236884Smm 6278185029Spjd case ZPOOL_PROP_VERSION: 6279258717Savg intval = fnvpair_value_uint64(elem); 6280185029Spjd /* 6281236884Smm * The version is synced seperatly before other 6282236884Smm * properties and should be correct by now. 6283185029Spjd */ 6284236884Smm ASSERT3U(spa_version(spa), >=, intval); 6285185029Spjd break; 6286168404Spjd 6287185029Spjd case ZPOOL_PROP_ALTROOT: 6288185029Spjd /* 6289185029Spjd * 'altroot' is a non-persistent property. It should 6290185029Spjd * have been set temporarily at creation or import time. 6291185029Spjd */ 6292185029Spjd ASSERT(spa->spa_root != NULL); 6293185029Spjd break; 6294168404Spjd 6295219089Spjd case ZPOOL_PROP_READONLY: 6296185029Spjd case ZPOOL_PROP_CACHEFILE: 6297185029Spjd /* 6298219089Spjd * 'readonly' and 'cachefile' are also non-persisitent 6299219089Spjd * properties. 
6300185029Spjd */ 6301168404Spjd break; 6302228103Smm case ZPOOL_PROP_COMMENT: 6303258717Savg strval = fnvpair_value_string(elem); 6304228103Smm if (spa->spa_comment != NULL) 6305228103Smm spa_strfree(spa->spa_comment); 6306228103Smm spa->spa_comment = spa_strdup(strval); 6307228103Smm /* 6308228103Smm * We need to dirty the configuration on all the vdevs 6309228103Smm * so that their labels get updated. It's unnecessary 6310228103Smm * to do this for pool creation since the vdev's 6311228103Smm * configuratoin has already been dirtied. 6312228103Smm */ 6313228103Smm if (tx->tx_txg != TXG_INITIAL) 6314228103Smm vdev_config_dirty(spa->spa_root_vdev); 6315248571Smm spa_history_log_internal(spa, "set", tx, 6316248571Smm "%s=%s", nvpair_name(elem), strval); 6317228103Smm break; 6318185029Spjd default: 6319185029Spjd /* 6320185029Spjd * Set pool property values in the poolprops mos object. 6321185029Spjd */ 6322185029Spjd if (spa->spa_pool_props_object == 0) { 6323236884Smm spa->spa_pool_props_object = 6324236884Smm zap_create_link(mos, DMU_OT_POOL_PROPS, 6325185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6326236884Smm tx); 6327185029Spjd } 6328185029Spjd 6329185029Spjd /* normalize the property name */ 6330185029Spjd propname = zpool_prop_to_name(prop); 6331185029Spjd proptype = zpool_prop_get_type(prop); 6332185029Spjd 6333185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 6334185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 6335258717Savg strval = fnvpair_value_string(elem); 6336258717Savg VERIFY0(zap_update(mos, 6337185029Spjd spa->spa_pool_props_object, propname, 6338258717Savg 1, strlen(strval) + 1, strval, tx)); 6339248571Smm spa_history_log_internal(spa, "set", tx, 6340248571Smm "%s=%s", nvpair_name(elem), strval); 6341185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6342258717Savg intval = fnvpair_value_uint64(elem); 6343185029Spjd 6344185029Spjd if (proptype == PROP_TYPE_INDEX) { 6345185029Spjd const char *unused; 6346258717Savg VERIFY0(zpool_prop_index_to_string( 6347258717Savg prop, intval, &unused)); 6348185029Spjd } 6349258717Savg VERIFY0(zap_update(mos, 6350185029Spjd spa->spa_pool_props_object, propname, 6351258717Savg 8, 1, &intval, tx)); 6352248571Smm spa_history_log_internal(spa, "set", tx, 6353248571Smm "%s=%lld", nvpair_name(elem), intval); 6354185029Spjd } else { 6355185029Spjd ASSERT(0); /* not allowed */ 6356185029Spjd } 6357185029Spjd 6358185029Spjd switch (prop) { 6359185029Spjd case ZPOOL_PROP_DELEGATION: 6360185029Spjd spa->spa_delegation = intval; 6361185029Spjd break; 6362185029Spjd case ZPOOL_PROP_BOOTFS: 6363185029Spjd spa->spa_bootfs = intval; 6364185029Spjd break; 6365185029Spjd case ZPOOL_PROP_FAILUREMODE: 6366185029Spjd spa->spa_failmode = intval; 6367185029Spjd break; 6368219089Spjd case ZPOOL_PROP_AUTOEXPAND: 6369219089Spjd spa->spa_autoexpand = intval; 6370219089Spjd if (tx->tx_txg != TXG_INITIAL) 6371219089Spjd spa_async_request(spa, 6372219089Spjd SPA_ASYNC_AUTOEXPAND); 6373219089Spjd break; 6374219089Spjd case ZPOOL_PROP_DEDUPDITTO: 6375219089Spjd spa->spa_dedup_ditto = intval; 6376219089Spjd break; 6377185029Spjd default: 6378185029Spjd break; 6379185029Spjd } 6380168404Spjd } 6381185029Spjd 6382168404Spjd } 6383185029Spjd 6384185029Spjd mutex_exit(&spa->spa_props_lock); 6385168404Spjd} 6386168404Spjd 6387168404Spjd/* 6388219089Spjd * Perform one-time upgrade on-disk changes. 
spa_version() does not 6389219089Spjd * reflect the new version this txg, so there must be no changes this 6390219089Spjd * txg to anything that the upgrade code depends on after it executes. 6391219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync 6392219089Spjd * tasks. 6393219089Spjd */ 6394219089Spjdstatic void 6395219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6396219089Spjd{ 6397219089Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6398219089Spjd 6399219089Spjd ASSERT(spa->spa_sync_pass == 1); 6400219089Spjd 6401248571Smm rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 6402248571Smm 6403219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6404219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6405219089Spjd dsl_pool_create_origin(dp, tx); 6406219089Spjd 6407219089Spjd /* Keeping the origin open increases spa_minref */ 6408219089Spjd spa->spa_minref += 3; 6409219089Spjd } 6410219089Spjd 6411219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6412219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6413219089Spjd dsl_pool_upgrade_clones(dp, tx); 6414219089Spjd } 6415219089Spjd 6416219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6417219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6418219089Spjd dsl_pool_upgrade_dir_clones(dp, tx); 6419219089Spjd 6420219089Spjd /* Keeping the freedir open increases spa_minref */ 6421219089Spjd spa->spa_minref += 3; 6422219089Spjd } 6423236884Smm 6424236884Smm if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6425236884Smm spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6426236884Smm spa_feature_create_zap_objects(spa, tx); 6427236884Smm } 6428248571Smm rrw_exit(&dp->dp_config_rwlock, FTAG); 6429219089Spjd} 6430219089Spjd 6431219089Spjd/* 6432168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 6433168404Spjd * part of the process, so we iterate until it converges. 6434168404Spjd */ 6435168404Spjdvoid 6436168404Spjdspa_sync(spa_t *spa, uint64_t txg) 6437168404Spjd{ 6438168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6439168404Spjd objset_t *mos = spa->spa_meta_objset; 6440219089Spjd bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6441168404Spjd vdev_t *rvd = spa->spa_root_vdev; 6442168404Spjd vdev_t *vd; 6443168404Spjd dmu_tx_t *tx; 6444185029Spjd int error; 6445168404Spjd 6446219089Spjd VERIFY(spa_writeable(spa)); 6447219089Spjd 6448168404Spjd /* 6449168404Spjd * Lock out configuration changes. 6450168404Spjd */ 6451185029Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6452168404Spjd 6453168404Spjd spa->spa_syncing_txg = txg; 6454168404Spjd spa->spa_sync_pass = 0; 6455168404Spjd 6456185029Spjd /* 6457185029Spjd * If there are any pending vdev state changes, convert them 6458185029Spjd * into config changes that go out with this transaction group. 6459185029Spjd */ 6460185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6461209962Smm while (list_head(&spa->spa_state_dirty_list) != NULL) { 6462209962Smm /* 6463209962Smm * We need the write lock here because, for aux vdevs, 6464209962Smm * calling vdev_config_dirty() modifies sav_config. 6465209962Smm * This is ugly and will become unnecessary when we 6466209962Smm * eliminate the aux vdev wart by integrating all vdevs 6467209962Smm * into the root vdev tree. 
6468209962Smm */ 6469209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6470209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6471209962Smm while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6472209962Smm vdev_state_clean(vd); 6473209962Smm vdev_config_dirty(vd); 6474209962Smm } 6475209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6476209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6477185029Spjd } 6478185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6479185029Spjd 6480168404Spjd tx = dmu_tx_create_assigned(dp, txg); 6481168404Spjd 6482247265Smm spa->spa_sync_starttime = gethrtime(); 6483247265Smm#ifdef illumos 6484247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6485247265Smm spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6486247265Smm#else /* FreeBSD */ 6487247265Smm#ifdef _KERNEL 6488247265Smm callout_reset(&spa->spa_deadman_cycid, 6489247265Smm hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa); 6490247265Smm#endif 6491247265Smm#endif 6492247265Smm 6493168404Spjd /* 6494185029Spjd * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6495168404Spjd * set spa_deflate if we have no raid-z vdevs. 6496168404Spjd */ 6497185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6498185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6499168404Spjd int i; 6500168404Spjd 6501168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 6502168404Spjd vd = rvd->vdev_child[i]; 6503168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6504168404Spjd break; 6505168404Spjd } 6506168404Spjd if (i == rvd->vdev_children) { 6507168404Spjd spa->spa_deflate = TRUE; 6508168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 6509168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6510168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6511168404Spjd } 6512168404Spjd } 6513168404Spjd 6514168404Spjd /* 6515219089Spjd * If anything has changed in this txg, or if someone is waiting 6516219089Spjd * for this txg to sync (eg, spa_vdev_remove()), push the 6517219089Spjd * deferred frees from the previous txg. If not, leave them 6518219089Spjd * alone so that we don't generate work on an otherwise idle 6519219089Spjd * system. 6520168404Spjd */ 6521168404Spjd if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 6522168404Spjd !txg_list_empty(&dp->dp_dirty_dirs, txg) || 6523219089Spjd !txg_list_empty(&dp->dp_sync_tasks, txg) || 6524219089Spjd ((dsl_scan_active(dp->dp_scan) || 6525219089Spjd txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { 6526258632Savg spa_sync_deferred_frees(spa, tx); 6527219089Spjd } 6528168404Spjd 6529168404Spjd /* 6530168404Spjd * Iterate to convergence. 
6531168404Spjd */ 6532168404Spjd do { 6533219089Spjd int pass = ++spa->spa_sync_pass; 6534168404Spjd 6535168404Spjd spa_sync_config_object(spa, tx); 6536185029Spjd spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6537185029Spjd ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6538185029Spjd spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6539185029Spjd ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6540168404Spjd spa_errlog_sync(spa, txg); 6541168404Spjd dsl_pool_sync(dp, txg); 6542168404Spjd 6543243503Smm if (pass < zfs_sync_pass_deferred_free) { 6544258632Savg spa_sync_frees(spa, free_bpl, tx); 6545219089Spjd } else { 6546219089Spjd bplist_iterate(free_bpl, bpobj_enqueue_cb, 6547258632Savg &spa->spa_deferred_bpobj, tx); 6548168404Spjd } 6549168404Spjd 6550219089Spjd ddt_sync(spa, txg); 6551219089Spjd dsl_scan_sync(dp, tx); 6552168404Spjd 6553219089Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6554219089Spjd vdev_sync(vd, txg); 6555168404Spjd 6556219089Spjd if (pass == 1) 6557219089Spjd spa_sync_upgrades(spa, tx); 6558168404Spjd 6559219089Spjd } while (dmu_objset_is_dirty(mos, txg)); 6560219089Spjd 6561168404Spjd /* 6562168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 6563168404Spjd * to commit the transaction group. 6564168404Spjd * 6565185029Spjd * If there are no dirty vdevs, we sync the uberblock to a few 6566185029Spjd * random top-level vdevs that are known to be visible in the 6567185029Spjd * config cache (see spa_vdev_add() for a complete description). 6568185029Spjd * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6569168404Spjd */ 6570185029Spjd for (;;) { 6571185029Spjd /* 6572185029Spjd * We hold SCL_STATE to prevent vdev open/close/etc. 6573185029Spjd * while we're attempting to write the vdev labels. 
6574185029Spjd */ 6575185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6576168404Spjd 6577185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) { 6578185029Spjd vdev_t *svd[SPA_DVAS_PER_BP]; 6579185029Spjd int svdcount = 0; 6580185029Spjd int children = rvd->vdev_children; 6581185029Spjd int c0 = spa_get_random(children); 6582185029Spjd 6583219089Spjd for (int c = 0; c < children; c++) { 6584185029Spjd vd = rvd->vdev_child[(c0 + c) % children]; 6585185029Spjd if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6586185029Spjd continue; 6587185029Spjd svd[svdcount++] = vd; 6588185029Spjd if (svdcount == SPA_DVAS_PER_BP) 6589185029Spjd break; 6590185029Spjd } 6591213198Smm error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 6592213198Smm if (error != 0) 6593213198Smm error = vdev_config_sync(svd, svdcount, txg, 6594213198Smm B_TRUE); 6595185029Spjd } else { 6596185029Spjd error = vdev_config_sync(rvd->vdev_child, 6597213198Smm rvd->vdev_children, txg, B_FALSE); 6598213198Smm if (error != 0) 6599213198Smm error = vdev_config_sync(rvd->vdev_child, 6600213198Smm rvd->vdev_children, txg, B_TRUE); 6601168404Spjd } 6602185029Spjd 6603239620Smm if (error == 0) 6604239620Smm spa->spa_last_synced_guid = rvd->vdev_guid; 6605239620Smm 6606185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6607185029Spjd 6608185029Spjd if (error == 0) 6609185029Spjd break; 6610185029Spjd zio_suspend(spa, NULL); 6611185029Spjd zio_resume_wait(spa); 6612168404Spjd } 6613168404Spjd dmu_tx_commit(tx); 6614168404Spjd 6615247265Smm#ifdef illumos 6616247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6617247265Smm#else /* FreeBSD */ 6618247265Smm#ifdef _KERNEL 6619247265Smm callout_drain(&spa->spa_deadman_cycid); 6620247265Smm#endif 6621247265Smm#endif 6622247265Smm 6623168404Spjd /* 6624168404Spjd * Clear the dirty config list. 6625168404Spjd */ 6626185029Spjd while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6627168404Spjd vdev_config_clean(vd); 6628168404Spjd 6629168404Spjd /* 6630168404Spjd * Now that the new config has synced transactionally, 6631168404Spjd * let it become visible to the config cache. 6632168404Spjd */ 6633168404Spjd if (spa->spa_config_syncing != NULL) { 6634168404Spjd spa_config_set(spa, spa->spa_config_syncing); 6635168404Spjd spa->spa_config_txg = txg; 6636168404Spjd spa->spa_config_syncing = NULL; 6637168404Spjd } 6638168404Spjd 6639168404Spjd spa->spa_ubsync = spa->spa_uberblock; 6640168404Spjd 6641219089Spjd dsl_pool_sync_done(dp, txg); 6642168404Spjd 6643168404Spjd /* 6644168404Spjd * Update usable space statistics. 6645168404Spjd */ 6646168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6647168404Spjd vdev_sync_done(vd, txg); 6648168404Spjd 6649219089Spjd spa_update_dspace(spa); 6650219089Spjd 6651168404Spjd /* 6652168404Spjd * It had better be the case that we didn't dirty anything 6653168404Spjd * since vdev_config_sync(). 6654168404Spjd */ 6655168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6656168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6657168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6658168404Spjd 6659219089Spjd spa->spa_sync_pass = 0; 6660219089Spjd 6661185029Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 6662168404Spjd 6663219089Spjd spa_handle_ignored_writes(spa); 6664219089Spjd 6665168404Spjd /* 6666168404Spjd * If any async tasks have been requested, kick them off. 
6667168404Spjd */ 6668168404Spjd spa_async_dispatch(spa); 6669253990Smav spa_async_dispatch_vd(spa); 6670168404Spjd} 6671168404Spjd 6672168404Spjd/* 6673168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 6674168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 6675168404Spjd * sync. 6676168404Spjd */ 6677168404Spjdvoid 6678168404Spjdspa_sync_allpools(void) 6679168404Spjd{ 6680168404Spjd spa_t *spa = NULL; 6681168404Spjd mutex_enter(&spa_namespace_lock); 6682168404Spjd while ((spa = spa_next(spa)) != NULL) { 6683219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 6684219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 6685168404Spjd continue; 6686168404Spjd spa_open_ref(spa, FTAG); 6687168404Spjd mutex_exit(&spa_namespace_lock); 6688168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6689168404Spjd mutex_enter(&spa_namespace_lock); 6690168404Spjd spa_close(spa, FTAG); 6691168404Spjd } 6692168404Spjd mutex_exit(&spa_namespace_lock); 6693168404Spjd} 6694168404Spjd 6695168404Spjd/* 6696168404Spjd * ========================================================================== 6697168404Spjd * Miscellaneous routines 6698168404Spjd * ========================================================================== 6699168404Spjd */ 6700168404Spjd 6701168404Spjd/* 6702168404Spjd * Remove all pools in the system. 6703168404Spjd */ 6704168404Spjdvoid 6705168404Spjdspa_evict_all(void) 6706168404Spjd{ 6707168404Spjd spa_t *spa; 6708168404Spjd 6709168404Spjd /* 6710168404Spjd * Remove all cached state. All pools should be closed now, 6711168404Spjd * so every spa in the AVL tree should be unreferenced. 6712168404Spjd */ 6713168404Spjd mutex_enter(&spa_namespace_lock); 6714168404Spjd while ((spa = spa_next(NULL)) != NULL) { 6715168404Spjd /* 6716168404Spjd * Stop async tasks. The async thread may need to detach 6717168404Spjd * a device that's been replaced, which requires grabbing 6718168404Spjd * spa_namespace_lock, so we must drop it here. 
6719168404Spjd */ 6720168404Spjd spa_open_ref(spa, FTAG); 6721168404Spjd mutex_exit(&spa_namespace_lock); 6722168404Spjd spa_async_suspend(spa); 6723168404Spjd mutex_enter(&spa_namespace_lock); 6724168404Spjd spa_close(spa, FTAG); 6725168404Spjd 6726168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6727168404Spjd spa_unload(spa); 6728168404Spjd spa_deactivate(spa); 6729168404Spjd } 6730168404Spjd spa_remove(spa); 6731168404Spjd } 6732168404Spjd mutex_exit(&spa_namespace_lock); 6733168404Spjd} 6734168404Spjd 6735168404Spjdvdev_t * 6736209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6737168404Spjd{ 6738185029Spjd vdev_t *vd; 6739185029Spjd int i; 6740185029Spjd 6741185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6742185029Spjd return (vd); 6743185029Spjd 6744209962Smm if (aux) { 6745185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6746185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 6747185029Spjd if (vd->vdev_guid == guid) 6748185029Spjd return (vd); 6749185029Spjd } 6750209962Smm 6751209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 6752209962Smm vd = spa->spa_spares.sav_vdevs[i]; 6753209962Smm if (vd->vdev_guid == guid) 6754209962Smm return (vd); 6755209962Smm } 6756185029Spjd } 6757185029Spjd 6758185029Spjd return (NULL); 6759168404Spjd} 6760168404Spjd 6761168404Spjdvoid 6762185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 6763168404Spjd{ 6764219089Spjd ASSERT(spa_writeable(spa)); 6765219089Spjd 6766185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6767168404Spjd 6768168404Spjd /* 6769168404Spjd * This should only be called for a non-faulted pool, and since a 6770168404Spjd * future version would result in an unopenable pool, this shouldn't be 6771168404Spjd * possible. 6772168404Spjd */ 6773247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 6774185029Spjd ASSERT(version >= spa->spa_uberblock.ub_version); 6775168404Spjd 6776185029Spjd spa->spa_uberblock.ub_version = version; 6777168404Spjd vdev_config_dirty(spa->spa_root_vdev); 6778168404Spjd 6779185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 6780168404Spjd 6781168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6782168404Spjd} 6783168404Spjd 6784168404Spjdboolean_t 6785168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 6786168404Spjd{ 6787168404Spjd int i; 6788168404Spjd uint64_t spareguid; 6789185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6790168404Spjd 6791185029Spjd for (i = 0; i < sav->sav_count; i++) 6792185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 6793168404Spjd return (B_TRUE); 6794168404Spjd 6795185029Spjd for (i = 0; i < sav->sav_npending; i++) { 6796185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6797185029Spjd &spareguid) == 0 && spareguid == guid) 6798168404Spjd return (B_TRUE); 6799168404Spjd } 6800168404Spjd 6801168404Spjd return (B_FALSE); 6802168404Spjd} 6803168404Spjd 6804185029Spjd/* 6805185029Spjd * Check if a pool has an active shared spare device. 
6806185029Spjd * Note: reference count of an active spare is 2, as a spare and as a replace 6807185029Spjd */ 6808185029Spjdstatic boolean_t 6809185029Spjdspa_has_active_shared_spare(spa_t *spa) 6810168404Spjd{ 6811185029Spjd int i, refcnt; 6812185029Spjd uint64_t pool; 6813185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6814185029Spjd 6815185029Spjd for (i = 0; i < sav->sav_count; i++) { 6816185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6817185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6818185029Spjd refcnt > 2) 6819185029Spjd return (B_TRUE); 6820185029Spjd } 6821185029Spjd 6822185029Spjd return (B_FALSE); 6823168404Spjd} 6824168404Spjd 6825185029Spjd/* 6826185029Spjd * Post a sysevent corresponding to the given event. The 'name' must be one of 6827185029Spjd * the event definitions in sys/sysevent/eventdefs.h. The payload will be 6828185029Spjd * filled in from the spa and (optionally) the vdev. This doesn't do anything 6829185029Spjd * in the userland libzpool, as we don't want consumers to misinterpret ztest 6830185029Spjd * or zdb as real changes. 6831185029Spjd */ 6832185029Spjdvoid 6833185029Spjdspa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 6834168404Spjd{ 6835185029Spjd#ifdef _KERNEL 6836185029Spjd sysevent_t *ev; 6837185029Spjd sysevent_attr_list_t *attr = NULL; 6838185029Spjd sysevent_value_t value; 6839185029Spjd sysevent_id_t eid; 6840168404Spjd 6841185029Spjd ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 6842185029Spjd SE_SLEEP); 6843168404Spjd 6844185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6845185029Spjd value.value.sv_string = spa_name(spa); 6846185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 6847185029Spjd goto done; 6848168404Spjd 6849185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6850185029Spjd value.value.sv_uint64 = spa_guid(spa); 6851185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 6852185029Spjd goto done; 6853168404Spjd 6854185029Spjd if (vd) { 6855185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6856185029Spjd value.value.sv_uint64 = vd->vdev_guid; 6857185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 6858185029Spjd SE_SLEEP) != 0) 6859185029Spjd goto done; 6860168404Spjd 6861185029Spjd if (vd->vdev_path) { 6862185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6863185029Spjd value.value.sv_string = vd->vdev_path; 6864185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6865185029Spjd &value, SE_SLEEP) != 0) 6866185029Spjd goto done; 6867168404Spjd } 6868168404Spjd } 6869168404Spjd 6870185029Spjd if (sysevent_attach_attributes(ev, attr) != 0) 6871185029Spjd goto done; 6872185029Spjd attr = NULL; 6873168404Spjd 6874185029Spjd (void) log_sysevent(ev, SE_SLEEP, &eid); 6875185029Spjd 6876185029Spjddone: 6877185029Spjd if (attr) 6878185029Spjd sysevent_free_attr(attr); 6879185029Spjd sysevent_free(ev); 6880185029Spjd#endif 6881168404Spjd} 6882