spa.c revision 272598
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
    "Check hostid on import?");
TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
    &zfs_ccw_retry_interval, 0,
    "Configuration cache file write, retry after failure, interval (seconds)");

typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
};

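/*
 * Editorial illustration (not part of the original file): with the macros
 * above, the READ row's ZTI_P(12, 8) initializer expands to the
 * zio_taskq_info_t shown below -- eight discrete "read_intr" taskqs of
 * twelve threads each, one of which is picked at random per dispatch by
 * spa_taskq_dispatch_ent().
 */
#if 0
static const zio_taskq_info_t zti_read_intr_example =
	{ ZTI_MODE_FIXED, 12, 8 };	/* 12 threads x 8 taskqs */
#endif
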
static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

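/*
 * Editorial note (not in the original): the helper above nests a two-entry
 * nvlist under the property name, so adding e.g. SIZE yields roughly
 * { "size" : { "source" : <src>, "value" : <size> } } -- the
 * (ZPROP_SOURCE, ZPROP_VALUE) pair that property consumers unpack.
 */
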
/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in
		 * SPA_VERSION_DEADLISTS; when opening pools created before
		 * this version, freedir will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    pool->dp_free_dir->dd_phys->dd_used_bytes, src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

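/*
 * Editorial illustration (not in the original file): a minimal caller
 * sketch.  On success spa_prop_get() has allocated *nvp, so the caller
 * owns and must free it; consume_props() is a hypothetical stand-in.
 */
#if 0
nvlist_t *nvp;

if (spa_prop_get(spa, &nvp) == 0) {
	consume_props(nvp);		/* hypothetical consumer */
	nvlist_free(nvp);
}
#endif
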
/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/* Must be ZPL and not gzip compressed. */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check. For this kernel check, we merely
				 * check ASCII apart from DEL. Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

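/*
 * Editorial illustration (not in the original file): enabling a feature
 * through spa_prop_set().  spa_prop_validate() above accepts a feature
 * property only as a uint64 of value 0 whose name after the '@' names a
 * known feature; "feature@async_destroy" is used here as an example.
 */
#if 0
nvlist_t *props;

VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(props, "feature@async_destroy", 0) == 0);
error = spa_prop_set(spa, props);
nvlist_free(props);
#endif
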
/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

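/*
 * Editorial note (not in the original): in the dsl_sync_task() framework
 * used above, the check callback runs in syncing context immediately
 * before the sync callback; if spa_change_guid_check() fails (e.g. ENXIO
 * for an unhealthy root vdev), spa_change_guid_sync() never runs and the
 * error is returned from spa_change_guid().
 */
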
/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

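/*
 * Editorial note (not in the original): the old trees are handed back by
 * value and the spa's own lists are re-created empty, so after this call
 * the caller owns -- and must eventually drain and avl_destroy() -- the
 * entries returned in 'last' and 'scrub'.
 */
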
static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

#ifdef SYSDC
		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
#endif
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly lower priority
			 * than the other taskqs.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
				pri--;

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
#ifdef SYSDC
		}
#endif

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT0(tqs->stqs_count);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
#ifdef _KERNEL
		tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
#else
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

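/*
 * Editorial illustration (not in the original file): a typical dispatch,
 * modeled on zio_taskq_dispatch() in zio.c.  The taskq_ent_t is embedded
 * in the object being dispatched (here a zio's io_tqent), so the hot path
 * allocates nothing.
 */
#if 0
spa_taskq_dispatch_ent(spa, ZIO_TYPE_WRITE, ZIO_TASKQ_ISSUE,
    (task_func_t *)zio_execute, zio, TQ_SLEEP, &zio->io_tqent);
#endif
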
static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

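/*
 * Editorial sketch (not in the original; device paths are invented): the
 * nvlist shape spa_config_parse() walks looks roughly like
 *
 *	vdev_tree: { type: "root", children: [
 *	    { type: "mirror", children: [
 *		{ type: "disk", path: "/dev/da0" },
 *		{ type: "disk", path: "/dev/da1" } ] } ] }
 *
 * Leaf vdevs return right after vdev_alloc(); interior vdevs recurse once
 * per entry in their ZPOOL_CONFIG_CHILDREN array.
 */
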
/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

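/*
 * Editorial note (not in the original): as of this revision
 * spa_async_zio_root is an array of max_ncpus root zios rather than a
 * single zio, which is why spa_unload() above waits on each element
 * before freeing the array.
 */
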
/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
1514185029Spjd */ 1515185029Spjd newvdevs[i] = vd; 1516185029Spjd oldvdevs[j] = NULL; 1517185029Spjd break; 1518185029Spjd } 1519185029Spjd } 1520185029Spjd 1521185029Spjd if (newvdevs[i] == NULL) { 1522185029Spjd /* 1523185029Spjd * Create new vdev 1524185029Spjd */ 1525185029Spjd VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1526185029Spjd VDEV_ALLOC_L2CACHE) == 0); 1527185029Spjd ASSERT(vd != NULL); 1528185029Spjd newvdevs[i] = vd; 1529185029Spjd 1530185029Spjd /* 1531185029Spjd * Commit this vdev as an l2cache device, 1532185029Spjd * even if it fails to open. 1533185029Spjd */ 1534185029Spjd spa_l2cache_add(vd); 1535185029Spjd 1536185029Spjd vd->vdev_top = vd; 1537185029Spjd vd->vdev_aux = sav; 1538185029Spjd 1539185029Spjd spa_l2cache_activate(vd); 1540185029Spjd 1541185029Spjd if (vdev_open(vd) != 0) 1542185029Spjd continue; 1543185029Spjd 1544185029Spjd (void) vdev_validate_aux(vd); 1545185029Spjd 1546219089Spjd if (!vdev_is_dead(vd)) 1547219089Spjd l2arc_add_vdev(spa, vd); 1548185029Spjd } 1549185029Spjd } 1550185029Spjd 1551185029Spjd /* 1552185029Spjd * Purge vdevs that were dropped 1553185029Spjd */ 1554185029Spjd for (i = 0; i < oldnvdevs; i++) { 1555185029Spjd uint64_t pool; 1556185029Spjd 1557185029Spjd vd = oldvdevs[i]; 1558185029Spjd if (vd != NULL) { 1559230514Smm ASSERT(vd->vdev_isl2cache); 1560230514Smm 1561209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1562209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1563185029Spjd l2arc_remove_vdev(vd); 1564230514Smm vdev_clear_stats(vd); 1565230514Smm vdev_free(vd); 1566185029Spjd } 1567185029Spjd } 1568185029Spjd 1569185029Spjd if (oldvdevs) 1570185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1571185029Spjd 1572185029Spjd if (sav->sav_config == NULL) 1573185029Spjd goto out; 1574185029Spjd 1575185029Spjd sav->sav_vdevs = newvdevs; 1576185029Spjd sav->sav_count = (int)nl2cache; 1577185029Spjd 1578185029Spjd /* 1579185029Spjd * Recompute the stashed list of l2cache devices, with status 1580185029Spjd * information this time. 
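 *
 * The same remove/regenerate/re-add nvlist pattern is used for the
 * spares list earlier in this file; in outline (the names here are
 * placeholders for the sav-specific values):
 *
 *	(void) nvlist_remove(cfg, key, DATA_TYPE_NVLIST_ARRAY);
 *	for (i = 0; i < count; i++)
 *		list[i] = vdev_config_generate(spa, vdevs[i],
 *		    B_TRUE, flag);
 *	(void) nvlist_add_nvlist_array(cfg, key, list, count);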
1581185029Spjd */ 1582185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1583185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1584185029Spjd 1585185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1586185029Spjd for (i = 0; i < sav->sav_count; i++) 1587185029Spjd l2cache[i] = vdev_config_generate(spa, 1588219089Spjd sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1589185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1590185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1591185029Spjdout: 1592185029Spjd for (i = 0; i < sav->sav_count; i++) 1593185029Spjd nvlist_free(l2cache[i]); 1594185029Spjd if (sav->sav_count) 1595185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1596185029Spjd} 1597185029Spjd 1598168404Spjdstatic int 1599168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1600168404Spjd{ 1601168404Spjd dmu_buf_t *db; 1602168404Spjd char *packed = NULL; 1603168404Spjd size_t nvsize = 0; 1604168404Spjd int error; 1605168404Spjd *value = NULL; 1606168404Spjd 1607262676Sdelphij error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 1608262676Sdelphij if (error != 0) 1609262676Sdelphij return (error); 1610168404Spjd nvsize = *(uint64_t *)db->db_data; 1611168404Spjd dmu_buf_rele(db, FTAG); 1612168404Spjd 1613168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1614209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1615209962Smm DMU_READ_PREFETCH); 1616168404Spjd if (error == 0) 1617168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1618168404Spjd kmem_free(packed, nvsize); 1619168404Spjd 1620168404Spjd return (error); 1621168404Spjd} 1622168404Spjd 1623168404Spjd/* 1624185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1625185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 1626185029Spjd */ 1627185029Spjdstatic void 1628185029Spjdspa_check_removed(vdev_t *vd) 1629185029Spjd{ 1630219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1631185029Spjd spa_check_removed(vd->vdev_child[c]); 1632185029Spjd 1633249188Smm if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 1634249188Smm !vd->vdev_ishole) { 1635185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1636185029Spjd spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1637185029Spjd } 1638185029Spjd} 1639185029Spjd 1640185029Spjd/* 1641219089Spjd * Validate the current config against the MOS config 1642213197Smm */ 1643219089Spjdstatic boolean_t 1644219089Spjdspa_config_valid(spa_t *spa, nvlist_t *config) 1645213197Smm{ 1646219089Spjd vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1647219089Spjd nvlist_t *nv; 1648213197Smm 1649219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1650213197Smm 1651219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1652219089Spjd VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1653219089Spjd 1654219089Spjd ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1655219089Spjd 1656219089Spjd /* 1657219089Spjd * If we're doing a normal import, then build up any additional 1658219089Spjd * diagnostic information about missing devices in this config. 1659219089Spjd * We'll pass this up to the user for further processing. 
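 *
 * A consumer could read the result back along these lines
 * (illustrative only; 'missing' is a placeholder):
 *
 *	nvlist_t *missing;
 *	if (nvlist_lookup_nvlist(spa->spa_load_info,
 *	    ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0)
 *		... walk the ZPOOL_CONFIG_CHILDREN array in 'missing';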
1660219089Spjd */ 1661219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1662219089Spjd nvlist_t **child, *nv; 1663219089Spjd uint64_t idx = 0; 1664219089Spjd 1665219089Spjd child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1666219089Spjd KM_SLEEP); 1667219089Spjd VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1668219089Spjd 1669219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1670219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1671219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1672219089Spjd 1673219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1674219089Spjd mtvd->vdev_ops != &vdev_missing_ops && 1675219089Spjd mtvd->vdev_islog) 1676219089Spjd child[idx++] = vdev_config_generate(spa, mtvd, 1677219089Spjd B_FALSE, 0); 1678219089Spjd } 1679219089Spjd 1680219089Spjd if (idx) { 1681219089Spjd VERIFY(nvlist_add_nvlist_array(nv, 1682219089Spjd ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1683219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1684219089Spjd ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1685219089Spjd 1686219089Spjd for (int i = 0; i < idx; i++) 1687219089Spjd nvlist_free(child[i]); 1688219089Spjd } 1689219089Spjd nvlist_free(nv); 1690219089Spjd kmem_free(child, rvd->vdev_children * sizeof (char **)); 1691219089Spjd } 1692219089Spjd 1693219089Spjd /* 1694219089Spjd * Compare the root vdev tree with the information we have 1695219089Spjd * from the MOS config (mrvd). Check each top-level vdev 1696219089Spjd * with the corresponding MOS config top-level (mtvd). 1697219089Spjd */ 1698219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1699213197Smm vdev_t *tvd = rvd->vdev_child[c]; 1700219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1701213197Smm 1702219089Spjd /* 1703219089Spjd * Resolve any "missing" vdevs in the current configuration. 1704219089Spjd * If we find that the MOS config has more accurate information 1705219089Spjd * about the top-level vdev then use that vdev instead. 1706219089Spjd */ 1707219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1708219089Spjd mtvd->vdev_ops != &vdev_missing_ops) { 1709219089Spjd 1710219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1711219089Spjd continue; 1712219089Spjd 1713219089Spjd /* 1714219089Spjd * Device specific actions. 1715219089Spjd */ 1716219089Spjd if (mtvd->vdev_islog) { 1717219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 1718219089Spjd } else { 1719219089Spjd /* 1720219089Spjd * XXX - once we have 'readonly' pool 1721219089Spjd * support we should be able to handle 1722219089Spjd * missing data devices by transitioning 1723219089Spjd * the pool to readonly. 1724219089Spjd */ 1725219089Spjd continue; 1726219089Spjd } 1727219089Spjd 1728219089Spjd /* 1729219089Spjd * Swap the missing vdev with the data we were 1730219089Spjd * able to obtain from the MOS config. 1731219089Spjd */ 1732219089Spjd vdev_remove_child(rvd, tvd); 1733219089Spjd vdev_remove_child(mrvd, mtvd); 1734219089Spjd 1735219089Spjd vdev_add_child(rvd, mtvd); 1736219089Spjd vdev_add_child(mrvd, tvd); 1737219089Spjd 1738219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1739219089Spjd vdev_load(mtvd); 1740219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1741219089Spjd 1742219089Spjd vdev_reopen(rvd); 1743219089Spjd } else if (mtvd->vdev_islog) { 1744219089Spjd /* 1745219089Spjd * Load the slog device's state from the MOS config 1746219089Spjd * since it's possible that the label does not 1747219089Spjd * contain the most up-to-date information. 
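 *
 * (The MOS copy can be newer than the label, e.g. when the log
 * device missed the most recent config sync; vdev_load_log_state()
 * plus the vdev_reopen() below bring the in-core state up to date.)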
1748219089Spjd */ 1749219089Spjd vdev_load_log_state(tvd, mtvd); 1750219089Spjd vdev_reopen(tvd); 1751219089Spjd } 1752213197Smm } 1753219089Spjd vdev_free(mrvd); 1754219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1755219089Spjd 1756219089Spjd /* 1757219089Spjd * Ensure we were able to validate the config. 1758219089Spjd */ 1759219089Spjd return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1760213197Smm} 1761213197Smm 1762213197Smm/* 1763185029Spjd * Check for missing log devices 1764185029Spjd */ 1765248571Smmstatic boolean_t 1766185029Spjdspa_check_logs(spa_t *spa) 1767185029Spjd{ 1768248571Smm boolean_t rv = B_FALSE; 1769248571Smm 1770185029Spjd switch (spa->spa_log_state) { 1771185029Spjd case SPA_LOG_MISSING: 1772185029Spjd /* need to recheck in case slog has been restored */ 1773185029Spjd case SPA_LOG_UNKNOWN: 1774248571Smm rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain, 1775248571Smm NULL, DS_FIND_CHILDREN) != 0); 1776248571Smm if (rv) 1777219089Spjd spa_set_log_state(spa, SPA_LOG_MISSING); 1778185029Spjd break; 1779185029Spjd } 1780248571Smm return (rv); 1781185029Spjd} 1782185029Spjd 1783219089Spjdstatic boolean_t 1784219089Spjdspa_passivate_log(spa_t *spa) 1785219089Spjd{ 1786219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1787219089Spjd boolean_t slog_found = B_FALSE; 1788219089Spjd 1789219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1790219089Spjd 1791219089Spjd if (!spa_has_slogs(spa)) 1792219089Spjd return (B_FALSE); 1793219089Spjd 1794219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1795219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1796219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1797219089Spjd 1798219089Spjd if (tvd->vdev_islog) { 1799219089Spjd metaslab_group_passivate(mg); 1800219089Spjd slog_found = B_TRUE; 1801219089Spjd } 1802219089Spjd } 1803219089Spjd 1804219089Spjd return (slog_found); 1805219089Spjd} 1806219089Spjd 1807219089Spjdstatic void 1808219089Spjdspa_activate_log(spa_t *spa) 1809219089Spjd{ 1810219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1811219089Spjd 1812219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1813219089Spjd 1814219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1815219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1816219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1817219089Spjd 1818219089Spjd if (tvd->vdev_islog) 1819219089Spjd metaslab_group_activate(mg); 1820219089Spjd } 1821219089Spjd} 1822219089Spjd 1823219089Spjdint 1824219089Spjdspa_offline_log(spa_t *spa) 1825219089Spjd{ 1826248571Smm int error; 1827219089Spjd 1828248571Smm error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1829248571Smm NULL, DS_FIND_CHILDREN); 1830248571Smm if (error == 0) { 1831219089Spjd /* 1832219089Spjd * We successfully offlined the log device, sync out the 1833219089Spjd * current txg so that the "stubby" block can be removed 1834219089Spjd * by zil_sync(). 
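 *
 * (A txg argument of 0 to txg_wait_synced() below means "wait for
 * the currently open txg to sync", which is what pushes the
 * zil_sync() cleanup out to disk.)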
1835219089Spjd */ 1836219089Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1837219089Spjd } 1838219089Spjd return (error); 1839219089Spjd} 1840219089Spjd 1841219089Spjdstatic void 1842219089Spjdspa_aux_check_removed(spa_aux_vdev_t *sav) 1843219089Spjd{ 1844219089Spjd int i; 1845219089Spjd 1846219089Spjd for (i = 0; i < sav->sav_count; i++) 1847219089Spjd spa_check_removed(sav->sav_vdevs[i]); 1848219089Spjd} 1849219089Spjd 1850219089Spjdvoid 1851219089Spjdspa_claim_notify(zio_t *zio) 1852219089Spjd{ 1853219089Spjd spa_t *spa = zio->io_spa; 1854219089Spjd 1855219089Spjd if (zio->io_error) 1856219089Spjd return; 1857219089Spjd 1858219089Spjd mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1859219089Spjd if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1860219089Spjd spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1861219089Spjd mutex_exit(&spa->spa_props_lock); 1862219089Spjd} 1863219089Spjd 1864219089Spjdtypedef struct spa_load_error { 1865219089Spjd uint64_t sle_meta_count; 1866219089Spjd uint64_t sle_data_count; 1867219089Spjd} spa_load_error_t; 1868219089Spjd 1869219089Spjdstatic void 1870219089Spjdspa_load_verify_done(zio_t *zio) 1871219089Spjd{ 1872219089Spjd blkptr_t *bp = zio->io_bp; 1873219089Spjd spa_load_error_t *sle = zio->io_private; 1874219089Spjd dmu_object_type_t type = BP_GET_TYPE(bp); 1875219089Spjd int error = zio->io_error; 1876268720Sdelphij spa_t *spa = zio->io_spa; 1877219089Spjd 1878219089Spjd if (error) { 1879236884Smm if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 1880219089Spjd type != DMU_OT_INTENT_LOG) 1881270247Sdelphij atomic_inc_64(&sle->sle_meta_count); 1882219089Spjd else 1883270247Sdelphij atomic_inc_64(&sle->sle_data_count); 1884219089Spjd } 1885219089Spjd zio_data_buf_free(zio->io_data, zio->io_size); 1886268720Sdelphij 1887268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 1888268720Sdelphij spa->spa_scrub_inflight--; 1889268720Sdelphij cv_broadcast(&spa->spa_scrub_io_cv); 1890268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 1891219089Spjd} 1892219089Spjd 1893268720Sdelphij/* 1894268720Sdelphij * Maximum number of concurrent scrub i/os to create while verifying 1895268720Sdelphij * a pool while importing it. 
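 *
 * On FreeBSD these knobs are exported as vfs.zfs.* sysctls by the
 * SYSCTL_INT declarations below; e.g., to halve the default limit:
 *
 *	sysctl vfs.zfs.spa_load_verify_maxinflight=5000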
1896268720Sdelphij */ 1897268720Sdelphijint spa_load_verify_maxinflight = 10000; 1898268720Sdelphijboolean_t spa_load_verify_metadata = B_TRUE; 1899268720Sdelphijboolean_t spa_load_verify_data = B_TRUE; 1900268720Sdelphij 1901268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, 1902268720Sdelphij &spa_load_verify_maxinflight, 0, 1903268720Sdelphij "Maximum number of concurrent scrub I/Os to create while verifying a " 1904268720Sdelphij "pool while importing it"); 1905268720Sdelphij 1906268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, 1907268720Sdelphij &spa_load_verify_metadata, 0, 1908268720Sdelphij "Check metadata on import?"); 1909268720Sdelphij 1910268720SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, 1911268720Sdelphij &spa_load_verify_data, 0, 1912268720Sdelphij "Check user data on import?"); 1913268720Sdelphij 1914219089Spjd/*ARGSUSED*/ 1915219089Spjdstatic int 1916219089Spjdspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1917268123Sdelphij const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 1918219089Spjd{ 1919268720Sdelphij if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 1920268720Sdelphij return (0); 1921268720Sdelphij /* 1922268720Sdelphij * Note: normally this routine will not be called if 1923268720Sdelphij * spa_load_verify_metadata is not set. However, it may be useful 1924268720Sdelphij * to manually set the flag after the traversal has begun. 1925268720Sdelphij */ 1926268720Sdelphij if (!spa_load_verify_metadata) 1927268720Sdelphij return (0); 1928268720Sdelphij if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) 1929268720Sdelphij return (0); 1930219089Spjd 1931268720Sdelphij zio_t *rio = arg; 1932268720Sdelphij size_t size = BP_GET_PSIZE(bp); 1933268720Sdelphij void *data = zio_data_buf_alloc(size); 1934268720Sdelphij 1935268720Sdelphij mutex_enter(&spa->spa_scrub_lock); 1936268720Sdelphij while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) 1937268720Sdelphij cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1938268720Sdelphij spa->spa_scrub_inflight++; 1939268720Sdelphij mutex_exit(&spa->spa_scrub_lock); 1940268720Sdelphij 1941268720Sdelphij zio_nowait(zio_read(rio, spa, bp, data, size, 1942268720Sdelphij spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1943268720Sdelphij ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1944268720Sdelphij ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1945219089Spjd return (0); 1946219089Spjd} 1947219089Spjd 1948219089Spjdstatic int 1949219089Spjdspa_load_verify(spa_t *spa) 1950219089Spjd{ 1951219089Spjd zio_t *rio; 1952219089Spjd spa_load_error_t sle = { 0 }; 1953219089Spjd zpool_rewind_policy_t policy; 1954219089Spjd boolean_t verify_ok = B_FALSE; 1955268720Sdelphij int error = 0; 1956219089Spjd 1957219089Spjd zpool_get_rewind_policy(spa->spa_config, &policy); 1958219089Spjd 1959219089Spjd if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1960219089Spjd return (0); 1961219089Spjd 1962219089Spjd rio = zio_root(spa, NULL, &sle, 1963219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1964219089Spjd 1965268720Sdelphij if (spa_load_verify_metadata) { 1966268720Sdelphij error = traverse_pool(spa, spa->spa_verify_min_txg, 1967268720Sdelphij TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, 1968268720Sdelphij spa_load_verify_cb, rio); 1969268720Sdelphij } 1970219089Spjd 1971219089Spjd (void) zio_wait(rio); 1972219089Spjd 1973219089Spjd spa->spa_load_meta_errors = sle.sle_meta_count; 1974219089Spjd 
spa->spa_load_data_errors = sle.sle_data_count; 1975219089Spjd 1976219089Spjd if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1977219089Spjd sle.sle_data_count <= policy.zrp_maxdata) { 1978219089Spjd int64_t loss = 0; 1979219089Spjd 1980219089Spjd verify_ok = B_TRUE; 1981219089Spjd spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1982219089Spjd spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1983219089Spjd 1984219089Spjd loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1985219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1986219089Spjd ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1987219089Spjd VERIFY(nvlist_add_int64(spa->spa_load_info, 1988219089Spjd ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1989219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1990219089Spjd ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1991219089Spjd } else { 1992219089Spjd spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1993219089Spjd } 1994219089Spjd 1995219089Spjd if (error) { 1996219089Spjd if (error != ENXIO && error != EIO) 1997249195Smm error = SET_ERROR(EIO); 1998219089Spjd return (error); 1999219089Spjd } 2000219089Spjd 2001219089Spjd return (verify_ok ? 0 : EIO); 2002219089Spjd} 2003219089Spjd 2004185029Spjd/* 2005219089Spjd * Find a value in the pool props object. 2006168404Spjd */ 2007219089Spjdstatic void 2008219089Spjdspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2009219089Spjd{ 2010219089Spjd (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2011219089Spjd zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2012219089Spjd} 2013219089Spjd 2014219089Spjd/* 2015219089Spjd * Find a value in the pool directory object. 2016219089Spjd */ 2017168404Spjdstatic int 2018219089Spjdspa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 2019168404Spjd{ 2020219089Spjd return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2021219089Spjd name, sizeof (uint64_t), 1, val)); 2022219089Spjd} 2023168404Spjd 2024219089Spjdstatic int 2025219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2026219089Spjd{ 2027219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2028219089Spjd return (err); 2029219089Spjd} 2030219089Spjd 2031219089Spjd/* 2032219089Spjd * Fix up config after a partly-completed split. This is done with the 2033219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2034219089Spjd * pool have that entry in their config, but only the splitting one contains 2035219089Spjd * a list of all the guids of the vdevs that are being split off. 2036219089Spjd * 2037219089Spjd * This function determines what to do with that list: either rejoin 2038219089Spjd * all the disks to the pool, or complete the splitting process. To attempt 2039219089Spjd * the rejoin, each disk that is offlined is marked online again, and 2040219089Spjd * we do a reopen() call. If the vdev label for every disk that was 2041219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2042219089Spjd * then we call vdev_split() on each disk, and complete the split. 2043219089Spjd * 2044219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 2045219089Spjd * the original pool. 
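 *
 * In outline (a condensed restatement of the code below):
 *
 *	if every vdev named in ZPOOL_CONFIG_SPLIT_LIST can be found
 *	by guid, mark them all online and reopen the root vdev;
 *	if (some listed vdev was not found ||
 *	    all of them now report VDEV_AUX_SPLIT_POOL)
 *		vdev_split() each one and reopen the root vdev;
 *	else
 *		leave the configuration untouched;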
2046219089Spjd */ 2047219089Spjdstatic void 2048219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 2049219089Spjd{ 2050219089Spjd uint_t extracted; 2051219089Spjd uint64_t *glist; 2052219089Spjd uint_t i, gcount; 2053219089Spjd nvlist_t *nvl; 2054219089Spjd vdev_t **vd; 2055219089Spjd boolean_t attempt_reopen; 2056219089Spjd 2057219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2058219089Spjd return; 2059219089Spjd 2060219089Spjd /* check that the config is complete */ 2061219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2062219089Spjd &glist, &gcount) != 0) 2063219089Spjd return; 2064219089Spjd 2065219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2066219089Spjd 2067219089Spjd /* attempt to online all the vdevs & validate */ 2068219089Spjd attempt_reopen = B_TRUE; 2069219089Spjd for (i = 0; i < gcount; i++) { 2070219089Spjd if (glist[i] == 0) /* vdev is hole */ 2071219089Spjd continue; 2072219089Spjd 2073219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2074219089Spjd if (vd[i] == NULL) { 2075219089Spjd /* 2076219089Spjd * Don't bother attempting to reopen the disks; 2077219089Spjd * just do the split. 2078219089Spjd */ 2079219089Spjd attempt_reopen = B_FALSE; 2080219089Spjd } else { 2081219089Spjd /* attempt to re-online it */ 2082219089Spjd vd[i]->vdev_offline = B_FALSE; 2083219089Spjd } 2084219089Spjd } 2085219089Spjd 2086219089Spjd if (attempt_reopen) { 2087219089Spjd vdev_reopen(spa->spa_root_vdev); 2088219089Spjd 2089219089Spjd /* check each device to see what state it's in */ 2090219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 2091219089Spjd if (vd[i] != NULL && 2092219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2093219089Spjd break; 2094219089Spjd ++extracted; 2095219089Spjd } 2096219089Spjd } 2097219089Spjd 2098209962Smm /* 2099219089Spjd * If every disk has been moved to the new pool, or if we never 2100219089Spjd * even attempted to look at them, then we split them off for 2101219089Spjd * good. 2102209962Smm */ 2103219089Spjd if (!attempt_reopen || gcount == extracted) { 2104219089Spjd for (i = 0; i < gcount; i++) 2105219089Spjd if (vd[i] != NULL) 2106219089Spjd vdev_split(vd[i]); 2107219089Spjd vdev_reopen(spa->spa_root_vdev); 2108219089Spjd } 2109209962Smm 2110219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 2111219089Spjd} 2112185029Spjd 2113219089Spjdstatic int 2114219089Spjdspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2115219089Spjd boolean_t mosconfig) 2116219089Spjd{ 2117219089Spjd nvlist_t *config = spa->spa_config; 2118219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 2119228103Smm char *comment; 2120219089Spjd int error; 2121219089Spjd uint64_t pool_guid; 2122219089Spjd nvlist_t *nvl; 2123168404Spjd 2124219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2125249195Smm return (SET_ERROR(EINVAL)); 2126168404Spjd 2127228103Smm ASSERT(spa->spa_comment == NULL); 2128228103Smm if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2129228103Smm spa->spa_comment = spa_strdup(comment); 2130228103Smm 2131168404Spjd /* 2132168404Spjd * Versioning wasn't explicitly added to the label until later, so if 2133168404Spjd * it's not present treat it as the initial version. 
2134168404Spjd */ 2135219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2136219089Spjd &spa->spa_ubsync.ub_version) != 0) 2137219089Spjd spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2138168404Spjd 2139168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2140168404Spjd &spa->spa_config_txg); 2141168404Spjd 2142168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2143168404Spjd spa_guid_exists(pool_guid, 0)) { 2144249195Smm error = SET_ERROR(EEXIST); 2145219089Spjd } else { 2146228103Smm spa->spa_config_guid = pool_guid; 2147219089Spjd 2148219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2149219089Spjd &nvl) == 0) { 2150219089Spjd VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2151219089Spjd KM_SLEEP) == 0); 2152219089Spjd } 2153219089Spjd 2154236884Smm nvlist_free(spa->spa_load_info); 2155236884Smm spa->spa_load_info = fnvlist_alloc(); 2156236884Smm 2157219089Spjd gethrestime(&spa->spa_loaded_ts); 2158219089Spjd error = spa_load_impl(spa, pool_guid, config, state, type, 2159219089Spjd mosconfig, &ereport); 2160168404Spjd } 2161168404Spjd 2162219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 2163219089Spjd if (error) { 2164219089Spjd if (error != EEXIST) { 2165219089Spjd spa->spa_loaded_ts.tv_sec = 0; 2166219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 2167219089Spjd } 2168219089Spjd if (error != EBADF) { 2169219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2170219089Spjd } 2171219089Spjd } 2172219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2173219089Spjd spa->spa_ena = 0; 2174168404Spjd 2175219089Spjd return (error); 2176219089Spjd} 2177219089Spjd 2178219089Spjd/* 2179219089Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 2180219089Spjd * source of configuration information. 2181219089Spjd */ 2182219089Spjdstatic int 2183219089Spjdspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2184219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2185219089Spjd char **ereport) 2186219089Spjd{ 2187219089Spjd int error = 0; 2188219089Spjd nvlist_t *nvroot = NULL; 2189236884Smm nvlist_t *label; 2190219089Spjd vdev_t *rvd; 2191219089Spjd uberblock_t *ub = &spa->spa_uberblock; 2192219089Spjd uint64_t children, config_cache_txg = spa->spa_config_txg; 2193219089Spjd int orig_mode = spa->spa_mode; 2194219089Spjd int parse; 2195219089Spjd uint64_t obj; 2196236884Smm boolean_t missing_feat_write = B_FALSE; 2197219089Spjd 2198168404Spjd /* 2199219089Spjd * If this is an untrusted config, access the pool in read-only mode. 2200219089Spjd * This prevents things like resilvering recently removed devices. 2201219089Spjd */ 2202219089Spjd if (!mosconfig) 2203219089Spjd spa->spa_mode = FREAD; 2204219089Spjd 2205219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2206219089Spjd 2207219089Spjd spa->spa_load_state = state; 2208219089Spjd 2209219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2210249195Smm return (SET_ERROR(EINVAL)); 2211219089Spjd 2212219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 
2213219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2214219089Spjd 2215219089Spjd /* 2216209962Smm * Create "The Godfather" zio to hold all async IOs 2217209962Smm */ 2218272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2219272598Sdelphij KM_SLEEP); 2220272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 2221272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2222272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2223272598Sdelphij ZIO_FLAG_GODFATHER); 2224272598Sdelphij } 2225209962Smm 2226209962Smm /* 2227168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2228168404Spjd * value that will be returned by spa_version() since parsing the 2229168404Spjd * configuration requires knowing the version number. 2230168404Spjd */ 2231185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2232219089Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2233185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2234168404Spjd 2235168404Spjd if (error != 0) 2236219089Spjd return (error); 2237168404Spjd 2238168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2239168404Spjd 2240219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2241219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2242219089Spjd } 2243219089Spjd 2244168404Spjd /* 2245168404Spjd * Try to open all vdevs, loading each label in the process. 2246168404Spjd */ 2247185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2248168926Spjd error = vdev_open(rvd); 2249185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2250168926Spjd if (error != 0) 2251219089Spjd return (error); 2252168404Spjd 2253168404Spjd /* 2254209962Smm * We need to validate the vdev labels against the configuration that 2255209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 2256209962Smm * mosconfig is true then we're validating the vdev labels based on 2257219089Spjd * that config. Otherwise, we're validating against the cached config 2258209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 2259209962Smm * later we will recursively call spa_load() and validate against 2260209962Smm * the vdev config. 2261219089Spjd * 2262219089Spjd * If we're assembling a new pool that's been split off from an 2263219089Spjd * existing pool, the labels haven't yet been updated so we skip 2264219089Spjd * validation for now. 2265168404Spjd */ 2266219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2267219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2268230514Smm error = vdev_validate(rvd, mosconfig); 2269219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2270168404Spjd 2271219089Spjd if (error != 0) 2272219089Spjd return (error); 2273219089Spjd 2274219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2275249195Smm return (SET_ERROR(ENXIO)); 2276168404Spjd } 2277168404Spjd 2278168404Spjd /* 2279168404Spjd * Find the best uberblock. 2280168404Spjd */ 2281236884Smm vdev_uberblock_load(rvd, ub, &label); 2282168404Spjd 2283168404Spjd /* 2284168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2285168404Spjd */ 2286236884Smm if (ub->ub_txg == 0) { 2287236884Smm nvlist_free(label); 2288219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2289236884Smm } 2290168404Spjd 2291168404Spjd /* 2292236884Smm * If the pool has an unsupported version we can't open it. 
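 *
 * (SPA_VERSION_IS_SUPPORTED() accepts the legacy on-disk versions
 * up to this implementation's maximum as well as the feature-flags
 * version; anything else fails with VDEV_AUX_VERSION_NEWER.)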
2293168404Spjd */ 2294236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2295236884Smm nvlist_free(label); 2296219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2297236884Smm } 2298168404Spjd 2299236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2300236884Smm nvlist_t *features; 2301236884Smm 2302236884Smm /* 2303236884Smm * If we weren't able to find what's necessary for reading the 2304236884Smm * MOS in the label, return failure. 2305236884Smm */ 2306236884Smm if (label == NULL || nvlist_lookup_nvlist(label, 2307236884Smm ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2308236884Smm nvlist_free(label); 2309236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2310236884Smm ENXIO)); 2311236884Smm } 2312236884Smm 2313236884Smm /* 2314236884Smm * Update our in-core representation with the definitive values 2315236884Smm * from the label. 2316236884Smm */ 2317236884Smm nvlist_free(spa->spa_label_features); 2318236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2319236884Smm } 2320236884Smm 2321236884Smm nvlist_free(label); 2322236884Smm 2323168404Spjd /* 2324236884Smm * Look through entries in the label nvlist's features_for_read. If 2325236884Smm * there is a feature listed there which we don't understand then we 2326236884Smm * cannot open a pool. 2327236884Smm */ 2328236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2329236884Smm nvlist_t *unsup_feat; 2330236884Smm 2331236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2332236884Smm 0); 2333236884Smm 2334236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2335236884Smm NULL); nvp != NULL; 2336236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2337236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2338236884Smm VERIFY(nvlist_add_string(unsup_feat, 2339236884Smm nvpair_name(nvp), "") == 0); 2340236884Smm } 2341236884Smm } 2342236884Smm 2343236884Smm if (!nvlist_empty(unsup_feat)) { 2344236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2345236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2346236884Smm nvlist_free(unsup_feat); 2347236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2348236884Smm ENOTSUP)); 2349236884Smm } 2350236884Smm 2351236884Smm nvlist_free(unsup_feat); 2352236884Smm } 2353236884Smm 2354236884Smm /* 2355168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 2356219089Spjd * incomplete configuration. We first check to see if the pool 2357219089Spjd * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2358219089Spjd * If it is, defer the vdev_guid_sum check till later so we 2359219089Spjd * can handle missing vdevs. 2360168404Spjd */ 2361219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2362219089Spjd &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2363219089Spjd rvd->vdev_guid_sum != ub->ub_guid_sum) 2364219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2365219089Spjd 2366219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2367219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2368219089Spjd spa_try_repair(spa, config); 2369219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2370219089Spjd nvlist_free(spa->spa_config_splitting); 2371219089Spjd spa->spa_config_splitting = NULL; 2372168404Spjd } 2373168404Spjd 2374168404Spjd /* 2375168404Spjd * Initialize internal SPA structures. 
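 *
 * (Among other things this chooses spa_verify_min_txg, the txg from
 * which spa_load_verify() will later start traversing: TXG_INITIAL - 1
 * under extreme rewind, otherwise TXG_DEFER_SIZE + 1 txgs behind the
 * last synced txg.)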
2376168404Spjd */ 2377168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 2378168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2379219089Spjd spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2380219089Spjd TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2381219089Spjd spa->spa_first_txg = spa->spa_last_ubsync_txg ? 2382219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2383219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2384219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2385219089Spjd 2386236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2387219089Spjd if (error) 2388219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2389168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2390168404Spjd 2391219089Spjd if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2392219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2393168404Spjd 2394236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2395236884Smm boolean_t missing_feat_read = B_FALSE; 2396238926Smm nvlist_t *unsup_feat, *enabled_feat; 2397236884Smm 2398236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2399236884Smm &spa->spa_feat_for_read_obj) != 0) { 2400236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2401236884Smm } 2402236884Smm 2403236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2404236884Smm &spa->spa_feat_for_write_obj) != 0) { 2405236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2406236884Smm } 2407236884Smm 2408236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2409236884Smm &spa->spa_feat_desc_obj) != 0) { 2410236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2411236884Smm } 2412236884Smm 2413238926Smm enabled_feat = fnvlist_alloc(); 2414238926Smm unsup_feat = fnvlist_alloc(); 2415236884Smm 2416259813Sdelphij if (!spa_features_check(spa, B_FALSE, 2417238926Smm unsup_feat, enabled_feat)) 2418236884Smm missing_feat_read = B_TRUE; 2419236884Smm 2420236884Smm if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2421259813Sdelphij if (!spa_features_check(spa, B_TRUE, 2422238926Smm unsup_feat, enabled_feat)) { 2423236884Smm missing_feat_write = B_TRUE; 2424238926Smm } 2425236884Smm } 2426236884Smm 2427238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2428238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2429238926Smm 2430236884Smm if (!nvlist_empty(unsup_feat)) { 2431238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2432238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2433236884Smm } 2434236884Smm 2435238926Smm fnvlist_free(enabled_feat); 2436238926Smm fnvlist_free(unsup_feat); 2437236884Smm 2438236884Smm if (!missing_feat_read) { 2439236884Smm fnvlist_add_boolean(spa->spa_load_info, 2440236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2441236884Smm } 2442236884Smm 2443236884Smm /* 2444236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2445236884Smm * twofold: to determine whether the pool is available for 2446236884Smm * import in read-write mode and (if it is not) whether the 2447236884Smm * pool is available for import in read-only mode. If the pool 2448236884Smm * is available for import in read-write mode, it is displayed 2449236884Smm * as available in userland; if it is not available for import 2450236884Smm * in read-only mode, it is displayed as unavailable in 2451236884Smm * userland. 
If the pool is available for import in read-only 2452236884Smm * mode but not read-write mode, it is displayed as unavailable 2453236884Smm * in userland with a special note that the pool is actually 2454236884Smm * available for open in read-only mode. 2455236884Smm * 2456236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2457236884Smm * missing a feature for write, we must first determine whether 2458236884Smm * the pool can be opened read-only before returning to 2459236884Smm * userland in order to know whether to display the 2460236884Smm * abovementioned note. 2461236884Smm */ 2462236884Smm if (missing_feat_read || (missing_feat_write && 2463236884Smm spa_writeable(spa))) { 2464236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2465236884Smm ENOTSUP)); 2466236884Smm } 2467260150Sdelphij 2468260150Sdelphij /* 2469260150Sdelphij * Load refcounts for ZFS features from disk into an in-memory 2470260150Sdelphij * cache during SPA initialization. 2471260150Sdelphij */ 2472260150Sdelphij for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2473260150Sdelphij uint64_t refcount; 2474260150Sdelphij 2475260150Sdelphij error = feature_get_refcount_from_disk(spa, 2476260150Sdelphij &spa_feature_table[i], &refcount); 2477260150Sdelphij if (error == 0) { 2478260150Sdelphij spa->spa_feat_refcount_cache[i] = refcount; 2479260150Sdelphij } else if (error == ENOTSUP) { 2480260150Sdelphij spa->spa_feat_refcount_cache[i] = 2481260150Sdelphij SPA_FEATURE_DISABLED; 2482260150Sdelphij } else { 2483260150Sdelphij return (spa_vdev_err(rvd, 2484260150Sdelphij VDEV_AUX_CORRUPT_DATA, EIO)); 2485260150Sdelphij } 2486260150Sdelphij } 2487236884Smm } 2488236884Smm 2489260150Sdelphij if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2490260150Sdelphij if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2491268075Sdelphij &spa->spa_feat_enabled_txg_obj) != 0) 2492260150Sdelphij return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2493260150Sdelphij } 2494260150Sdelphij 2495236884Smm spa->spa_is_initializing = B_TRUE; 2496236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2497236884Smm spa->spa_is_initializing = B_FALSE; 2498236884Smm if (error != 0) 2499236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2500236884Smm 2501168404Spjd if (!mosconfig) { 2502168498Spjd uint64_t hostid; 2503219089Spjd nvlist_t *policy = NULL, *nvconfig; 2504168404Spjd 2505219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2506219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2507168404Spjd 2508219089Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2509185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2510168498Spjd char *hostname; 2511168498Spjd unsigned long myhostid = 0; 2512168498Spjd 2513219089Spjd VERIFY(nvlist_lookup_string(nvconfig, 2514168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2515168498Spjd 2516219089Spjd#ifdef _KERNEL 2517219089Spjd myhostid = zone_get_hostid(NULL); 2518219089Spjd#else /* _KERNEL */ 2519219089Spjd /* 2520219089Spjd * We're emulating the system's hostid in userland, so 2521219089Spjd * we can't use zone_get_hostid(). 
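 *
 * (Either way, the mismatch test below only fires when the
 * check_hostid tunable is set and both the on-disk and local
 * hostids are nonzero.)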
2522219089Spjd */ 2523168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2524219089Spjd#endif /* _KERNEL */ 2525204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 2526219089Spjd hostid != myhostid) { 2527219089Spjd nvlist_free(nvconfig); 2528168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 2529168498Spjd "loaded as it was last accessed by " 2530185029Spjd "another system (host: %s hostid: 0x%lx). " 2531236146Smm "See: http://illumos.org/msg/ZFS-8000-EY", 2532185029Spjd spa_name(spa), hostname, 2533168498Spjd (unsigned long)hostid); 2534249195Smm return (SET_ERROR(EBADF)); 2535168498Spjd } 2536168498Spjd } 2537219089Spjd if (nvlist_lookup_nvlist(spa->spa_config, 2538219089Spjd ZPOOL_REWIND_POLICY, &policy) == 0) 2539219089Spjd VERIFY(nvlist_add_nvlist(nvconfig, 2540219089Spjd ZPOOL_REWIND_POLICY, policy) == 0); 2541168498Spjd 2542219089Spjd spa_config_set(spa, nvconfig); 2543168404Spjd spa_unload(spa); 2544168404Spjd spa_deactivate(spa); 2545209962Smm spa_activate(spa, orig_mode); 2546168404Spjd 2547219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2548168404Spjd } 2549168404Spjd 2550219089Spjd if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2551219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2552219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2553219089Spjd if (error != 0) 2554219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2555168404Spjd 2556168404Spjd /* 2557168404Spjd * Load the bit that tells us to use the new accounting function 2558168404Spjd * (raid-z deflation). If we have an older pool, this will not 2559168404Spjd * be present. 2560168404Spjd */ 2561219089Spjd error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2562219089Spjd if (error != 0 && error != ENOENT) 2563219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2564168404Spjd 2565219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2566219089Spjd &spa->spa_creation_version); 2567219089Spjd if (error != 0 && error != ENOENT) 2568219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2569219089Spjd 2570168404Spjd /* 2571168404Spjd * Load the persistent error log. If we have an older pool, this will 2572168404Spjd * not be present. 2573168404Spjd */ 2574219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2575219089Spjd if (error != 0 && error != ENOENT) 2576219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2577168404Spjd 2578219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2579219089Spjd &spa->spa_errlog_scrub); 2580219089Spjd if (error != 0 && error != ENOENT) 2581219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2582168404Spjd 2583168404Spjd /* 2584168404Spjd * Load the history object. If we have an older pool, this 2585168404Spjd * will not be present. 2586168404Spjd */ 2587219089Spjd error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2588219089Spjd if (error != 0 && error != ENOENT) 2589219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2590168404Spjd 2591168404Spjd /* 2592219089Spjd * If we're assembling the pool from the split-off vdevs of 2593219089Spjd * an existing pool, we don't want to attach the spares & cache 2594219089Spjd * devices. 2595219089Spjd */ 2596219089Spjd 2597219089Spjd /* 2598168404Spjd * Load any hot spares for this pool. 
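 *
 * (An ENOENT from spa_dir_prop() below just means an older pool
 * with no spares object and is not treated as an error; the same
 * pattern repeats for the L2ARC devices that follow.)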
2599168404Spjd */ 2600219089Spjd error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2601219089Spjd if (error != 0 && error != ENOENT) 2602219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2603219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2604185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2605185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 2606219089Spjd &spa->spa_spares.sav_config) != 0) 2607219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2608168404Spjd 2609185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2610168404Spjd spa_load_spares(spa); 2611185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2612219089Spjd } else if (error == 0) { 2613219089Spjd spa->spa_spares.sav_sync = B_TRUE; 2614168404Spjd } 2615168404Spjd 2616185029Spjd /* 2617185029Spjd * Load any level 2 ARC devices for this pool. 2618185029Spjd */ 2619219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2620185029Spjd &spa->spa_l2cache.sav_object); 2621219089Spjd if (error != 0 && error != ENOENT) 2622219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2623219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2624185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2625185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2626219089Spjd &spa->spa_l2cache.sav_config) != 0) 2627219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2628185029Spjd 2629185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2630185029Spjd spa_load_l2cache(spa); 2631185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2632219089Spjd } else if (error == 0) { 2633219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2634185029Spjd } 2635185029Spjd 2636219089Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2637213197Smm 2638219089Spjd error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2639219089Spjd if (error && error != ENOENT) 2640219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2641185029Spjd 2642219089Spjd if (error == 0) { 2643219089Spjd uint64_t autoreplace; 2644185029Spjd 2645219089Spjd spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2646219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2647219089Spjd spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2648219089Spjd spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2649219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2650219089Spjd spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2651219089Spjd &spa->spa_dedup_ditto); 2652185029Spjd 2653219089Spjd spa->spa_autoreplace = (autoreplace != 0); 2654168404Spjd } 2655168404Spjd 2656168404Spjd /* 2657185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 2658185029Spjd * the ZFS DE that it should not issue any faults for unopenable 2659185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 2660185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 2661185029Spjd * over. 2662185029Spjd */ 2663219089Spjd if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2664185029Spjd spa_check_removed(spa->spa_root_vdev); 2665219089Spjd /* 2666219089Spjd * For the import case, this is done in spa_import(), because 2667219089Spjd * at this point we're using the spare definitions from 2668219089Spjd * the MOS config, not necessarily from the userland config. 
2669219089Spjd */ 2670219089Spjd if (state != SPA_LOAD_IMPORT) { 2671219089Spjd spa_aux_check_removed(&spa->spa_spares); 2672219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 2673219089Spjd } 2674219089Spjd } 2675185029Spjd 2676185029Spjd /* 2677168404Spjd * Load the vdev state for all toplevel vdevs. 2678168404Spjd */ 2679168404Spjd vdev_load(rvd); 2680168404Spjd 2681168404Spjd /* 2682168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 2683168404Spjd */ 2684185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2685168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2686185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2687168404Spjd 2688168404Spjd /* 2689219089Spjd * Load the DDTs (dedup tables). 2690168404Spjd */ 2691219089Spjd error = ddt_load(spa); 2692219089Spjd if (error != 0) 2693219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2694219089Spjd 2695219089Spjd spa_update_dspace(spa); 2696219089Spjd 2697219089Spjd /* 2698219089Spjd * Validate the config, using the MOS config to fill in any 2699219089Spjd * information which might be missing. If we fail to validate 2700219089Spjd * the config then declare the pool unfit for use. If we're 2701219089Spjd * assembling a pool from a split, the log is not transferred 2702219089Spjd * over. 2703219089Spjd */ 2704219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2705219089Spjd nvlist_t *nvconfig; 2706219089Spjd 2707219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2708219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2709219089Spjd 2710219089Spjd if (!spa_config_valid(spa, nvconfig)) { 2711219089Spjd nvlist_free(nvconfig); 2712219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2713219089Spjd ENXIO)); 2714219089Spjd } 2715219089Spjd nvlist_free(nvconfig); 2716219089Spjd 2717219089Spjd /* 2718236884Smm * Now that we've validated the config, check the state of the 2719219089Spjd * root vdev. If it can't be opened, it indicates one or 2720219089Spjd * more toplevel vdevs are faulted. 2721219089Spjd */ 2722219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2723249195Smm return (SET_ERROR(ENXIO)); 2724219089Spjd 2725219089Spjd if (spa_check_logs(spa)) { 2726219089Spjd *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2727219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2728219089Spjd } 2729168404Spjd } 2730168404Spjd 2731236884Smm if (missing_feat_write) { 2732236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 2733236884Smm 2734236884Smm /* 2735236884Smm * At this point, we know that we can open the pool in 2736236884Smm * read-only mode but not read-write mode. We now have enough 2737236884Smm * information and can return to userland. 2738236884Smm */ 2739236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2740236884Smm } 2741236884Smm 2742219089Spjd /* 2743219089Spjd * We've successfully opened the pool, verify that we're ready 2744219089Spjd * to start pushing transactions. 
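 *
 * (spa_load_verify() honors the pool's rewind policy: under
 * ZPOOL_NEVER_REWIND it returns 0 without traversing anything;
 * otherwise it walks the pool and compares the metadata and data
 * error counts against zrp_maxmeta and zrp_maxdata.)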
2745219089Spjd */
2746219089Spjd if (state != SPA_LOAD_TRYIMPORT) {
2747219089Spjd if (error = spa_load_verify(spa))
2748219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2749219089Spjd error));
2750219089Spjd }
2751219089Spjd
2752219089Spjd if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2753219089Spjd spa->spa_load_max_txg == UINT64_MAX)) {
2754168404Spjd dmu_tx_t *tx;
2755168404Spjd int need_update = B_FALSE;
2756168404Spjd
2757209962Smm ASSERT(state != SPA_LOAD_TRYIMPORT);
2758209962Smm
2759168404Spjd /*
2760168404Spjd * Claim log blocks that haven't been committed yet.
2761168404Spjd * This must all happen in a single txg.
2762219089Spjd * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2763219089Spjd * invoked from zil_claim_log_block()'s i/o done callback.
2764219089Spjd * Price of rollback is that we abandon the log.
2765168404Spjd */
2766219089Spjd spa->spa_claiming = B_TRUE;
2767219089Spjd
2768168404Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2769168404Spjd spa_first_txg(spa));
2770185029Spjd (void) dmu_objset_find(spa_name(spa),
2771168404Spjd zil_claim, tx, DS_FIND_CHILDREN);
2772168404Spjd dmu_tx_commit(tx);
2773168404Spjd
2774219089Spjd spa->spa_claiming = B_FALSE;
2775219089Spjd
2776219089Spjd spa_set_log_state(spa, SPA_LOG_GOOD);
2777168404Spjd spa->spa_sync_on = B_TRUE;
2778168404Spjd txg_sync_start(spa->spa_dsl_pool);
2779168404Spjd
2780168404Spjd /*
2781219089Spjd * Wait for all claims to sync. We sync up to the highest
2782219089Spjd * claimed log block birth time so that claimed log blocks
2783219089Spjd * don't appear to be from the future. spa_claim_max_txg
2784219089Spjd * will have been set for us by either zil_check_log_chain()
2785219089Spjd * (invoked from spa_check_logs()) or zil_claim() above.
2786168404Spjd */
2787219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2788168404Spjd
2789168404Spjd /*
2790168404Spjd * If the config cache is stale, or we have uninitialized
2791168404Spjd * metaslabs (see spa_vdev_add()), then update the config.
2792209962Smm *
2793219089Spjd * If this is a verbatim import, trust the current
2794209962Smm * in-core spa_config and update the disk labels.
2795168404Spjd */
2796168404Spjd if (config_cache_txg != spa->spa_config_txg ||
2797219089Spjd state == SPA_LOAD_IMPORT ||
2798219089Spjd state == SPA_LOAD_RECOVER ||
2799219089Spjd (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2800168404Spjd need_update = B_TRUE;
2801168404Spjd
2802209962Smm for (int c = 0; c < rvd->vdev_children; c++)
2803168404Spjd if (rvd->vdev_child[c]->vdev_ms_array == 0)
2804168404Spjd need_update = B_TRUE;
2805168404Spjd
2806168404Spjd /*
2807168404Spjd * Update the config cache asynchronously in case we're the
2808168404Spjd * root pool, in which case the config cache isn't writable yet.
2809168404Spjd */
2810168404Spjd if (need_update)
2811168404Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2812208683Spjd
2813208683Spjd /*
2814208683Spjd * Check all DTLs to see if anything needs resilvering.
2815208683Spjd */
2816219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2817219089Spjd vdev_resilver_needed(rvd, NULL, NULL))
2818208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER);
2819219089Spjd
2820219089Spjd /*
2821248571Smm * Log the fact that we booted up (so that we can detect if
2822248571Smm * we rebooted in the middle of an operation).
2823248571Smm */
2824248571Smm spa_history_log_version(spa, "open");
2825248571Smm
2826248571Smm /*
2827219089Spjd * Delete any inconsistent datasets.
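 *
 * (e.g. datasets that were mid-destroy or mid-receive when the
 * pool last went down; dsl_destroy_inconsistent() is applied to
 * every child dataset via dmu_objset_find() below.)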
2828219089Spjd */ 2829219089Spjd (void) dmu_objset_find(spa_name(spa), 2830219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2831219089Spjd 2832219089Spjd /* 2833219089Spjd * Clean up any stale temporary dataset userrefs. 2834219089Spjd */ 2835219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2836168404Spjd } 2837168404Spjd 2838219089Spjd return (0); 2839219089Spjd} 2840168404Spjd 2841219089Spjdstatic int 2842219089Spjdspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2843219089Spjd{ 2844219089Spjd int mode = spa->spa_mode; 2845219089Spjd 2846219089Spjd spa_unload(spa); 2847219089Spjd spa_deactivate(spa); 2848219089Spjd 2849268720Sdelphij spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 2850219089Spjd 2851219089Spjd spa_activate(spa, mode); 2852219089Spjd spa_async_suspend(spa); 2853219089Spjd 2854219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2855168404Spjd} 2856168404Spjd 2857236884Smm/* 2858236884Smm * If spa_load() fails this function will try loading prior txg's. If 2859236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2860236884Smm * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2861236884Smm * function will not rewind the pool and will return the same error as 2862236884Smm * spa_load(). 2863236884Smm */ 2864219089Spjdstatic int 2865219089Spjdspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2866219089Spjd uint64_t max_request, int rewind_flags) 2867219089Spjd{ 2868236884Smm nvlist_t *loadinfo = NULL; 2869219089Spjd nvlist_t *config = NULL; 2870219089Spjd int load_error, rewind_error; 2871219089Spjd uint64_t safe_rewind_txg; 2872219089Spjd uint64_t min_txg; 2873219089Spjd 2874219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2875219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 2876219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2877219089Spjd } else { 2878219089Spjd spa->spa_load_max_txg = max_request; 2879268720Sdelphij if (max_request != UINT64_MAX) 2880268720Sdelphij spa->spa_extreme_rewind = B_TRUE; 2881219089Spjd } 2882219089Spjd 2883219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2884219089Spjd mosconfig); 2885219089Spjd if (load_error == 0) 2886219089Spjd return (0); 2887219089Spjd 2888219089Spjd if (spa->spa_root_vdev != NULL) 2889219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2890219089Spjd 2891219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2892219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2893219089Spjd 2894219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 2895219089Spjd nvlist_free(config); 2896219089Spjd return (load_error); 2897219089Spjd } 2898219089Spjd 2899236884Smm if (state == SPA_LOAD_RECOVER) { 2900236884Smm /* Price of rolling back is discarding txgs, including log */ 2901219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2902236884Smm } else { 2903236884Smm /* 2904236884Smm * If we aren't rolling back save the load info from our first 2905236884Smm * import attempt so that we can restore it after attempting 2906236884Smm * to rewind. 2907236884Smm */ 2908236884Smm loadinfo = spa->spa_load_info; 2909236884Smm spa->spa_load_info = fnvlist_alloc(); 2910236884Smm } 2911219089Spjd 2912219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2913219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2914219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
2915219089Spjd TXG_INITIAL : safe_rewind_txg;
2916219089Spjd
2917219089Spjd /*
2918219089Spjd * Continue as long as we're finding errors, we're still within
2919219089Spjd * the acceptable rewind range, and we're still finding uberblocks.
2920219089Spjd */
2921219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2922219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2923219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg)
2924219089Spjd spa->spa_extreme_rewind = B_TRUE;
2925219089Spjd rewind_error = spa_load_retry(spa, state, mosconfig);
2926219089Spjd }
2927219089Spjd
2928219089Spjd spa->spa_extreme_rewind = B_FALSE;
2929219089Spjd spa->spa_load_max_txg = UINT64_MAX;
2930219089Spjd
2931219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2932219089Spjd spa_config_set(spa, config);
2933219089Spjd
2934236884Smm if (state == SPA_LOAD_RECOVER) {
2935236884Smm ASSERT3P(loadinfo, ==, NULL);
2936236884Smm return (rewind_error);
2937236884Smm } else {
2938236884Smm /* Store the rewind info as part of the initial load info */
2939236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2940236884Smm spa->spa_load_info);
2941236884Smm
2942236884Smm /* Restore the initial load info */
2943236884Smm fnvlist_free(spa->spa_load_info);
2944236884Smm spa->spa_load_info = loadinfo;
2945236884Smm
2946236884Smm return (load_error);
2947236884Smm }
2948219089Spjd}
2949219089Spjd
2950168404Spjd/*
2951168404Spjd * Pool Open/Import
2952168404Spjd *
2953168404Spjd * The import case is identical to an open except that the configuration is sent
2954168404Spjd * down from userland, instead of grabbed from the configuration cache. For the
2955168404Spjd * case of an open, the pool configuration will exist in the
2956185029Spjd * POOL_STATE_UNINITIALIZED state.
2957168404Spjd *
2958168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at
2959168404Spjd * the same time as opening the pool, without having to keep around the spa_t in
2960168404Spjd * some ambiguous state.
2961168404Spjd */
2962168404Spjdstatic int
2963219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2964219089Spjd nvlist_t **config)
2965168404Spjd{
2966168404Spjd spa_t *spa;
2967219089Spjd spa_load_state_t state = SPA_LOAD_OPEN;
2968168404Spjd int error;
2969168404Spjd int locked = B_FALSE;
2970219089Spjd int firstopen = B_FALSE;
2971168404Spjd
2972168404Spjd *spapp = NULL;
2973168404Spjd
2974168404Spjd /*
2975168404Spjd * As disgusting as this is, we need to support recursive calls to this
2976168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends
2977168404Spjd * up calling spa_open() again. The real fix is to figure out how to
2978168404Spjd * avoid dsl_dir_open() calling this in the first place.
2979168404Spjd */
2980168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) {
2981168404Spjd mutex_enter(&spa_namespace_lock);
2982168404Spjd locked = B_TRUE;
2983168404Spjd }
2984168404Spjd
2985168404Spjd if ((spa = spa_lookup(pool)) == NULL) {
2986168404Spjd if (locked)
2987168404Spjd mutex_exit(&spa_namespace_lock);
2988249195Smm return (SET_ERROR(ENOENT));
2989168404Spjd }
2990219089Spjd
2991168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2992219089Spjd zpool_rewind_policy_t policy;
2993168404Spjd
2994219089Spjd firstopen = B_TRUE;
2995219089Spjd
2996219089Spjd zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2997219089Spjd &policy);
2998219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND)
2999219089Spjd state = SPA_LOAD_RECOVER;
3000219089Spjd
3001209962Smm spa_activate(spa, spa_mode_global);
3002168404Spjd
3003219089Spjd if (state != SPA_LOAD_RECOVER)
3004219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3005168404Spjd
3006219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
3007219089Spjd policy.zrp_request);
3008219089Spjd
3009168404Spjd if (error == EBADF) {
3010168404Spjd /*
3011168404Spjd * If vdev_validate() returns failure (indicated by
3012168404Spjd * EBADF), it means that one of the vdevs indicates
3013168404Spjd * that the pool has been exported or destroyed. If
3014168404Spjd * this is the case, the config cache is out of sync and
3015168404Spjd * we should remove the pool from the namespace.
3016168404Spjd */
3017168404Spjd spa_unload(spa);
3018168404Spjd spa_deactivate(spa);
3019185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE);
3020168404Spjd spa_remove(spa);
3021168404Spjd if (locked)
3022168404Spjd mutex_exit(&spa_namespace_lock);
3023249195Smm return (SET_ERROR(ENOENT));
3024168404Spjd }
3025168404Spjd
3026168404Spjd if (error) {
3027168404Spjd /*
3028168404Spjd * We can't open the pool, but we still have useful
3029168404Spjd * information: the state of each vdev after the
3030168404Spjd * attempted vdev_open(). Return this to the user.
3031168404Spjd */
3032219089Spjd if (config != NULL && spa->spa_config) {
3033219089Spjd VERIFY(nvlist_dup(spa->spa_config, config,
3034219089Spjd KM_SLEEP) == 0);
3035219089Spjd VERIFY(nvlist_add_nvlist(*config,
3036219089Spjd ZPOOL_CONFIG_LOAD_INFO,
3037219089Spjd spa->spa_load_info) == 0);
3038219089Spjd }
3039168404Spjd spa_unload(spa);
3040168404Spjd spa_deactivate(spa);
3041219089Spjd spa->spa_last_open_failed = error;
3042168404Spjd if (locked)
3043168404Spjd mutex_exit(&spa_namespace_lock);
3044168404Spjd *spapp = NULL;
3045168404Spjd return (error);
3046168404Spjd }
3047168404Spjd }
3048168404Spjd
3049168404Spjd spa_open_ref(spa, tag);
3050185029Spjd
3051219089Spjd if (config != NULL)
3052219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3053219089Spjd
3054219089Spjd /*
3055219089Spjd * If we've recovered the pool, pass back any information we
3056219089Spjd * gathered while doing the load.
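 *
 * An illustrative (hypothetical) consumer could read that
 * information back off the returned config, e.g.:
 *
 *	nvlist_t *info;
 *	if (nvlist_lookup_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 *	    &info) == 0)
 *		dump_nvlist(info, 8);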
3057219089Spjd */ 3058219089Spjd if (state == SPA_LOAD_RECOVER) { 3059219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3060219089Spjd spa->spa_load_info) == 0); 3061219089Spjd } 3062219089Spjd 3063219089Spjd if (locked) { 3064219089Spjd spa->spa_last_open_failed = 0; 3065219089Spjd spa->spa_last_ubsync_txg = 0; 3066219089Spjd spa->spa_load_txg = 0; 3067168404Spjd mutex_exit(&spa_namespace_lock); 3068219089Spjd#ifdef __FreeBSD__ 3069219089Spjd#ifdef _KERNEL 3070219089Spjd if (firstopen) 3071249047Savg zvol_create_minors(spa->spa_name); 3072219089Spjd#endif 3073219089Spjd#endif 3074219089Spjd } 3075168404Spjd 3076168404Spjd *spapp = spa; 3077168404Spjd 3078168404Spjd return (0); 3079168404Spjd} 3080168404Spjd 3081168404Spjdint 3082219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3083219089Spjd nvlist_t **config) 3084219089Spjd{ 3085219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 3086219089Spjd} 3087219089Spjd 3088219089Spjdint 3089168404Spjdspa_open(const char *name, spa_t **spapp, void *tag) 3090168404Spjd{ 3091219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 3092168404Spjd} 3093168404Spjd 3094168404Spjd/* 3095168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 3096168404Spjd * preventing it from being exported or destroyed. 3097168404Spjd */ 3098168404Spjdspa_t * 3099168404Spjdspa_inject_addref(char *name) 3100168404Spjd{ 3101168404Spjd spa_t *spa; 3102168404Spjd 3103168404Spjd mutex_enter(&spa_namespace_lock); 3104168404Spjd if ((spa = spa_lookup(name)) == NULL) { 3105168404Spjd mutex_exit(&spa_namespace_lock); 3106168404Spjd return (NULL); 3107168404Spjd } 3108168404Spjd spa->spa_inject_ref++; 3109168404Spjd mutex_exit(&spa_namespace_lock); 3110168404Spjd 3111168404Spjd return (spa); 3112168404Spjd} 3113168404Spjd 3114168404Spjdvoid 3115168404Spjdspa_inject_delref(spa_t *spa) 3116168404Spjd{ 3117168404Spjd mutex_enter(&spa_namespace_lock); 3118168404Spjd spa->spa_inject_ref--; 3119168404Spjd mutex_exit(&spa_namespace_lock); 3120168404Spjd} 3121168404Spjd 3122185029Spjd/* 3123185029Spjd * Add spares device information to the nvlist. 3124185029Spjd */ 3125168404Spjdstatic void 3126168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 3127168404Spjd{ 3128168404Spjd nvlist_t **spares; 3129168404Spjd uint_t i, nspares; 3130168404Spjd nvlist_t *nvroot; 3131168404Spjd uint64_t guid; 3132168404Spjd vdev_stat_t *vs; 3133168404Spjd uint_t vsc; 3134168404Spjd uint64_t pool; 3135168404Spjd 3136209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3137209962Smm 3138185029Spjd if (spa->spa_spares.sav_count == 0) 3139168404Spjd return; 3140168404Spjd 3141168404Spjd VERIFY(nvlist_lookup_nvlist(config, 3142168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3143185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3144168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3145168404Spjd if (nspares != 0) { 3146168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3147168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3148168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3149168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3150168404Spjd 3151168404Spjd /* 3152168404Spjd * Go through and find any spares which have since been 3153168404Spjd * repurposed as an active spare. If this is the case, update 3154168404Spjd * their status appropriately. 
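 * (The loop below reports such a spare as VDEV_STATE_CANT_OPEN with
 * aux state VDEV_AUX_SPARED, which is what lets userland tools such
 * as "zpool status" show it as in use by another pool.)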
3155168404Spjd */
3156168404Spjd for (i = 0; i < nspares; i++) {
3157168404Spjd VERIFY(nvlist_lookup_uint64(spares[i],
3158168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0);
3159185029Spjd if (spa_spare_exists(guid, &pool, NULL) &&
3160185029Spjd pool != 0ULL) {
3161168404Spjd VERIFY(nvlist_lookup_uint64_array(
3162219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS,
3163168404Spjd (uint64_t **)&vs, &vsc) == 0);
3164168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN;
3165168404Spjd vs->vs_aux = VDEV_AUX_SPARED;
3166168404Spjd }
3167168404Spjd }
3168168404Spjd }
3169168404Spjd}
3170168404Spjd
3171185029Spjd/*
3172185029Spjd * Add l2cache device information to the nvlist, including vdev stats.
3173185029Spjd */
3174185029Spjdstatic void
3175185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config)
3176185029Spjd{
3177185029Spjd nvlist_t **l2cache;
3178185029Spjd uint_t i, j, nl2cache;
3179185029Spjd nvlist_t *nvroot;
3180185029Spjd uint64_t guid;
3181185029Spjd vdev_t *vd;
3182185029Spjd vdev_stat_t *vs;
3183185029Spjd uint_t vsc;
3184185029Spjd
3185209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3186209962Smm
3187185029Spjd if (spa->spa_l2cache.sav_count == 0)
3188185029Spjd return;
3189185029Spjd
3190185029Spjd VERIFY(nvlist_lookup_nvlist(config,
3191185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3192185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3193185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3194185029Spjd if (nl2cache != 0) {
3195185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot,
3196185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3197185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot,
3198185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3199185029Spjd
3200185029Spjd /*
3201185029Spjd * Update level 2 cache device stats.
3202185029Spjd */
3203185029Spjd
3204185029Spjd for (i = 0; i < nl2cache; i++) {
3205185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i],
3206185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0);
3207185029Spjd
3208185029Spjd vd = NULL;
3209185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
3210185029Spjd if (guid ==
3211185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
3212185029Spjd vd = spa->spa_l2cache.sav_vdevs[j];
3213185029Spjd break;
3214185029Spjd }
3215185029Spjd }
3216185029Spjd ASSERT(vd != NULL);
3217185029Spjd
3218185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i],
3219219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
3220219089Spjd == 0);
3221185029Spjd vdev_get_stats(vd, vs);
3222185029Spjd }
3223185029Spjd }
3224185029Spjd}
3225185029Spjd
3226236884Smmstatic void
3227236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config)
3228236884Smm{
3229236884Smm nvlist_t *features;
3230236884Smm zap_cursor_t zc;
3231236884Smm zap_attribute_t za;
3232236884Smm
3233236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3234236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3235236884Smm
3236253993Smav /* We may be unable to read features if pool is suspended. */
3237253993Smav if (spa_suspended(spa))
3238253993Smav goto out;
3239253993Smav
3240236884Smm if (spa->spa_feat_for_read_obj != 0) {
3241236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset,
3242236884Smm spa->spa_feat_for_read_obj);
3243236884Smm zap_cursor_retrieve(&zc, &za) == 0;
3244236884Smm zap_cursor_advance(&zc)) {
3245236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3246236884Smm za.za_num_integers == 1);
3247236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3248236884Smm za.za_first_integer));
3249236884Smm }
3250236884Smm zap_cursor_fini(&zc);
3251236884Smm }
3252236884Smm
3253236884Smm if (spa->spa_feat_for_write_obj != 0) {
3254236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset,
3255236884Smm spa->spa_feat_for_write_obj);
3256236884Smm zap_cursor_retrieve(&zc, &za) == 0;
3257236884Smm zap_cursor_advance(&zc)) {
3258236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3259236884Smm za.za_num_integers == 1);
3260236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3261236884Smm za.za_first_integer));
3262236884Smm }
3263236884Smm zap_cursor_fini(&zc);
3264236884Smm }
3265236884Smm
3266253993Smavout:
3267236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3268236884Smm features) == 0);
3269236884Smm nvlist_free(features);
3270236884Smm}
3271236884Smm
3272168404Spjdint
3273236884Smmspa_get_stats(const char *name, nvlist_t **config,
3274236884Smm char *altroot, size_t buflen)
3275168404Spjd{
3276168404Spjd int error;
3277168404Spjd spa_t *spa;
3278168404Spjd
3279168404Spjd *config = NULL;
3280219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config);
3281168404Spjd
3282209962Smm if (spa != NULL) {
3283209962Smm /*
3284209962Smm * This still leaves a window of inconsistency where the spares
3285209962Smm * or l2cache devices could change and the config would be
3286209962Smm * self-inconsistent.
3287209962Smm */
3288209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3289168404Spjd
3290209962Smm if (*config != NULL) {
3291219089Spjd uint64_t loadtimes[2];
3292219089Spjd
3293219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec;
3294219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3295219089Spjd VERIFY(nvlist_add_uint64_array(*config,
3296219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3297219089Spjd
3298185029Spjd VERIFY(nvlist_add_uint64(*config,
3299209962Smm ZPOOL_CONFIG_ERRCOUNT,
3300209962Smm spa_get_errlog_size(spa)) == 0);
3301185029Spjd
3302209962Smm if (spa_suspended(spa))
3303209962Smm VERIFY(nvlist_add_uint64(*config,
3304209962Smm ZPOOL_CONFIG_SUSPENDED,
3305209962Smm spa->spa_failmode) == 0);
3306209962Smm
3307209962Smm spa_add_spares(spa, *config);
3308209962Smm spa_add_l2cache(spa, *config);
3309236884Smm spa_add_feature_stats(spa, *config);
3310209962Smm }
3311168404Spjd }
3312168404Spjd
3313168404Spjd /*
3314168404Spjd * We want to get the alternate root even for faulted pools, so we cheat
3315168404Spjd * and call spa_lookup() directly.
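 *
 * A minimal usage sketch for this function, with a hypothetical
 * pool name and a caller-supplied buffer:
 *
 *	char altroot[MAXPATHLEN];
 *	nvlist_t *config;
 *	int err = spa_get_stats("tank", &config, altroot,
 *	    sizeof (altroot));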
3316168404Spjd */
3317168404Spjd if (altroot) {
3318168404Spjd if (spa == NULL) {
3319168404Spjd mutex_enter(&spa_namespace_lock);
3320168404Spjd spa = spa_lookup(name);
3321168404Spjd if (spa)
3322168404Spjd spa_altroot(spa, altroot, buflen);
3323168404Spjd else
3324168404Spjd altroot[0] = '\0';
3325168404Spjd spa = NULL;
3326168404Spjd mutex_exit(&spa_namespace_lock);
3327168404Spjd } else {
3328168404Spjd spa_altroot(spa, altroot, buflen);
3329168404Spjd }
3330168404Spjd }
3331168404Spjd
3332209962Smm if (spa != NULL) {
3333209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG);
3334168404Spjd spa_close(spa, FTAG);
3335209962Smm }
3336168404Spjd
3337168404Spjd return (error);
3338168404Spjd}
3339168404Spjd
3340168404Spjd/*
3341185029Spjd * Validate that the auxiliary device array is well formed. We must have an
3342185029Spjd * array of nvlists, each of which describes a valid leaf vdev. If this is an
3343185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3344185029Spjd * specified, as long as they are well-formed.
3345168404Spjd */
3346168404Spjdstatic int
3347185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3348185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version,
3349185029Spjd vdev_labeltype_t label)
3350168404Spjd{
3351185029Spjd nvlist_t **dev;
3352185029Spjd uint_t i, ndev;
3353168404Spjd vdev_t *vd;
3354168404Spjd int error;
3355168404Spjd
3356185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3357185029Spjd
3358168404Spjd /*
3359185029Spjd * It's acceptable to have no devs specified.
3360168404Spjd */
3361185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3362168404Spjd return (0);
3363168404Spjd
3364185029Spjd if (ndev == 0)
3365249195Smm return (SET_ERROR(EINVAL));
3366168404Spjd
3367168404Spjd /*
3368185029Spjd * Make sure the pool is formatted with a version that supports this
3369185029Spjd * device type.
3370168404Spjd */
3371185029Spjd if (spa_version(spa) < version)
3372249195Smm return (SET_ERROR(ENOTSUP));
3373168404Spjd
3374168404Spjd /*
3375185029Spjd * Set the pending device list so we correctly handle device in-use
3376168404Spjd * checking.
3377168404Spjd */
3378185029Spjd sav->sav_pending = dev;
3379185029Spjd sav->sav_npending = ndev;
3380168404Spjd
3381185029Spjd for (i = 0; i < ndev; i++) {
3382185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3383168404Spjd mode)) != 0)
3384168404Spjd goto out;
3385168404Spjd
3386168404Spjd if (!vd->vdev_ops->vdev_op_leaf) {
3387168404Spjd vdev_free(vd);
3388249195Smm error = SET_ERROR(EINVAL);
3389168404Spjd goto out;
3390168404Spjd }
3391168404Spjd
3392185029Spjd /*
3393185029Spjd * The L2ARC currently only supports disk devices in
3394185029Spjd * kernel context. For user-level testing, we allow it.
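 * ("User-level" here means the libzpool consumers such as ztest,
 * where leaf vdevs are commonly backed by plain files.)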
3395185029Spjd */
3396185029Spjd#ifdef _KERNEL
3397185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3398185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3399249195Smm error = SET_ERROR(ENOTBLK);
3400230514Smm vdev_free(vd);
3401185029Spjd goto out;
3402185029Spjd }
3403185029Spjd#endif
3404168404Spjd vd->vdev_top = vd;
3405168404Spjd
3406168404Spjd if ((error = vdev_open(vd)) == 0 &&
3407185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) {
3408185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3409168404Spjd vd->vdev_guid) == 0);
3410168404Spjd }
3411168404Spjd
3412168404Spjd vdev_free(vd);
3413168404Spjd
3414185029Spjd if (error &&
3415185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3416168404Spjd goto out;
3417168404Spjd else
3418168404Spjd error = 0;
3419168404Spjd }
3420168404Spjd
3421168404Spjdout:
3422185029Spjd sav->sav_pending = NULL;
3423185029Spjd sav->sav_npending = 0;
3424168404Spjd return (error);
3425168404Spjd}
3426168404Spjd
3427185029Spjdstatic int
3428185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3429185029Spjd{
3430185029Spjd int error;
3431185029Spjd
3432185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3433185029Spjd
3434185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3435185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3436185029Spjd VDEV_LABEL_SPARE)) != 0) {
3437185029Spjd return (error);
3438185029Spjd }
3439185029Spjd
3440185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3441185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3442185029Spjd VDEV_LABEL_L2CACHE));
3443185029Spjd}
3444185029Spjd
3445185029Spjdstatic void
3446185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3447185029Spjd const char *config)
3448185029Spjd{
3449185029Spjd int i;
3450185029Spjd
3451185029Spjd if (sav->sav_config != NULL) {
3452185029Spjd nvlist_t **olddevs;
3453185029Spjd uint_t oldndevs;
3454185029Spjd nvlist_t **newdevs;
3455185029Spjd
3456185029Spjd /*
3457185029Spjd * Generate new dev list by concatenating with the
3458185029Spjd * current dev list.
3459185029Spjd */
3460185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3461185029Spjd &olddevs, &oldndevs) == 0);
3462185029Spjd
3463185029Spjd newdevs = kmem_alloc(sizeof (void *) *
3464185029Spjd (ndevs + oldndevs), KM_SLEEP);
3465185029Spjd for (i = 0; i < oldndevs; i++)
3466185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3467185029Spjd KM_SLEEP) == 0);
3468185029Spjd for (i = 0; i < ndevs; i++)
3469185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3470185029Spjd KM_SLEEP) == 0);
3471185029Spjd
3472185029Spjd VERIFY(nvlist_remove(sav->sav_config, config,
3473185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0);
3474185029Spjd
3475185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3476185029Spjd config, newdevs, ndevs + oldndevs) == 0);
3477185029Spjd for (i = 0; i < oldndevs + ndevs; i++)
3478185029Spjd nvlist_free(newdevs[i]);
3479185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3480185029Spjd } else {
3481185029Spjd /*
3482185029Spjd * Generate a new dev list.
3483185029Spjd */ 3484185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3485185029Spjd KM_SLEEP) == 0); 3486185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3487185029Spjd devs, ndevs) == 0); 3488185029Spjd } 3489185029Spjd} 3490185029Spjd 3491168404Spjd/* 3492185029Spjd * Stop and drop level 2 ARC devices 3493185029Spjd */ 3494185029Spjdvoid 3495185029Spjdspa_l2cache_drop(spa_t *spa) 3496185029Spjd{ 3497185029Spjd vdev_t *vd; 3498185029Spjd int i; 3499185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 3500185029Spjd 3501185029Spjd for (i = 0; i < sav->sav_count; i++) { 3502185029Spjd uint64_t pool; 3503185029Spjd 3504185029Spjd vd = sav->sav_vdevs[i]; 3505185029Spjd ASSERT(vd != NULL); 3506185029Spjd 3507209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3508209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 3509185029Spjd l2arc_remove_vdev(vd); 3510185029Spjd } 3511185029Spjd} 3512185029Spjd 3513185029Spjd/* 3514168404Spjd * Pool Creation 3515168404Spjd */ 3516168404Spjdint 3517185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3518248571Smm nvlist_t *zplprops) 3519168404Spjd{ 3520168404Spjd spa_t *spa; 3521185029Spjd char *altroot = NULL; 3522168404Spjd vdev_t *rvd; 3523168404Spjd dsl_pool_t *dp; 3524168404Spjd dmu_tx_t *tx; 3525219089Spjd int error = 0; 3526168404Spjd uint64_t txg = TXG_INITIAL; 3527185029Spjd nvlist_t **spares, **l2cache; 3528185029Spjd uint_t nspares, nl2cache; 3529219089Spjd uint64_t version, obj; 3530236884Smm boolean_t has_features; 3531168404Spjd 3532168404Spjd /* 3533168404Spjd * If this pool already exists, return failure. 3534168404Spjd */ 3535168404Spjd mutex_enter(&spa_namespace_lock); 3536168404Spjd if (spa_lookup(pool) != NULL) { 3537168404Spjd mutex_exit(&spa_namespace_lock); 3538249195Smm return (SET_ERROR(EEXIST)); 3539168404Spjd } 3540168404Spjd 3541168404Spjd /* 3542168404Spjd * Allocate a new spa_t structure. 
3543168404Spjd */ 3544185029Spjd (void) nvlist_lookup_string(props, 3545185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3546219089Spjd spa = spa_add(pool, NULL, altroot); 3547209962Smm spa_activate(spa, spa_mode_global); 3548168404Spjd 3549185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 3550185029Spjd spa_deactivate(spa); 3551185029Spjd spa_remove(spa); 3552185029Spjd mutex_exit(&spa_namespace_lock); 3553185029Spjd return (error); 3554185029Spjd } 3555185029Spjd 3556236884Smm has_features = B_FALSE; 3557236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3558236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3559236884Smm if (zpool_prop_feature(nvpair_name(elem))) 3560236884Smm has_features = B_TRUE; 3561236884Smm } 3562236884Smm 3563236884Smm if (has_features || nvlist_lookup_uint64(props, 3564236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3565185029Spjd version = SPA_VERSION; 3566236884Smm } 3567236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3568219089Spjd 3569219089Spjd spa->spa_first_txg = txg; 3570219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 3571185029Spjd spa->spa_uberblock.ub_version = version; 3572168404Spjd spa->spa_ubsync = spa->spa_uberblock; 3573168404Spjd 3574168404Spjd /* 3575209962Smm * Create "The Godfather" zio to hold all async IOs 3576209962Smm */ 3577272598Sdelphij spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3578272598Sdelphij KM_SLEEP); 3579272598Sdelphij for (int i = 0; i < max_ncpus; i++) { 3580272598Sdelphij spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3581272598Sdelphij ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3582272598Sdelphij ZIO_FLAG_GODFATHER); 3583272598Sdelphij } 3584209962Smm 3585209962Smm /* 3586168404Spjd * Create the root vdev. 3587168404Spjd */ 3588185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3589168404Spjd 3590168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3591168404Spjd 3592168404Spjd ASSERT(error != 0 || rvd != NULL); 3593168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3594168404Spjd 3595185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 3596249195Smm error = SET_ERROR(EINVAL); 3597168404Spjd 3598168404Spjd if (error == 0 && 3599168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3600185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 3601168404Spjd VDEV_ALLOC_ADD)) == 0) { 3602219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 3603254591Sgibbs vdev_ashift_optimize(rvd->vdev_child[c]); 3604219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 3605219089Spjd vdev_expand(rvd->vdev_child[c], txg); 3606219089Spjd } 3607168404Spjd } 3608168404Spjd 3609185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3610168404Spjd 3611168404Spjd if (error != 0) { 3612168404Spjd spa_unload(spa); 3613168404Spjd spa_deactivate(spa); 3614168404Spjd spa_remove(spa); 3615168404Spjd mutex_exit(&spa_namespace_lock); 3616168404Spjd return (error); 3617168404Spjd } 3618168404Spjd 3619168404Spjd /* 3620168404Spjd * Get the list of spares, if specified. 
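 * (In the nvroot handed to spa_create(), spares arrive as a
 * ZPOOL_CONFIG_SPARES array of leaf-vdev nvlists alongside the
 * normal ZPOOL_CONFIG_CHILDREN array; an illustrative shape, with
 * hypothetical device paths:
 *
 *	vdev_tree: type="root"
 *	  children[0]: type="disk" path="/dev/da0"
 *	  spares[0]:   type="disk" path="/dev/da1")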
3621168404Spjd */ 3622168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3623168404Spjd &spares, &nspares) == 0) { 3624185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3625168404Spjd KM_SLEEP) == 0); 3626185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3627168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3628185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3629168404Spjd spa_load_spares(spa); 3630185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3631185029Spjd spa->spa_spares.sav_sync = B_TRUE; 3632168404Spjd } 3633168404Spjd 3634185029Spjd /* 3635185029Spjd * Get the list of level 2 cache devices, if specified. 3636185029Spjd */ 3637185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3638185029Spjd &l2cache, &nl2cache) == 0) { 3639185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3640185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 3641185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3642185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3643185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3644185029Spjd spa_load_l2cache(spa); 3645185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3646185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3647185029Spjd } 3648185029Spjd 3649236884Smm spa->spa_is_initializing = B_TRUE; 3650185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3651168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 3652236884Smm spa->spa_is_initializing = B_FALSE; 3653168404Spjd 3654219089Spjd /* 3655219089Spjd * Create DDTs (dedup tables). 3656219089Spjd */ 3657219089Spjd ddt_create(spa); 3658219089Spjd 3659219089Spjd spa_update_dspace(spa); 3660219089Spjd 3661168404Spjd tx = dmu_tx_create_assigned(dp, txg); 3662168404Spjd 3663168404Spjd /* 3664168404Spjd * Create the pool config object. 3665168404Spjd */ 3666168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3667185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3668168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3669168404Spjd 3670168404Spjd if (zap_add(spa->spa_meta_objset, 3671168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3672168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3673168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 3674168404Spjd } 3675168404Spjd 3676236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 3677236884Smm spa_feature_create_zap_objects(spa, tx); 3678236884Smm 3679219089Spjd if (zap_add(spa->spa_meta_objset, 3680219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3681219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 3682219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 3683219089Spjd } 3684219089Spjd 3685185029Spjd /* Newly created pools with the right version are always deflated. */ 3686185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3687185029Spjd spa->spa_deflate = TRUE; 3688185029Spjd if (zap_add(spa->spa_meta_objset, 3689185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3690185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3691185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 3692185029Spjd } 3693168404Spjd } 3694168404Spjd 3695168404Spjd /* 3696219089Spjd * Create the deferred-free bpobj. Turn off compression 3697168404Spjd * because sync-to-convergence takes longer if the blocksize 3698168404Spjd * keeps changing. 
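 * (Roughly speaking: the bpobj is itself rewritten during the sync
 * passes that are trying to converge, so a stable block size helps
 * those passes settle quickly.)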
3699168404Spjd */
3700219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3701219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj,
3702168404Spjd ZIO_COMPRESS_OFF, tx);
3703168404Spjd if (zap_add(spa->spa_meta_objset,
3704219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3705219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) {
3706219089Spjd cmn_err(CE_PANIC, "failed to add bpobj");
3707168404Spjd }
3708219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3709219089Spjd spa->spa_meta_objset, obj));
3710168404Spjd
3711168404Spjd /*
3712168404Spjd * Create the pool's history object.
3713168404Spjd */
3714185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY)
3715185029Spjd spa_history_create_obj(spa, tx);
3716168404Spjd
3717185029Spjd /*
3718185029Spjd * Set pool properties.
3719185029Spjd */
3720185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3721185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3722185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3723219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3724219089Spjd
3725209962Smm if (props != NULL) {
3726209962Smm spa_configfile_set(spa, props, B_FALSE);
3727248571Smm spa_sync_props(props, tx);
3728209962Smm }
3729185029Spjd
3730168404Spjd dmu_tx_commit(tx);
3731168404Spjd
3732168404Spjd spa->spa_sync_on = B_TRUE;
3733168404Spjd txg_sync_start(spa->spa_dsl_pool);
3734168404Spjd
3735168404Spjd /*
3736168404Spjd * We explicitly wait for the first transaction to complete so that our
3737168404Spjd * bean counters are appropriately updated.
3738168404Spjd */
3739168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg);
3740168404Spjd
3741185029Spjd spa_config_sync(spa, B_FALSE, B_TRUE);
3742168404Spjd
3743248571Smm spa_history_log_version(spa, "create");
3744185029Spjd
3745208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount);
3746208442Smm
3747168404Spjd mutex_exit(&spa_namespace_lock);
3748168404Spjd
3749168404Spjd return (0);
3750168404Spjd}
3751168404Spjd
3752241286Savg#ifdef _KERNEL
3753219089Spjd#if defined(sun)
3754185029Spjd/*
3755219089Spjd * Get the root pool information from the root disk, then import the root pool
3756219089Spjd * at system boot time.
3757185029Spjd */
3758219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3759219089Spjd
3760219089Spjdstatic nvlist_t *
3761219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3762185029Spjd{
3763219089Spjd nvlist_t *config;
3764185029Spjd nvlist_t *nvtop, *nvroot;
3765185029Spjd uint64_t pgid;
3766185029Spjd
3767219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3768219089Spjd return (NULL);
3769219089Spjd
3770168404Spjd /*
3771185029Spjd * Add this top-level vdev to the child array.
3772168404Spjd */
3773219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3774219089Spjd &nvtop) == 0);
3775219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3776219089Spjd &pgid) == 0);
3777219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3778168404Spjd
3779185029Spjd /*
3780185029Spjd * Put this pool's top-level vdevs into a root vdev.
3781185029Spjd */
3782185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3783219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3784219089Spjd VDEV_TYPE_ROOT) == 0);
3785185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3786185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3787185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3788185029Spjd &nvtop, 1) == 0);
3789168404Spjd
3790168404Spjd /*
3791185029Spjd * Replace the existing vdev_tree with the new root vdev in
3792185029Spjd * this pool's configuration (remove the old, add the new).
3793168404Spjd */
3794185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3795185029Spjd nvlist_free(nvroot);
3796219089Spjd return (config);
3797185029Spjd}
3798168404Spjd
3799185029Spjd/*
3800219089Spjd * Walk the vdev tree and see if we can find a device with "better"
3801219089Spjd * configuration. A configuration is "better" if the label on that
3802219089Spjd * device has a more recent txg.
3803185029Spjd */
3804219089Spjdstatic void
3805219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3806185029Spjd{
3807219089Spjd for (int c = 0; c < vd->vdev_children; c++)
3808219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3809185029Spjd
3810219089Spjd if (vd->vdev_ops->vdev_op_leaf) {
3811219089Spjd nvlist_t *label;
3812219089Spjd uint64_t label_txg;
3813185029Spjd
3814219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3815219089Spjd &label) != 0)
3816219089Spjd return;
3817185029Spjd
3818219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3819219089Spjd &label_txg) == 0);
3820168404Spjd
3821219089Spjd /*
3822219089Spjd * Do we have a better boot device?
3823219089Spjd */
3824219089Spjd if (label_txg > *txg) {
3825219089Spjd *txg = label_txg;
3826219089Spjd *avd = vd;
3827185029Spjd }
3828219089Spjd nvlist_free(label);
3829185029Spjd }
3830185029Spjd}
3831185029Spjd
3832185029Spjd/*
3833185029Spjd * Import a root pool.
3834185029Spjd *
3835185029Spjd * For x86, devpath_list will consist of the devid and/or physpath name of
3836185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3837185029Spjd * The GRUB "findroot" command will return the vdev we should boot.
3838185029Spjd *
3839185029Spjd * For Sparc, devpath_list consists of the physpath name of the booting device,
3840185029Spjd * no matter whether the root pool is a single-device pool or a mirrored pool.
3841185029Spjd * e.g.
3842185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a"
3843185029Spjd */
3844185029Spjdint
3845185029Spjdspa_import_rootpool(char *devpath, char *devid)
3846185029Spjd{
3847219089Spjd spa_t *spa;
3848219089Spjd vdev_t *rvd, *bvd, *avd = NULL;
3849219089Spjd nvlist_t *config, *nvtop;
3850219089Spjd uint64_t guid, txg;
3851185029Spjd char *pname;
3852185029Spjd int error;
3853185029Spjd
3854185029Spjd /*
3855219089Spjd * Read the label from the boot device and generate a configuration.
3856185029Spjd */
3857219089Spjd config = spa_generate_rootconf(devpath, devid, &guid);
3858219089Spjd#if defined(_OBP) && defined(_KERNEL)
3859219089Spjd if (config == NULL) {
3860219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) {
3861219089Spjd /* iscsi boot */
3862219089Spjd get_iscsi_bootpath_phy(devpath);
3863219089Spjd config = spa_generate_rootconf(devpath, devid, &guid);
3864219089Spjd }
3865219089Spjd }
3866219089Spjd#endif
3867219089Spjd if (config == NULL) {
3868236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3869219089Spjd devpath);
3870249195Smm return (SET_ERROR(EIO));
3871219089Spjd }
3872185029Spjd
3873219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3874219089Spjd &pname) == 0);
3875219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3876185029Spjd
3877209962Smm mutex_enter(&spa_namespace_lock);
3878209962Smm if ((spa = spa_lookup(pname)) != NULL) {
3879209962Smm /*
3880209962Smm * Remove the existing root pool from the namespace so that we
3881209962Smm * can replace it with the correct config we just read in.
3882209962Smm */
3883209962Smm spa_remove(spa);
3884209962Smm }
3885185029Spjd
3886219089Spjd spa = spa_add(pname, config, NULL);
3887209962Smm spa->spa_is_root = B_TRUE;
3888219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3889209962Smm
3890219089Spjd /*
3891219089Spjd * Build up a vdev tree based on the boot device's label config.
3892219089Spjd */
3893219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3894219089Spjd &nvtop) == 0);
3895219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3896219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3897219089Spjd VDEV_ALLOC_ROOTPOOL);
3898219089Spjd spa_config_exit(spa, SCL_ALL, FTAG);
3899219089Spjd if (error) {
3900209962Smm mutex_exit(&spa_namespace_lock);
3901219089Spjd nvlist_free(config);
3902219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3903219089Spjd pname);
3904219089Spjd return (error);
3905209962Smm }
3906209962Smm
3907219089Spjd /*
3908219089Spjd * Get the boot vdev.
3909219089Spjd */
3910219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3911219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
3912219089Spjd (u_longlong_t)guid);
3913249195Smm error = SET_ERROR(ENOENT);
3914219089Spjd goto out;
3915219089Spjd }
3916209962Smm
3917219089Spjd /*
3918219089Spjd * Determine if there is a better boot device.
3919219089Spjd */
3920219089Spjd avd = bvd;
3921219089Spjd spa_alt_rootvdev(rvd, &avd, &txg);
3922219089Spjd if (avd != bvd) {
3923219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3924219089Spjd "try booting from '%s'", avd->vdev_path);
3925249195Smm error = SET_ERROR(EINVAL);
3926219089Spjd goto out;
3927219089Spjd }
3928209962Smm
3929219089Spjd /*
3930219089Spjd * If the boot device is part of a spare vdev then ensure that
3931219089Spjd * we're booting off the active spare.
3932219089Spjd */
3933219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3934219089Spjd !bvd->vdev_isspare) {
3935219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3936219089Spjd "try booting from '%s'",
3937219089Spjd bvd->vdev_parent->
3938219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
3939249195Smm error = SET_ERROR(EINVAL);
3940219089Spjd goto out;
3941219089Spjd }
3942209962Smm
3943219089Spjd error = 0;
3944219089Spjdout:
3945219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3946219089Spjd vdev_free(rvd);
3947219089Spjd spa_config_exit(spa, SCL_ALL, FTAG);
3948209962Smm mutex_exit(&spa_namespace_lock);
3949209962Smm
3950219089Spjd nvlist_free(config);
3951219089Spjd return (error);
3952185029Spjd}
3953185029Spjd
3954241286Savg#else
3955241286Savg
3956243502Savgextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
3957243502Savg uint64_t *count);
3958241286Savg
3959241286Savgstatic nvlist_t *
3960241286Savgspa_generate_rootconf(const char *name)
3961241286Savg{
3962243502Savg nvlist_t **configs, **tops;
3963241286Savg nvlist_t *config;
3964243502Savg nvlist_t *best_cfg, *nvtop, *nvroot;
3965243502Savg uint64_t *holes;
3966243502Savg uint64_t best_txg;
3967243213Savg uint64_t nchildren;
3968241286Savg uint64_t pgid;
3969243502Savg uint64_t count;
3970243502Savg uint64_t i;
3971243502Savg uint_t nholes;
3972241286Savg
3973243502Savg if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
3974241286Savg return (NULL);
3975241286Savg
3976243502Savg ASSERT3U(count, !=, 0);
3977243502Savg best_txg = 0;
3978243502Savg for (i = 0; i < count; i++) {
3979243502Savg uint64_t txg;
3980243502Savg
3981243502Savg VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
3982243502Savg &txg) == 0);
3983243502Savg if (txg > best_txg) {
3984243502Savg best_txg = txg;
3985243502Savg best_cfg = configs[i];
3986243502Savg }
3987243502Savg }
3988243502Savg
3989245945Savg nchildren = 1;
3990245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
3991243502Savg holes = NULL;
3992243502Savg nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
3993243502Savg &holes, &nholes);
3994243502Savg
3995244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
3996243502Savg for (i = 0; i < nchildren; i++) {
3997243502Savg if (i >= count)
3998243502Savg break;
3999243502Savg if (configs[i] == NULL)
4000243502Savg continue;
4001243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
4002243502Savg &nvtop) == 0);
4003243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP);
4004243213Savg }
4005243502Savg for (i = 0; holes != NULL && i < nholes; i++) {
4006243502Savg if (i >= nchildren)
4007243502Savg continue;
4008243502Savg if (tops[holes[i]] != NULL)
4009243502Savg continue;
4010243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
4011243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
4012243502Savg VDEV_TYPE_HOLE) == 0);
4013243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
4014243502Savg holes[i]) == 0);
4015243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
4016243502Savg 0) == 0);
4017243502Savg }
4018243502Savg for (i = 0; i < nchildren; i++) {
4019243502Savg if (tops[i] != NULL)
4020243502Savg continue;
4021243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
4022243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
4023243502Savg VDEV_TYPE_MISSING) == 0);
4024243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
4025243502Savg i) == 0);
4026243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
4027243502Savg 0) == 0);
4028243502Savg }
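	/*
	 * Every child ID now has a top-level entry: the vdev tree from a
	 * matching label where one was found, an explicit hole where the
	 * best label recorded one, and a "missing" placeholder otherwise.
	 */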
4029243213Savg 4030243213Savg /* 4031243502Savg * Create pool config based on the best vdev config. 4032241286Savg */ 4033243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 4034241286Savg 4035241286Savg /* 4036241286Savg * Put this pool's top-level vdevs into a root vdev. 4037241286Savg */ 4038243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4039243502Savg &pgid) == 0); 4040241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4041241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 4042241286Savg VDEV_TYPE_ROOT) == 0); 4043241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 4044241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 4045241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 4046243502Savg tops, nchildren) == 0); 4047241286Savg 4048241286Savg /* 4049241286Savg * Replace the existing vdev_tree with the new root vdev in 4050241286Savg * this pool's configuration (remove the old, add the new). 4051241286Savg */ 4052241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 4053243502Savg 4054243502Savg /* 4055243502Savg * Drop vdev config elements that should not be present at pool level. 4056243502Savg */ 4057243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 4058243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 4059243502Savg 4060243502Savg for (i = 0; i < count; i++) 4061243502Savg nvlist_free(configs[i]); 4062243502Savg kmem_free(configs, count * sizeof(void *)); 4063243502Savg for (i = 0; i < nchildren; i++) 4064243502Savg nvlist_free(tops[i]); 4065243502Savg kmem_free(tops, nchildren * sizeof(void *)); 4066241286Savg nvlist_free(nvroot); 4067241286Savg return (config); 4068241286Savg} 4069241286Savg 4070241286Savgint 4071241286Savgspa_import_rootpool(const char *name) 4072241286Savg{ 4073241286Savg spa_t *spa; 4074241286Savg vdev_t *rvd, *bvd, *avd = NULL; 4075241286Savg nvlist_t *config, *nvtop; 4076241286Savg uint64_t txg; 4077241286Savg char *pname; 4078241286Savg int error; 4079241286Savg 4080241286Savg /* 4081241286Savg * Read the label from the boot device and generate a configuration. 4082241286Savg */ 4083241286Savg config = spa_generate_rootconf(name); 4084243213Savg 4085243213Savg mutex_enter(&spa_namespace_lock); 4086243213Savg if (config != NULL) { 4087243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4088243213Savg &pname) == 0 && strcmp(name, pname) == 0); 4089243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 4090243213Savg == 0); 4091243213Savg 4092243213Savg if ((spa = spa_lookup(pname)) != NULL) { 4093243213Savg /* 4094243213Savg * Remove the existing root pool from the namespace so 4095243213Savg * that we can replace it with the correct config 4096243213Savg * we just read in. 4097243213Savg */ 4098243213Savg spa_remove(spa); 4099243213Savg } 4100243213Savg spa = spa_add(pname, config, NULL); 4101243501Savg 4102243501Savg /* 4103243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 4104243501Savg * via spa_version(). 
4105243501Savg */ 4106243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4107243501Savg &spa->spa_ubsync.ub_version) != 0) 4108243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4109243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 4110241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 4111241286Savg name); 4112241286Savg return (EIO); 4113243213Savg } else { 4114243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 4115241286Savg } 4116241286Savg spa->spa_is_root = B_TRUE; 4117241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4118241286Savg 4119241286Savg /* 4120241286Savg * Build up a vdev tree based on the boot device's label config. 4121241286Savg */ 4122241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4123241286Savg &nvtop) == 0); 4124241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4125241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4126241286Savg VDEV_ALLOC_ROOTPOOL); 4127241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4128241286Savg if (error) { 4129241286Savg mutex_exit(&spa_namespace_lock); 4130241286Savg nvlist_free(config); 4131241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4132241286Savg pname); 4133241286Savg return (error); 4134241286Savg } 4135241286Savg 4136241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4137241286Savg vdev_free(rvd); 4138241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 4139241286Savg mutex_exit(&spa_namespace_lock); 4140241286Savg 4141243213Savg nvlist_free(config); 4142243213Savg return (0); 4143241286Savg} 4144241286Savg 4145241286Savg#endif /* sun */ 4146219089Spjd#endif 4147219089Spjd 4148209962Smm/* 4149209962Smm * Import a non-root pool into the system. 4150209962Smm */ 4151185029Spjdint 4152219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4153185029Spjd{ 4154209962Smm spa_t *spa; 4155209962Smm char *altroot = NULL; 4156219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 4157219089Spjd zpool_rewind_policy_t policy; 4158219089Spjd uint64_t mode = spa_mode_global; 4159219089Spjd uint64_t readonly = B_FALSE; 4160209962Smm int error; 4161209962Smm nvlist_t *nvroot; 4162209962Smm nvlist_t **spares, **l2cache; 4163209962Smm uint_t nspares, nl2cache; 4164209962Smm 4165209962Smm /* 4166209962Smm * If a pool with this name exists, return failure. 4167209962Smm */ 4168209962Smm mutex_enter(&spa_namespace_lock); 4169219089Spjd if (spa_lookup(pool) != NULL) { 4170209962Smm mutex_exit(&spa_namespace_lock); 4171249195Smm return (SET_ERROR(EEXIST)); 4172209962Smm } 4173209962Smm 4174209962Smm /* 4175209962Smm * Create and initialize the spa structure. 4176209962Smm */ 4177209962Smm (void) nvlist_lookup_string(props, 4178209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4179219089Spjd (void) nvlist_lookup_uint64(props, 4180219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4181219089Spjd if (readonly) 4182219089Spjd mode = FREAD; 4183219089Spjd spa = spa_add(pool, config, altroot); 4184219089Spjd spa->spa_import_flags = flags; 4185209962Smm 4186209962Smm /* 4187219089Spjd * Verbatim import - Take a pool and insert it into the namespace 4188219089Spjd * as if it had been loaded at boot. 
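 *
 * An illustrative call (hypothetical pool name), for a caller that
 * already trusts the supplied config:
 *
 *	error = spa_import("tank", config, NULL, ZFS_IMPORT_VERBATIM);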
4189219089Spjd */ 4190219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4191219089Spjd if (props != NULL) 4192219089Spjd spa_configfile_set(spa, props, B_FALSE); 4193219089Spjd 4194219089Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 4195219089Spjd 4196219089Spjd mutex_exit(&spa_namespace_lock); 4197219089Spjd return (0); 4198219089Spjd } 4199219089Spjd 4200219089Spjd spa_activate(spa, mode); 4201219089Spjd 4202219089Spjd /* 4203209962Smm * Don't start async tasks until we know everything is healthy. 4204209962Smm */ 4205209962Smm spa_async_suspend(spa); 4206209962Smm 4207219089Spjd zpool_get_rewind_policy(config, &policy); 4208219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 4209219089Spjd state = SPA_LOAD_RECOVER; 4210219089Spjd 4211209962Smm /* 4212209962Smm * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4213209962Smm * because the user-supplied config is actually the one to trust when 4214209962Smm * doing an import. 4215209962Smm */ 4216219089Spjd if (state != SPA_LOAD_RECOVER) 4217219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4218209962Smm 4219219089Spjd error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4220219089Spjd policy.zrp_request); 4221219089Spjd 4222219089Spjd /* 4223219089Spjd * Propagate anything learned while loading the pool and pass it 4224219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 4225219089Spjd */ 4226219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4227219089Spjd spa->spa_load_info) == 0); 4228219089Spjd 4229209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4230209962Smm /* 4231209962Smm * Toss any existing sparelist, as it doesn't have any validity 4232209962Smm * anymore, and conflicts with spa_has_spare(). 4233209962Smm */ 4234209962Smm if (spa->spa_spares.sav_config) { 4235209962Smm nvlist_free(spa->spa_spares.sav_config); 4236209962Smm spa->spa_spares.sav_config = NULL; 4237209962Smm spa_load_spares(spa); 4238209962Smm } 4239209962Smm if (spa->spa_l2cache.sav_config) { 4240209962Smm nvlist_free(spa->spa_l2cache.sav_config); 4241209962Smm spa->spa_l2cache.sav_config = NULL; 4242209962Smm spa_load_l2cache(spa); 4243209962Smm } 4244209962Smm 4245209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4246209962Smm &nvroot) == 0); 4247209962Smm if (error == 0) 4248209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4249209962Smm VDEV_ALLOC_SPARE); 4250209962Smm if (error == 0) 4251209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4252209962Smm VDEV_ALLOC_L2CACHE); 4253209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4254209962Smm 4255209962Smm if (props != NULL) 4256209962Smm spa_configfile_set(spa, props, B_FALSE); 4257209962Smm 4258209962Smm if (error != 0 || (props && spa_writeable(spa) && 4259209962Smm (error = spa_prop_set(spa, props)))) { 4260209962Smm spa_unload(spa); 4261209962Smm spa_deactivate(spa); 4262209962Smm spa_remove(spa); 4263209962Smm mutex_exit(&spa_namespace_lock); 4264209962Smm return (error); 4265209962Smm } 4266209962Smm 4267209962Smm spa_async_resume(spa); 4268209962Smm 4269209962Smm /* 4270209962Smm * Override any spares and level 2 cache devices as specified by 4271209962Smm * the user, as these may have correct device names/devids, etc. 
4272209962Smm */ 4273209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4274209962Smm &spares, &nspares) == 0) { 4275209962Smm if (spa->spa_spares.sav_config) 4276209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4277209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4278209962Smm else 4279209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4280209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4281209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4282209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4283209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4284209962Smm spa_load_spares(spa); 4285209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4286209962Smm spa->spa_spares.sav_sync = B_TRUE; 4287209962Smm } 4288209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4289209962Smm &l2cache, &nl2cache) == 0) { 4290209962Smm if (spa->spa_l2cache.sav_config) 4291209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4292209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4293209962Smm else 4294209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4295209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4296209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4297209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4298209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4299209962Smm spa_load_l2cache(spa); 4300209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4301209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 4302209962Smm } 4303209962Smm 4304219089Spjd /* 4305219089Spjd * Check for any removed devices. 4306219089Spjd */ 4307219089Spjd if (spa->spa_autoreplace) { 4308219089Spjd spa_aux_check_removed(&spa->spa_spares); 4309219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 4310219089Spjd } 4311219089Spjd 4312209962Smm if (spa_writeable(spa)) { 4313209962Smm /* 4314209962Smm * Update the config cache to include the newly-imported pool. 4315209962Smm */ 4316209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4317209962Smm } 4318209962Smm 4319219089Spjd /* 4320219089Spjd * It's possible that the pool was expanded while it was exported. 4321219089Spjd * We kick off an async task to handle this for us. 4322219089Spjd */ 4323219089Spjd spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4324219089Spjd 4325209962Smm mutex_exit(&spa_namespace_lock); 4326248571Smm spa_history_log_version(spa, "import"); 4327209962Smm 4328219089Spjd#ifdef __FreeBSD__ 4329219089Spjd#ifdef _KERNEL 4330219089Spjd zvol_create_minors(pool); 4331219089Spjd#endif 4332219089Spjd#endif 4333209962Smm return (0); 4334185029Spjd} 4335185029Spjd 4336168404Spjdnvlist_t * 4337168404Spjdspa_tryimport(nvlist_t *tryconfig) 4338168404Spjd{ 4339168404Spjd nvlist_t *config = NULL; 4340168404Spjd char *poolname; 4341168404Spjd spa_t *spa; 4342168404Spjd uint64_t state; 4343208443Smm int error; 4344168404Spjd 4345168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4346168404Spjd return (NULL); 4347168404Spjd 4348168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4349168404Spjd return (NULL); 4350168404Spjd 4351168404Spjd /* 4352168404Spjd * Create and initialize the spa structure. 4353168404Spjd */ 4354168404Spjd mutex_enter(&spa_namespace_lock); 4355219089Spjd spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4356209962Smm spa_activate(spa, FREAD); 4357168404Spjd 4358168404Spjd /* 4359168404Spjd * Pass off the heavy lifting to spa_load(). 
4360168404Spjd * Pass TRUE for mosconfig because the user-supplied config 4361168404Spjd * is actually the one to trust when doing an import. 4362168404Spjd */ 4363219089Spjd error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4364168404Spjd 4365168404Spjd /* 4366168404Spjd * If 'tryconfig' was at least parsable, return the current config. 4367168404Spjd */ 4368168404Spjd if (spa->spa_root_vdev != NULL) { 4369168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4370168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4371168404Spjd poolname) == 0); 4372168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4373168404Spjd state) == 0); 4374168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4375168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 4376236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4377236884Smm spa->spa_load_info) == 0); 4378168404Spjd 4379168404Spjd /* 4380185029Spjd * If the bootfs property exists on this pool then we 4381185029Spjd * copy it out so that external consumers can tell which 4382185029Spjd * pools are bootable. 4383168404Spjd */ 4384208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 4385185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4386185029Spjd 4387185029Spjd /* 4388185029Spjd * We have to play games with the name since the 4389185029Spjd * pool was opened as TRYIMPORT_NAME. 4390185029Spjd */ 4391185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 4392185029Spjd spa->spa_bootfs, tmpname) == 0) { 4393185029Spjd char *cp; 4394185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4395185029Spjd 4396185029Spjd cp = strchr(tmpname, '/'); 4397185029Spjd if (cp == NULL) { 4398185029Spjd (void) strlcpy(dsname, tmpname, 4399185029Spjd MAXPATHLEN); 4400185029Spjd } else { 4401185029Spjd (void) snprintf(dsname, MAXPATHLEN, 4402185029Spjd "%s/%s", poolname, ++cp); 4403185029Spjd } 4404185029Spjd VERIFY(nvlist_add_string(config, 4405185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4406185029Spjd kmem_free(dsname, MAXPATHLEN); 4407185029Spjd } 4408185029Spjd kmem_free(tmpname, MAXPATHLEN); 4409185029Spjd } 4410185029Spjd 4411185029Spjd /* 4412185029Spjd * Add the list of hot spares and level 2 cache devices. 4413185029Spjd */ 4414209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4415168404Spjd spa_add_spares(spa, config); 4416185029Spjd spa_add_l2cache(spa, config); 4417209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 4418168404Spjd } 4419168404Spjd 4420168404Spjd spa_unload(spa); 4421168404Spjd spa_deactivate(spa); 4422168404Spjd spa_remove(spa); 4423168404Spjd mutex_exit(&spa_namespace_lock); 4424168404Spjd 4425168404Spjd return (config); 4426168404Spjd} 4427168404Spjd 4428168404Spjd/* 4429168404Spjd * Pool export/destroy 4430168404Spjd * 4431168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 4432168404Spjd * is no more pending I/O and any references to the pool are gone. Then, we 4433168404Spjd * update the pool state and sync all the labels to disk, removing the 4434207670Smm * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4435207670Smm * we don't sync the labels or remove the configuration cache. 
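 *
 * (The three wrappers below map onto this routine as follows:
 * spa_destroy() passes POOL_STATE_DESTROYED, spa_export() passes
 * POOL_STATE_EXPORTED, and spa_reset() passes
 * POOL_STATE_UNINITIALIZED.)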
4436168404Spjd */
4437168404Spjdstatic int
4438185029Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
4439207670Smm    boolean_t force, boolean_t hardforce)
4440168404Spjd{
4441168404Spjd	spa_t *spa;
4442168404Spjd
4443168404Spjd	if (oldconfig)
4444168404Spjd		*oldconfig = NULL;
4445168404Spjd
4446209962Smm	if (!(spa_mode_global & FWRITE))
4447249195Smm		return (SET_ERROR(EROFS));
4448168404Spjd
4449168404Spjd	mutex_enter(&spa_namespace_lock);
4450168404Spjd	if ((spa = spa_lookup(pool)) == NULL) {
4451168404Spjd		mutex_exit(&spa_namespace_lock);
4452249195Smm		return (SET_ERROR(ENOENT));
4453168404Spjd	}
4454168404Spjd
4455168404Spjd	/*
4456168404Spjd	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
4457168404Spjd	 * reacquire the namespace lock, and see if we can export.
4458168404Spjd	 */
4459168404Spjd	spa_open_ref(spa, FTAG);
4460168404Spjd	mutex_exit(&spa_namespace_lock);
4461168404Spjd	spa_async_suspend(spa);
4462168404Spjd	mutex_enter(&spa_namespace_lock);
4463168404Spjd	spa_close(spa, FTAG);
4464168404Spjd
4465168404Spjd	/*
4466168404Spjd	 * The pool will be in core if it's openable,
4467168404Spjd	 * in which case we can modify its state.
4468168404Spjd	 */
4469168404Spjd	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
4470168404Spjd		/*
4471168404Spjd		 * Objsets may be open only because they're dirty, so we
4472168404Spjd		 * have to force the pool to sync before checking spa_refcnt.
4473168404Spjd		 */
4474168404Spjd		txg_wait_synced(spa->spa_dsl_pool, 0);
4475168404Spjd
4476168404Spjd		/*
4477168404Spjd		 * A pool cannot be exported or destroyed if there are active
4478168404Spjd		 * references. If we are resetting a pool, allow references by
4479168404Spjd		 * fault injection handlers.
4480168404Spjd		 */
4481168404Spjd		if (!spa_refcount_zero(spa) ||
4482168404Spjd		    (spa->spa_inject_ref != 0 &&
4483168404Spjd		    new_state != POOL_STATE_UNINITIALIZED)) {
4484168404Spjd			spa_async_resume(spa);
4485168404Spjd			mutex_exit(&spa_namespace_lock);
4486249195Smm			return (SET_ERROR(EBUSY));
4487168404Spjd		}
4488168404Spjd
4489185029Spjd		/*
4490185029Spjd		 * A pool cannot be exported if it has an active shared spare.
4491185029Spjd		 * This is to prevent other pools stealing the active spare
4492185029Spjd		 * from an exported pool. Such a pool can still be
4493185029Spjd		 * force-exported at the user's discretion.
4494185029Spjd		 */
4495185029Spjd		if (!force && new_state == POOL_STATE_EXPORTED &&
4496185029Spjd		    spa_has_active_shared_spare(spa)) {
4497185029Spjd			spa_async_resume(spa);
4498185029Spjd			mutex_exit(&spa_namespace_lock);
4499249195Smm			return (SET_ERROR(EXDEV));
4500185029Spjd		}
4501168404Spjd
4502168404Spjd		/*
4503168404Spjd		 * We want this to be reflected on every label,
4504168404Spjd		 * so mark them all dirty. spa_unload() will do the
4505168404Spjd		 * final sync that pushes these changes out.
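		 * (Note that spa_final_txg is set TXG_DEFER_SIZE + 1 txgs
		 * past the last synced txg below, leaving room for
		 * already-deferred frees to sync out before the pool goes
		 * away.)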
4506168404Spjd */ 4507207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4508185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4509168404Spjd spa->spa_state = new_state; 4510219089Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 4511219089Spjd TXG_DEFER_SIZE + 1; 4512168404Spjd vdev_config_dirty(spa->spa_root_vdev); 4513185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4514168404Spjd } 4515168404Spjd } 4516168404Spjd 4517185029Spjd spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4518185029Spjd 4519168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4520168404Spjd spa_unload(spa); 4521168404Spjd spa_deactivate(spa); 4522168404Spjd } 4523168404Spjd 4524168404Spjd if (oldconfig && spa->spa_config) 4525168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4526168404Spjd 4527168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 4528207670Smm if (!hardforce) 4529207670Smm spa_config_sync(spa, B_TRUE, B_TRUE); 4530168404Spjd spa_remove(spa); 4531168404Spjd } 4532168404Spjd mutex_exit(&spa_namespace_lock); 4533168404Spjd 4534168404Spjd return (0); 4535168404Spjd} 4536168404Spjd 4537168404Spjd/* 4538168404Spjd * Destroy a storage pool. 4539168404Spjd */ 4540168404Spjdint 4541168404Spjdspa_destroy(char *pool) 4542168404Spjd{ 4543207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4544207670Smm B_FALSE, B_FALSE)); 4545168404Spjd} 4546168404Spjd 4547168404Spjd/* 4548168404Spjd * Export a storage pool. 4549168404Spjd */ 4550168404Spjdint 4551207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4552207670Smm boolean_t hardforce) 4553168404Spjd{ 4554207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4555207670Smm force, hardforce)); 4556168404Spjd} 4557168404Spjd 4558168404Spjd/* 4559168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 4560168404Spjd * from the namespace in any way. 4561168404Spjd */ 4562168404Spjdint 4563168404Spjdspa_reset(char *pool) 4564168404Spjd{ 4565185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4566207670Smm B_FALSE, B_FALSE)); 4567168404Spjd} 4568168404Spjd 4569168404Spjd/* 4570168404Spjd * ========================================================================== 4571168404Spjd * Device manipulation 4572168404Spjd * ========================================================================== 4573168404Spjd */ 4574168404Spjd 4575168404Spjd/* 4576185029Spjd * Add a device to a storage pool. 
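 *
 * The nvroot argument is the same vdev-tree nvlist that userland builds
 * for zpool(8). A hand-rolled sketch for adding one hot spare
 * (hypothetical device path; error handling elided):
 *
 *	nvlist_t *nvroot, *spare;
 *
 *	VERIFY(nvlist_alloc(&spare, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_DISK) == 0);
 *	VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_PATH,
 *	    "/dev/da9") == 0);
 *	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_ROOT) == 0);
 *	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 *	    &spare, 1) == 0);
 *	error = spa_vdev_add(spa, nvroot);
 *	nvlist_free(spare);
 *	nvlist_free(nvroot);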
4577168404Spjd */ 4578168404Spjdint 4579168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4580168404Spjd{ 4581219089Spjd uint64_t txg, id; 4582209962Smm int error; 4583168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4584168404Spjd vdev_t *vd, *tvd; 4585185029Spjd nvlist_t **spares, **l2cache; 4586185029Spjd uint_t nspares, nl2cache; 4587168404Spjd 4588219089Spjd ASSERT(spa_writeable(spa)); 4589219089Spjd 4590168404Spjd txg = spa_vdev_enter(spa); 4591168404Spjd 4592168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4593168404Spjd VDEV_ALLOC_ADD)) != 0) 4594168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 4595168404Spjd 4596185029Spjd spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4597168404Spjd 4598185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4599185029Spjd &nspares) != 0) 4600168404Spjd nspares = 0; 4601168404Spjd 4602185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4603185029Spjd &nl2cache) != 0) 4604185029Spjd nl2cache = 0; 4605185029Spjd 4606185029Spjd if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4607168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4608168404Spjd 4609185029Spjd if (vd->vdev_children != 0 && 4610185029Spjd (error = vdev_create(vd, txg, B_FALSE)) != 0) 4611185029Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4612168404Spjd 4613168404Spjd /* 4614185029Spjd * We must validate the spares and l2cache devices after checking the 4615185029Spjd * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4616168404Spjd */ 4617185029Spjd if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4618168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4619168404Spjd 4620168404Spjd /* 4621168404Spjd * Transfer each new top-level vdev from vd to rvd. 4622168404Spjd */ 4623209962Smm for (int c = 0; c < vd->vdev_children; c++) { 4624219089Spjd 4625219089Spjd /* 4626219089Spjd * Set the vdev id to the first hole, if one exists. 4627219089Spjd */ 4628219089Spjd for (id = 0; id < rvd->vdev_children; id++) { 4629219089Spjd if (rvd->vdev_child[id]->vdev_ishole) { 4630219089Spjd vdev_free(rvd->vdev_child[id]); 4631219089Spjd break; 4632219089Spjd } 4633219089Spjd } 4634168404Spjd tvd = vd->vdev_child[c]; 4635168404Spjd vdev_remove_child(vd, tvd); 4636219089Spjd tvd->vdev_id = id; 4637168404Spjd vdev_add_child(rvd, tvd); 4638168404Spjd vdev_config_dirty(tvd); 4639168404Spjd } 4640168404Spjd 4641168404Spjd if (nspares != 0) { 4642185029Spjd spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4643185029Spjd ZPOOL_CONFIG_SPARES); 4644168404Spjd spa_load_spares(spa); 4645185029Spjd spa->spa_spares.sav_sync = B_TRUE; 4646168404Spjd } 4647168404Spjd 4648185029Spjd if (nl2cache != 0) { 4649185029Spjd spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4650185029Spjd ZPOOL_CONFIG_L2CACHE); 4651185029Spjd spa_load_l2cache(spa); 4652185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 4653185029Spjd } 4654185029Spjd 4655168404Spjd /* 4656168404Spjd * We have to be careful when adding new vdevs to an existing pool. 4657168404Spjd * If other threads start allocating from these vdevs before we 4658168404Spjd * sync the config cache, and we lose power, then upon reboot we may 4659168404Spjd * fail to open the pool because there are DVAs that the config cache 4660168404Spjd * can't translate. 
Therefore, we first add the vdevs without 4661168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4662168404Spjd * and then let spa_config_update() initialize the new metaslabs. 4663168404Spjd * 4664168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 4665168404Spjd * if we lose power at any point in this sequence, the remaining 4666168404Spjd * steps will be completed the next time we load the pool. 4667168404Spjd */ 4668168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 4669168404Spjd 4670168404Spjd mutex_enter(&spa_namespace_lock); 4671168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4672168404Spjd mutex_exit(&spa_namespace_lock); 4673168404Spjd 4674168404Spjd return (0); 4675168404Spjd} 4676168404Spjd 4677168404Spjd/* 4678168404Spjd * Attach a device to a mirror. The arguments are the path to any device 4679168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 4680168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 4681168404Spjd * 4682168404Spjd * If 'replacing' is specified, the new device is intended to replace the 4683168404Spjd * existing device; in this case the two devices are made into their own 4684185029Spjd * mirror using the 'replacing' vdev, which is functionally identical to 4685168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 4686168404Spjd * extra rules: you can't attach to it after it's been created, and upon 4687168404Spjd * completion of resilvering, the first disk (the one being replaced) 4688168404Spjd * is automatically detached. 4689168404Spjd */ 4690168404Spjdint 4691168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4692168404Spjd{ 4693219089Spjd uint64_t txg, dtl_max_txg; 4694168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4695168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4696168404Spjd vdev_ops_t *pvops; 4697185029Spjd char *oldvdpath, *newvdpath; 4698185029Spjd int newvd_isspare; 4699185029Spjd int error; 4700168404Spjd 4701219089Spjd ASSERT(spa_writeable(spa)); 4702219089Spjd 4703168404Spjd txg = spa_vdev_enter(spa); 4704168404Spjd 4705185029Spjd oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4706168404Spjd 4707168404Spjd if (oldvd == NULL) 4708168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4709168404Spjd 4710168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 4711168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4712168404Spjd 4713168404Spjd pvd = oldvd->vdev_parent; 4714168404Spjd 4715168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4716230514Smm VDEV_ALLOC_ATTACH)) != 0) 4717185029Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4718185029Spjd 4719185029Spjd if (newrootvd->vdev_children != 1) 4720168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4721168404Spjd 4722168404Spjd newvd = newrootvd->vdev_child[0]; 4723168404Spjd 4724168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 4725168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4726168404Spjd 4727168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4728168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 4729168404Spjd 4730185029Spjd /* 4731185029Spjd * Spares can't replace logs 4732185029Spjd */ 4733185029Spjd if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4734185029Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4735185029Spjd 4736168404Spjd if (!replacing) { 4737168404Spjd /* 
4738168404Spjd * For attach, the only allowable parent is a mirror or the root 4739168404Spjd * vdev. 4740168404Spjd */ 4741168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 4742168404Spjd pvd->vdev_ops != &vdev_root_ops) 4743168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4744168404Spjd 4745168404Spjd pvops = &vdev_mirror_ops; 4746168404Spjd } else { 4747168404Spjd /* 4748168404Spjd * Active hot spares can only be replaced by inactive hot 4749168404Spjd * spares. 4750168404Spjd */ 4751168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4752219089Spjd oldvd->vdev_isspare && 4753168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 4754168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4755168404Spjd 4756168404Spjd /* 4757168404Spjd * If the source is a hot spare, and the parent isn't already a 4758168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 4759168404Spjd * want to create a replacing vdev. The user is not allowed to 4760168404Spjd * attach to a spared vdev child unless the 'isspare' state is 4761168404Spjd * the same (spare replaces spare, non-spare replaces 4762168404Spjd * non-spare). 4763168404Spjd */ 4764219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && 4765219089Spjd spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4766168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4767219089Spjd } else if (pvd->vdev_ops == &vdev_spare_ops && 4768219089Spjd newvd->vdev_isspare != oldvd->vdev_isspare) { 4769168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4770219089Spjd } 4771219089Spjd 4772219089Spjd if (newvd->vdev_isspare) 4773168404Spjd pvops = &vdev_spare_ops; 4774168404Spjd else 4775168404Spjd pvops = &vdev_replacing_ops; 4776168404Spjd } 4777168404Spjd 4778168404Spjd /* 4779219089Spjd * Make sure the new device is big enough. 4780168404Spjd */ 4781219089Spjd if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4782168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4783168404Spjd 4784168404Spjd /* 4785168404Spjd * The new device cannot have a higher alignment requirement 4786168404Spjd * than the top-level vdev. 4787168404Spjd */ 4788168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4789168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4790168404Spjd 4791168404Spjd /* 4792168404Spjd * If this is an in-place replacement, update oldvd's path and devid 4793168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 4794168404Spjd */ 4795168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4796168404Spjd spa_strfree(oldvd->vdev_path); 4797168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4798168404Spjd KM_SLEEP); 4799168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 4800168404Spjd newvd->vdev_path, "old"); 4801168404Spjd if (oldvd->vdev_devid != NULL) { 4802168404Spjd spa_strfree(oldvd->vdev_devid); 4803168404Spjd oldvd->vdev_devid = NULL; 4804168404Spjd } 4805168404Spjd } 4806168404Spjd 4807219089Spjd /* mark the device being resilvered */ 4808254112Sdelphij newvd->vdev_resilver_txg = txg; 4809219089Spjd 4810168404Spjd /* 4811168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 4812168404Spjd * mirror/replacing/spare vdev above oldvd. 
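	 *
	 * For example, a plain attach of B next to a single disk A,
	 * versus a replace of A by B:
	 *
	 *	attach:   root -> A   becomes   root -> mirror(A, B)
	 *	replace:  root -> A   becomes   root -> replacing(A, B)
	 *
	 * where the replacing vdev detaches A on its own once B has
	 * finished resilvering.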
4813168404Spjd */ 4814168404Spjd if (pvd->vdev_ops != pvops) 4815168404Spjd pvd = vdev_add_parent(oldvd, pvops); 4816168404Spjd 4817168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 4818168404Spjd ASSERT(pvd->vdev_ops == pvops); 4819168404Spjd ASSERT(oldvd->vdev_parent == pvd); 4820168404Spjd 4821168404Spjd /* 4822168404Spjd * Extract the new device from its root and add it to pvd. 4823168404Spjd */ 4824168404Spjd vdev_remove_child(newrootvd, newvd); 4825168404Spjd newvd->vdev_id = pvd->vdev_children; 4826219089Spjd newvd->vdev_crtxg = oldvd->vdev_crtxg; 4827168404Spjd vdev_add_child(pvd, newvd); 4828168404Spjd 4829168404Spjd tvd = newvd->vdev_top; 4830168404Spjd ASSERT(pvd->vdev_top == tvd); 4831168404Spjd ASSERT(tvd->vdev_parent == rvd); 4832168404Spjd 4833168404Spjd vdev_config_dirty(tvd); 4834168404Spjd 4835168404Spjd /* 4836219089Spjd * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4837219089Spjd * for any dmu_sync-ed blocks. It will propagate upward when 4838219089Spjd * spa_vdev_exit() calls vdev_dtl_reassess(). 4839168404Spjd */ 4840219089Spjd dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4841168404Spjd 4842219089Spjd vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4843219089Spjd dtl_max_txg - TXG_INITIAL); 4844168404Spjd 4845209962Smm if (newvd->vdev_isspare) { 4846168404Spjd spa_spare_activate(newvd); 4847209962Smm spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4848209962Smm } 4849209962Smm 4850185029Spjd oldvdpath = spa_strdup(oldvd->vdev_path); 4851185029Spjd newvdpath = spa_strdup(newvd->vdev_path); 4852185029Spjd newvd_isspare = newvd->vdev_isspare; 4853168404Spjd 4854168404Spjd /* 4855168404Spjd * Mark newvd's DTL dirty in this txg. 4856168404Spjd */ 4857168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 4858168404Spjd 4859219089Spjd /* 4860258717Savg * Schedule the resilver to restart in the future. We do this to 4861258717Savg * ensure that dmu_sync-ed blocks have been stitched into the 4862258717Savg * respective datasets. 4863219089Spjd */ 4864219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4865168404Spjd 4866219089Spjd /* 4867219089Spjd * Commit the config 4868219089Spjd */ 4869219089Spjd (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4870185029Spjd 4871248571Smm spa_history_log_internal(spa, "vdev attach", NULL, 4872219089Spjd "%s vdev=%s %s vdev=%s", 4873219089Spjd replacing && newvd_isspare ? "spare in" : 4874219089Spjd replacing ? "replace" : "attach", newvdpath, 4875219089Spjd replacing ? "for" : "to", oldvdpath); 4876219089Spjd 4877185029Spjd spa_strfree(oldvdpath); 4878185029Spjd spa_strfree(newvdpath); 4879185029Spjd 4880219089Spjd if (spa->spa_bootfs) 4881219089Spjd spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4882168404Spjd 4883168404Spjd return (0); 4884168404Spjd} 4885168404Spjd 4886168404Spjd/* 4887168404Spjd * Detach a device from a mirror or replacing vdev. 4888251631Sdelphij * 4889168404Spjd * If 'replace_done' is specified, only detach if the parent 4890168404Spjd * is a replacing vdev. 
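 *
 * When 'replace_done' is set (completion of a replace), the detach is
 * only permitted beneath a 'replacing' or 'spare' parent. A hedged
 * caller sketch (a nonzero pguid pins the expected parent so a racing
 * replace completion cannot detach the wrong disk; see the M(A,R(B,C))
 * discussion in the body):
 *
 *	error = spa_vdev_detach(spa, guid, pguid, B_FALSE);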
4891168404Spjd */ 4892168404Spjdint 4893209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4894168404Spjd{ 4895168404Spjd uint64_t txg; 4896209962Smm int error; 4897168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4898168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 4899168404Spjd boolean_t unspare = B_FALSE; 4900247187Smm uint64_t unspare_guid = 0; 4901219089Spjd char *vdpath; 4902168404Spjd 4903219089Spjd ASSERT(spa_writeable(spa)); 4904219089Spjd 4905168404Spjd txg = spa_vdev_enter(spa); 4906168404Spjd 4907185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4908168404Spjd 4909168404Spjd if (vd == NULL) 4910168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4911168404Spjd 4912168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 4913168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4914168404Spjd 4915168404Spjd pvd = vd->vdev_parent; 4916168404Spjd 4917168404Spjd /* 4918209962Smm * If the parent/child relationship is not as expected, don't do it. 4919209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4920209962Smm * vdev that's replacing B with C. The user's intent in replacing 4921209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 4922209962Smm * the replace by detaching C, the expected behavior is to end up 4923209962Smm * M(A,B). But suppose that right after deciding to detach C, 4924209962Smm * the replacement of B completes. We would have M(A,C), and then 4925209962Smm * ask to detach C, which would leave us with just A -- not what 4926209962Smm * the user wanted. To prevent this, we make sure that the 4927209962Smm * parent/child relationship hasn't changed -- in this example, 4928209962Smm * that C's parent is still the replacing vdev R. 4929209962Smm */ 4930209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 4931209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4932209962Smm 4933209962Smm /* 4934219089Spjd * Only 'replacing' or 'spare' vdevs can be replaced. 4935168404Spjd */ 4936219089Spjd if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4937219089Spjd pvd->vdev_ops != &vdev_spare_ops) 4938219089Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4939168404Spjd 4940168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4941185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 4942168404Spjd 4943168404Spjd /* 4944168404Spjd * Only mirror, replacing, and spare vdevs support detach. 4945168404Spjd */ 4946168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 4947168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 4948168404Spjd pvd->vdev_ops != &vdev_spare_ops) 4949168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4950168404Spjd 4951168404Spjd /* 4952209962Smm * If this device has the only valid copy of some data, 4953209962Smm * we cannot safely detach it. 4954168404Spjd */ 4955209962Smm if (vdev_dtl_required(vd)) 4956168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4957168404Spjd 4958209962Smm ASSERT(pvd->vdev_children >= 2); 4959168404Spjd 4960168404Spjd /* 4961185029Spjd * If we are detaching the second disk from a replacing vdev, then 4962185029Spjd * check to see if we changed the original vdev's path to have "/old" 4963185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 
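	 * E.g. if attaching a new "/dev/da3" in place renamed the original
	 * leaf to "/dev/da3/old" (hypothetical path), and that new disk is
	 * now being detached, the survivor's path is restored to plain
	 * "/dev/da3".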
4964168404Spjd */ 4965219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4966219089Spjd vd->vdev_path != NULL) { 4967219089Spjd size_t len = strlen(vd->vdev_path); 4968219089Spjd 4969219089Spjd for (int c = 0; c < pvd->vdev_children; c++) { 4970219089Spjd cvd = pvd->vdev_child[c]; 4971219089Spjd 4972219089Spjd if (cvd == vd || cvd->vdev_path == NULL) 4973219089Spjd continue; 4974219089Spjd 4975219089Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4976219089Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 4977219089Spjd spa_strfree(cvd->vdev_path); 4978219089Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 4979219089Spjd break; 4980219089Spjd } 4981185029Spjd } 4982185029Spjd } 4983168404Spjd 4984168404Spjd /* 4985168404Spjd * If we are detaching the original disk from a spare, then it implies 4986168404Spjd * that the spare should become a real disk, and be removed from the 4987168404Spjd * active spare list for the pool. 4988168404Spjd */ 4989168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4990219089Spjd vd->vdev_id == 0 && 4991219089Spjd pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4992168404Spjd unspare = B_TRUE; 4993168404Spjd 4994168404Spjd /* 4995168404Spjd * Erase the disk labels so the disk can be used for other things. 4996168404Spjd * This must be done after all other error cases are handled, 4997168404Spjd * but before we disembowel vd (so we can still do I/O to it). 4998168404Spjd * But if we can't do it, don't treat the error as fatal -- 4999168404Spjd * it may be that the unwritability of the disk is the reason 5000168404Spjd * it's being detached! 5001168404Spjd */ 5002168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5003168404Spjd 5004168404Spjd /* 5005168404Spjd * Remove vd from its parent and compact the parent's children. 5006168404Spjd */ 5007168404Spjd vdev_remove_child(pvd, vd); 5008168404Spjd vdev_compact_children(pvd); 5009168404Spjd 5010168404Spjd /* 5011168404Spjd * Remember one of the remaining children so we can get tvd below. 5012168404Spjd */ 5013219089Spjd cvd = pvd->vdev_child[pvd->vdev_children - 1]; 5014168404Spjd 5015168404Spjd /* 5016168404Spjd * If we need to remove the remaining child from the list of hot spares, 5017209962Smm * do it now, marking the vdev as no longer a spare in the process. 5018209962Smm * We must do this before vdev_remove_parent(), because that can 5019209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 5020209962Smm * reason, we must remove the spare now, in the same txg as the detach; 5021209962Smm * otherwise someone could attach a new sibling, change the GUID, and 5022209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 5023168404Spjd */ 5024168404Spjd if (unspare) { 5025168404Spjd ASSERT(cvd->vdev_isspare); 5026168404Spjd spa_spare_remove(cvd); 5027168404Spjd unspare_guid = cvd->vdev_guid; 5028209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 5029219089Spjd cvd->vdev_unspare = B_TRUE; 5030168404Spjd } 5031168404Spjd 5032168404Spjd /* 5033168404Spjd * If the parent mirror/replacing vdev only has one child, 5034168404Spjd * the parent is no longer needed. Remove it from the tree. 
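	 * E.g. detaching B from mirror(A, B) leaves a one-way mirror(A),
	 * which vdev_remove_parent() collapses back to a plain A.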
5035168404Spjd */ 5036219089Spjd if (pvd->vdev_children == 1) { 5037219089Spjd if (pvd->vdev_ops == &vdev_spare_ops) 5038219089Spjd cvd->vdev_unspare = B_FALSE; 5039168404Spjd vdev_remove_parent(cvd); 5040219089Spjd } 5041168404Spjd 5042219089Spjd 5043168404Spjd /* 5044168404Spjd * We don't set tvd until now because the parent we just removed 5045168404Spjd * may have been the previous top-level vdev. 5046168404Spjd */ 5047168404Spjd tvd = cvd->vdev_top; 5048168404Spjd ASSERT(tvd->vdev_parent == rvd); 5049168404Spjd 5050168404Spjd /* 5051168404Spjd * Reevaluate the parent vdev state. 5052168404Spjd */ 5053185029Spjd vdev_propagate_state(cvd); 5054168404Spjd 5055168404Spjd /* 5056219089Spjd * If the 'autoexpand' property is set on the pool then automatically 5057219089Spjd * try to expand the size of the pool. For example if the device we 5058219089Spjd * just detached was smaller than the others, it may be possible to 5059219089Spjd * add metaslabs (i.e. grow the pool). We need to reopen the vdev 5060219089Spjd * first so that we can obtain the updated sizes of the leaf vdevs. 5061168404Spjd */ 5062219089Spjd if (spa->spa_autoexpand) { 5063219089Spjd vdev_reopen(tvd); 5064219089Spjd vdev_expand(tvd, txg); 5065219089Spjd } 5066168404Spjd 5067168404Spjd vdev_config_dirty(tvd); 5068168404Spjd 5069168404Spjd /* 5070168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 5071168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 5072168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 5073168404Spjd * prevent vd from being accessed after it's freed. 5074168404Spjd */ 5075219089Spjd vdpath = spa_strdup(vd->vdev_path); 5076209962Smm for (int t = 0; t < TXG_SIZE; t++) 5077168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5078168404Spjd vd->vdev_detached = B_TRUE; 5079168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 5080168404Spjd 5081185029Spjd spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 5082185029Spjd 5083219089Spjd /* hang on to the spa before we release the lock */ 5084219089Spjd spa_open_ref(spa, FTAG); 5085219089Spjd 5086168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 5087168404Spjd 5088248571Smm spa_history_log_internal(spa, "detach", NULL, 5089219089Spjd "vdev=%s", vdpath); 5090219089Spjd spa_strfree(vdpath); 5091219089Spjd 5092168404Spjd /* 5093168404Spjd * If this was the removal of the original device in a hot spare vdev, 5094168404Spjd * then we want to go through and remove the device from the hot spare 5095168404Spjd * list of every other pool. 
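	 * (Hot spares may be shared between pools, so each other active
	 * pool is visited below and asked to drop its own reference to the
	 * device via spa_vdev_remove().)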
5096168404Spjd */ 5097168404Spjd if (unspare) { 5098219089Spjd spa_t *altspa = NULL; 5099219089Spjd 5100168404Spjd mutex_enter(&spa_namespace_lock); 5101219089Spjd while ((altspa = spa_next(altspa)) != NULL) { 5102219089Spjd if (altspa->spa_state != POOL_STATE_ACTIVE || 5103219089Spjd altspa == spa) 5104168404Spjd continue; 5105219089Spjd 5106219089Spjd spa_open_ref(altspa, FTAG); 5107185029Spjd mutex_exit(&spa_namespace_lock); 5108219089Spjd (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5109185029Spjd mutex_enter(&spa_namespace_lock); 5110219089Spjd spa_close(altspa, FTAG); 5111168404Spjd } 5112168404Spjd mutex_exit(&spa_namespace_lock); 5113219089Spjd 5114219089Spjd /* search the rest of the vdevs for spares to remove */ 5115219089Spjd spa_vdev_resilver_done(spa); 5116168404Spjd } 5117168404Spjd 5118219089Spjd /* all done with the spa; OK to release */ 5119219089Spjd mutex_enter(&spa_namespace_lock); 5120219089Spjd spa_close(spa, FTAG); 5121219089Spjd mutex_exit(&spa_namespace_lock); 5122219089Spjd 5123168404Spjd return (error); 5124168404Spjd} 5125168404Spjd 5126219089Spjd/* 5127219089Spjd * Split a set of devices from their mirrors, and create a new pool from them. 5128219089Spjd */ 5129219089Spjdint 5130219089Spjdspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5131219089Spjd nvlist_t *props, boolean_t exp) 5132219089Spjd{ 5133219089Spjd int error = 0; 5134219089Spjd uint64_t txg, *glist; 5135219089Spjd spa_t *newspa; 5136219089Spjd uint_t c, children, lastlog; 5137219089Spjd nvlist_t **child, *nvl, *tmp; 5138219089Spjd dmu_tx_t *tx; 5139219089Spjd char *altroot = NULL; 5140219089Spjd vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5141219089Spjd boolean_t activate_slog; 5142219089Spjd 5143219089Spjd ASSERT(spa_writeable(spa)); 5144219089Spjd 5145219089Spjd txg = spa_vdev_enter(spa); 5146219089Spjd 5147219089Spjd /* clear the log and flush everything up to now */ 5148219089Spjd activate_slog = spa_passivate_log(spa); 5149219089Spjd (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5150219089Spjd error = spa_offline_log(spa); 5151219089Spjd txg = spa_vdev_config_enter(spa); 5152219089Spjd 5153219089Spjd if (activate_slog) 5154219089Spjd spa_activate_log(spa); 5155219089Spjd 5156219089Spjd if (error != 0) 5157219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5158219089Spjd 5159219089Spjd /* check new spa name before going any further */ 5160219089Spjd if (spa_lookup(newname) != NULL) 5161219089Spjd return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5162219089Spjd 5163219089Spjd /* 5164219089Spjd * scan through all the children to ensure they're all mirrors 5165219089Spjd */ 5166219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5167219089Spjd nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5168219089Spjd &children) != 0) 5169219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5170219089Spjd 5171219089Spjd /* first, check to ensure we've got the right child count */ 5172219089Spjd rvd = spa->spa_root_vdev; 5173219089Spjd lastlog = 0; 5174219089Spjd for (c = 0; c < rvd->vdev_children; c++) { 5175219089Spjd vdev_t *vd = rvd->vdev_child[c]; 5176219089Spjd 5177219089Spjd /* don't count the holes & logs as children */ 5178219089Spjd if (vd->vdev_islog || vd->vdev_ishole) { 5179219089Spjd if (lastlog == 0) 5180219089Spjd lastlog = c; 5181219089Spjd continue; 5182219089Spjd } 5183219089Spjd 5184219089Spjd lastlog = 0; 5185219089Spjd } 5186219089Spjd if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 5187219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5188219089Spjd 5189219089Spjd /* next, ensure no spare or cache devices are part of the split */ 5190219089Spjd if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5191219089Spjd nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5192219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5193219089Spjd 5194219089Spjd vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5195219089Spjd glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5196219089Spjd 5197219089Spjd /* then, loop over each vdev and validate it */ 5198219089Spjd for (c = 0; c < children; c++) { 5199219089Spjd uint64_t is_hole = 0; 5200219089Spjd 5201219089Spjd (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5202219089Spjd &is_hole); 5203219089Spjd 5204219089Spjd if (is_hole != 0) { 5205219089Spjd if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5206219089Spjd spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5207219089Spjd continue; 5208219089Spjd } else { 5209249195Smm error = SET_ERROR(EINVAL); 5210219089Spjd break; 5211219089Spjd } 5212219089Spjd } 5213219089Spjd 5214219089Spjd /* which disk is going to be split? */ 5215219089Spjd if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5216219089Spjd &glist[c]) != 0) { 5217249195Smm error = SET_ERROR(EINVAL); 5218219089Spjd break; 5219219089Spjd } 5220219089Spjd 5221219089Spjd /* look it up in the spa */ 5222219089Spjd vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5223219089Spjd if (vml[c] == NULL) { 5224249195Smm error = SET_ERROR(ENODEV); 5225219089Spjd break; 5226219089Spjd } 5227219089Spjd 5228219089Spjd /* make sure there's nothing stopping the split */ 5229219089Spjd if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5230219089Spjd vml[c]->vdev_islog || 5231219089Spjd vml[c]->vdev_ishole || 5232219089Spjd vml[c]->vdev_isspare || 5233219089Spjd vml[c]->vdev_isl2cache || 5234219089Spjd !vdev_writeable(vml[c]) || 5235219089Spjd vml[c]->vdev_children != 0 || 5236219089Spjd vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5237219089Spjd c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5238249195Smm error = SET_ERROR(EINVAL); 5239219089Spjd break; 5240219089Spjd } 5241219089Spjd 5242219089Spjd if (vdev_dtl_required(vml[c])) { 5243249195Smm error = SET_ERROR(EBUSY); 5244219089Spjd break; 5245219089Spjd } 5246219089Spjd 5247219089Spjd /* we need certain info from the top level */ 5248219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5249219089Spjd vml[c]->vdev_top->vdev_ms_array) == 0); 5250219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5251219089Spjd vml[c]->vdev_top->vdev_ms_shift) == 0); 5252219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5253219089Spjd vml[c]->vdev_top->vdev_asize) == 0); 5254219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5255219089Spjd vml[c]->vdev_top->vdev_ashift) == 0); 5256219089Spjd } 5257219089Spjd 5258219089Spjd if (error != 0) { 5259219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5260219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5261219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5262219089Spjd } 5263219089Spjd 5264219089Spjd /* stop writers from using the disks */ 5265219089Spjd for (c = 0; c < children; c++) { 5266219089Spjd if (vml[c] != NULL) 5267219089Spjd vml[c]->vdev_offline = B_TRUE; 5268219089Spjd } 5269219089Spjd vdev_reopen(spa->spa_root_vdev); 
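
	/*
	 * (The reopen above is what actually takes the offlined children
	 * out of service, so no new writes land on them while the new
	 * pool is assembled below.)
	 */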
5270219089Spjd 5271219089Spjd /* 5272219089Spjd * Temporarily record the splitting vdevs in the spa config. This 5273219089Spjd * will disappear once the config is regenerated. 5274219089Spjd */ 5275219089Spjd VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5276219089Spjd VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5277219089Spjd glist, children) == 0); 5278219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5279219089Spjd 5280219089Spjd mutex_enter(&spa->spa_props_lock); 5281219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5282219089Spjd nvl) == 0); 5283219089Spjd mutex_exit(&spa->spa_props_lock); 5284219089Spjd spa->spa_config_splitting = nvl; 5285219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5286219089Spjd 5287219089Spjd /* configure and create the new pool */ 5288219089Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5289219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5290219089Spjd exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5291219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5292219089Spjd spa_version(spa)) == 0); 5293219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5294219089Spjd spa->spa_config_txg) == 0); 5295219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5296219089Spjd spa_generate_guid(NULL)) == 0); 5297219089Spjd (void) nvlist_lookup_string(props, 5298219089Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5299219089Spjd 5300219089Spjd /* add the new pool to the namespace */ 5301219089Spjd newspa = spa_add(newname, config, altroot); 5302219089Spjd newspa->spa_config_txg = spa->spa_config_txg; 5303219089Spjd spa_set_log_state(newspa, SPA_LOG_CLEAR); 5304219089Spjd 5305219089Spjd /* release the spa config lock, retaining the namespace lock */ 5306219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5307219089Spjd 5308219089Spjd if (zio_injection_enabled) 5309219089Spjd zio_handle_panic_injection(spa, FTAG, 1); 5310219089Spjd 5311219089Spjd spa_activate(newspa, spa_mode_global); 5312219089Spjd spa_async_suspend(newspa); 5313219089Spjd 5314219089Spjd#ifndef sun 5315219089Spjd /* mark that we are creating new spa by splitting */ 5316219089Spjd newspa->spa_splitting_newspa = B_TRUE; 5317219089Spjd#endif 5318219089Spjd /* create the new pool from the disks of the original pool */ 5319219089Spjd error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5320219089Spjd#ifndef sun 5321219089Spjd newspa->spa_splitting_newspa = B_FALSE; 5322219089Spjd#endif 5323219089Spjd if (error) 5324219089Spjd goto out; 5325219089Spjd 5326219089Spjd /* if that worked, generate a real config for the new pool */ 5327219089Spjd if (newspa->spa_root_vdev != NULL) { 5328219089Spjd VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5329219089Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 5330219089Spjd VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5331219089Spjd ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5332219089Spjd spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5333219089Spjd B_TRUE)); 5334219089Spjd } 5335219089Spjd 5336219089Spjd /* set the props */ 5337219089Spjd if (props != NULL) { 5338219089Spjd spa_configfile_set(newspa, props, B_FALSE); 5339219089Spjd error = spa_prop_set(newspa, props); 5340219089Spjd if (error) 5341219089Spjd goto out; 5342219089Spjd } 5343219089Spjd 5344219089Spjd /* flush everything */ 5345219089Spjd txg = spa_vdev_config_enter(newspa); 5346219089Spjd 
vdev_config_dirty(newspa->spa_root_vdev); 5347219089Spjd (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5348219089Spjd 5349219089Spjd if (zio_injection_enabled) 5350219089Spjd zio_handle_panic_injection(spa, FTAG, 2); 5351219089Spjd 5352219089Spjd spa_async_resume(newspa); 5353219089Spjd 5354219089Spjd /* finally, update the original pool's config */ 5355219089Spjd txg = spa_vdev_config_enter(spa); 5356219089Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5357219089Spjd error = dmu_tx_assign(tx, TXG_WAIT); 5358219089Spjd if (error != 0) 5359219089Spjd dmu_tx_abort(tx); 5360219089Spjd for (c = 0; c < children; c++) { 5361219089Spjd if (vml[c] != NULL) { 5362219089Spjd vdev_split(vml[c]); 5363219089Spjd if (error == 0) 5364248571Smm spa_history_log_internal(spa, "detach", tx, 5365248571Smm "vdev=%s", vml[c]->vdev_path); 5366219089Spjd vdev_free(vml[c]); 5367219089Spjd } 5368219089Spjd } 5369219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5370219089Spjd spa->spa_config_splitting = NULL; 5371219089Spjd nvlist_free(nvl); 5372219089Spjd if (error == 0) 5373219089Spjd dmu_tx_commit(tx); 5374219089Spjd (void) spa_vdev_exit(spa, NULL, txg, 0); 5375219089Spjd 5376219089Spjd if (zio_injection_enabled) 5377219089Spjd zio_handle_panic_injection(spa, FTAG, 3); 5378219089Spjd 5379219089Spjd /* split is complete; log a history record */ 5380248571Smm spa_history_log_internal(newspa, "split", NULL, 5381248571Smm "from pool %s", spa_name(spa)); 5382219089Spjd 5383219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5384219089Spjd 5385219089Spjd /* if we're not going to mount the filesystems in userland, export */ 5386219089Spjd if (exp) 5387219089Spjd error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5388219089Spjd B_FALSE, B_FALSE); 5389219089Spjd 5390219089Spjd return (error); 5391219089Spjd 5392219089Spjdout: 5393219089Spjd spa_unload(newspa); 5394219089Spjd spa_deactivate(newspa); 5395219089Spjd spa_remove(newspa); 5396219089Spjd 5397219089Spjd txg = spa_vdev_config_enter(spa); 5398219089Spjd 5399219089Spjd /* re-online all offlined disks */ 5400219089Spjd for (c = 0; c < children; c++) { 5401219089Spjd if (vml[c] != NULL) 5402219089Spjd vml[c]->vdev_offline = B_FALSE; 5403219089Spjd } 5404219089Spjd vdev_reopen(spa->spa_root_vdev); 5405219089Spjd 5406219089Spjd nvlist_free(spa->spa_config_splitting); 5407219089Spjd spa->spa_config_splitting = NULL; 5408219089Spjd (void) spa_vdev_exit(spa, NULL, txg, error); 5409219089Spjd 5410219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5411219089Spjd return (error); 5412219089Spjd} 5413219089Spjd 5414185029Spjdstatic nvlist_t * 5415185029Spjdspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5416185029Spjd{ 5417185029Spjd for (int i = 0; i < count; i++) { 5418185029Spjd uint64_t guid; 5419185029Spjd 5420185029Spjd VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5421185029Spjd &guid) == 0); 5422185029Spjd 5423185029Spjd if (guid == target_guid) 5424185029Spjd return (nvpp[i]); 5425185029Spjd } 5426185029Spjd 5427185029Spjd return (NULL); 5428185029Spjd} 5429185029Spjd 5430185029Spjdstatic void 5431185029Spjdspa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5432185029Spjd nvlist_t *dev_to_remove) 5433185029Spjd{ 5434185029Spjd nvlist_t **newdev = NULL; 5435185029Spjd 5436185029Spjd if (count > 1) 5437185029Spjd newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5438185029Spjd 5439185029Spjd for (int i = 0, j = 0; i < count; i++) { 5440185029Spjd if 
(dev[i] == dev_to_remove) 5441185029Spjd continue; 5442185029Spjd VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5443185029Spjd } 5444185029Spjd 5445185029Spjd VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5446185029Spjd VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5447185029Spjd 5448185029Spjd for (int i = 0; i < count - 1; i++) 5449185029Spjd nvlist_free(newdev[i]); 5450185029Spjd 5451185029Spjd if (count > 1) 5452185029Spjd kmem_free(newdev, (count - 1) * sizeof (void *)); 5453185029Spjd} 5454185029Spjd 5455168404Spjd/* 5456219089Spjd * Evacuate the device. 5457219089Spjd */ 5458219089Spjdstatic int 5459219089Spjdspa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5460219089Spjd{ 5461219089Spjd uint64_t txg; 5462219089Spjd int error = 0; 5463219089Spjd 5464219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5465219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5466219089Spjd ASSERT(vd == vd->vdev_top); 5467219089Spjd 5468219089Spjd /* 5469219089Spjd * Evacuate the device. We don't hold the config lock as writer 5470219089Spjd * since we need to do I/O but we do keep the 5471219089Spjd * spa_namespace_lock held. Once this completes the device 5472219089Spjd * should no longer have any blocks allocated on it. 5473219089Spjd */ 5474219089Spjd if (vd->vdev_islog) { 5475219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5476219089Spjd error = spa_offline_log(spa); 5477219089Spjd } else { 5478249195Smm error = SET_ERROR(ENOTSUP); 5479219089Spjd } 5480219089Spjd 5481219089Spjd if (error) 5482219089Spjd return (error); 5483219089Spjd 5484219089Spjd /* 5485219089Spjd * The evacuation succeeded. Remove any remaining MOS metadata 5486219089Spjd * associated with this vdev, and wait for these changes to sync. 5487219089Spjd */ 5488240415Smm ASSERT0(vd->vdev_stat.vs_alloc); 5489219089Spjd txg = spa_vdev_config_enter(spa); 5490219089Spjd vd->vdev_removing = B_TRUE; 5491258717Savg vdev_dirty_leaves(vd, VDD_DTL, txg); 5492219089Spjd vdev_config_dirty(vd); 5493219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5494219089Spjd 5495219089Spjd return (0); 5496219089Spjd} 5497219089Spjd 5498219089Spjd/* 5499219089Spjd * Complete the removal by cleaning up the namespace. 5500219089Spjd */ 5501219089Spjdstatic void 5502219089Spjdspa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5503219089Spjd{ 5504219089Spjd vdev_t *rvd = spa->spa_root_vdev; 5505219089Spjd uint64_t id = vd->vdev_id; 5506219089Spjd boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5507219089Spjd 5508219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5509219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5510219089Spjd ASSERT(vd == vd->vdev_top); 5511219089Spjd 5512219089Spjd /* 5513219089Spjd * Only remove any devices which are empty. 
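	 * (A nonzero vs_alloc here means the evacuation did not fully
	 * complete; the vdev is left in place rather than orphaning its
	 * allocated blocks.)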
5514219089Spjd */ 5515219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5516219089Spjd return; 5517219089Spjd 5518219089Spjd (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5519219089Spjd 5520219089Spjd if (list_link_active(&vd->vdev_state_dirty_node)) 5521219089Spjd vdev_state_clean(vd); 5522219089Spjd if (list_link_active(&vd->vdev_config_dirty_node)) 5523219089Spjd vdev_config_clean(vd); 5524219089Spjd 5525219089Spjd vdev_free(vd); 5526219089Spjd 5527219089Spjd if (last_vdev) { 5528219089Spjd vdev_compact_children(rvd); 5529219089Spjd } else { 5530219089Spjd vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5531219089Spjd vdev_add_child(rvd, vd); 5532219089Spjd } 5533219089Spjd vdev_config_dirty(rvd); 5534219089Spjd 5535219089Spjd /* 5536219089Spjd * Reassess the health of our root vdev. 5537219089Spjd */ 5538219089Spjd vdev_reopen(rvd); 5539219089Spjd} 5540219089Spjd 5541219089Spjd/* 5542219089Spjd * Remove a device from the pool - 5543219089Spjd * 5544219089Spjd * Removing a device from the vdev namespace requires several steps 5545219089Spjd * and can take a significant amount of time. As a result we use 5546219089Spjd * the spa_vdev_config_[enter/exit] functions which allow us to 5547219089Spjd * grab and release the spa_config_lock while still holding the namespace 5548219089Spjd * lock. During each step the configuration is synced out. 5549251631Sdelphij * 5550251631Sdelphij * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5551251631Sdelphij * devices. 5552219089Spjd */ 5553168404Spjdint 5554168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5555168404Spjd{ 5556168404Spjd vdev_t *vd; 5557219089Spjd metaslab_group_t *mg; 5558185029Spjd nvlist_t **spares, **l2cache, *nv; 5559219089Spjd uint64_t txg = 0; 5560185029Spjd uint_t nspares, nl2cache; 5561185029Spjd int error = 0; 5562209962Smm boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5563168404Spjd 5564219089Spjd ASSERT(spa_writeable(spa)); 5565219089Spjd 5566209962Smm if (!locked) 5567209962Smm txg = spa_vdev_enter(spa); 5568168404Spjd 5569185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5570168404Spjd 5571185029Spjd if (spa->spa_spares.sav_vdevs != NULL && 5572185029Spjd nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5573185029Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5574185029Spjd (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5575185029Spjd /* 5576185029Spjd * Only remove the hot spare if it's not currently in use 5577185029Spjd * in this pool. 5578185029Spjd */ 5579185029Spjd if (vd == NULL || unspare) { 5580185029Spjd spa_vdev_remove_aux(spa->spa_spares.sav_config, 5581185029Spjd ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5582185029Spjd spa_load_spares(spa); 5583185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5584185029Spjd } else { 5585249195Smm error = SET_ERROR(EBUSY); 5586168404Spjd } 5587185029Spjd } else if (spa->spa_l2cache.sav_vdevs != NULL && 5588185029Spjd nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5589185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5590185029Spjd (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5591185029Spjd /* 5592185029Spjd * Cache devices can always be removed. 
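	 * (An l2cache device holds only cached copies of blocks that
	 * remain readable from the main pool vdevs, so dropping it loses
	 * no data.)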
5593185029Spjd */ 5594185029Spjd spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5595185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5596185029Spjd spa_load_l2cache(spa); 5597185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5598219089Spjd } else if (vd != NULL && vd->vdev_islog) { 5599219089Spjd ASSERT(!locked); 5600219089Spjd ASSERT(vd == vd->vdev_top); 5601219089Spjd 5602219089Spjd mg = vd->vdev_mg; 5603219089Spjd 5604219089Spjd /* 5605219089Spjd * Stop allocating from this vdev. 5606219089Spjd */ 5607219089Spjd metaslab_group_passivate(mg); 5608219089Spjd 5609219089Spjd /* 5610219089Spjd * Wait for the youngest allocations and frees to sync, 5611219089Spjd * and then wait for the deferral of those frees to finish. 5612219089Spjd */ 5613219089Spjd spa_vdev_config_exit(spa, NULL, 5614219089Spjd txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5615219089Spjd 5616219089Spjd /* 5617219089Spjd * Attempt to evacuate the vdev. 5618219089Spjd */ 5619219089Spjd error = spa_vdev_remove_evacuate(spa, vd); 5620219089Spjd 5621219089Spjd txg = spa_vdev_config_enter(spa); 5622219089Spjd 5623219089Spjd /* 5624219089Spjd * If we couldn't evacuate the vdev, unwind. 5625219089Spjd */ 5626219089Spjd if (error) { 5627219089Spjd metaslab_group_activate(mg); 5628219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5629219089Spjd } 5630219089Spjd 5631219089Spjd /* 5632219089Spjd * Clean up the vdev namespace. 5633219089Spjd */ 5634219089Spjd spa_vdev_remove_from_namespace(spa, vd); 5635219089Spjd 5636185029Spjd } else if (vd != NULL) { 5637185029Spjd /* 5638185029Spjd * Normal vdevs cannot be removed (yet). 5639185029Spjd */ 5640249195Smm error = SET_ERROR(ENOTSUP); 5641168404Spjd } else { 5642185029Spjd /* 5643185029Spjd * There is no vdev of any kind with the specified guid. 5644185029Spjd */ 5645249195Smm error = SET_ERROR(ENOENT); 5646168404Spjd } 5647168404Spjd 5648209962Smm if (!locked) 5649209962Smm return (spa_vdev_exit(spa, NULL, txg, error)); 5650209962Smm 5651209962Smm return (error); 5652168404Spjd} 5653168404Spjd 5654168404Spjd/* 5655185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 5656251631Sdelphij * currently spared, so we can detach it. 5657168404Spjd */ 5658168404Spjdstatic vdev_t * 5659185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 5660168404Spjd{ 5661168404Spjd vdev_t *newvd, *oldvd; 5662168404Spjd 5663219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5664185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5665168404Spjd if (oldvd != NULL) 5666168404Spjd return (oldvd); 5667168404Spjd } 5668168404Spjd 5669185029Spjd /* 5670219089Spjd * Check for a completed replacement. We always consider the first 5671219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 5672219089Spjd * the newest (see spa_vdev_attach() for how that works). In 5673219089Spjd * the case where the newest vdev is faulted, we will not automatically 5674219089Spjd * remove it after a resilver completes. This is OK as it will require 5675219089Spjd * user intervention to determine which disk the admin wishes to keep. 
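 *
 * E.g. for replacing(A, B): once B's missing and outage DTLs are empty
 * and A is no longer required, A (vdev_child[0]) is returned so the
 * caller can detach it.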
5676185029Spjd */ 5677219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 5678219089Spjd ASSERT(vd->vdev_children > 1); 5679219089Spjd 5680219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 5681168404Spjd oldvd = vd->vdev_child[0]; 5682168404Spjd 5683209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 5684219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5685209962Smm !vdev_dtl_required(oldvd)) 5686168404Spjd return (oldvd); 5687168404Spjd } 5688168404Spjd 5689185029Spjd /* 5690185029Spjd * Check for a completed resilver with the 'unspare' flag set. 5691185029Spjd */ 5692219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 5693219089Spjd vdev_t *first = vd->vdev_child[0]; 5694219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5695185029Spjd 5696219089Spjd if (last->vdev_unspare) { 5697219089Spjd oldvd = first; 5698219089Spjd newvd = last; 5699219089Spjd } else if (first->vdev_unspare) { 5700219089Spjd oldvd = last; 5701219089Spjd newvd = first; 5702219089Spjd } else { 5703219089Spjd oldvd = NULL; 5704219089Spjd } 5705219089Spjd 5706219089Spjd if (oldvd != NULL && 5707209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 5708219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5709219089Spjd !vdev_dtl_required(oldvd)) 5710185029Spjd return (oldvd); 5711219089Spjd 5712219089Spjd /* 5713219089Spjd * If there are more than two spares attached to a disk, 5714219089Spjd * and those spares are not required, then we want to 5715219089Spjd * attempt to free them up now so that they can be used 5716219089Spjd * by other pools. Once we're back down to a single 5717219089Spjd * disk+spare, we stop removing them. 5718219089Spjd */ 5719219089Spjd if (vd->vdev_children > 2) { 5720219089Spjd newvd = vd->vdev_child[1]; 5721219089Spjd 5722219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 5723219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 5724219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 5725219089Spjd !vdev_dtl_required(newvd)) 5726219089Spjd return (newvd); 5727185029Spjd } 5728185029Spjd } 5729185029Spjd 5730168404Spjd return (NULL); 5731168404Spjd} 5732168404Spjd 5733168404Spjdstatic void 5734185029Spjdspa_vdev_resilver_done(spa_t *spa) 5735168404Spjd{ 5736209962Smm vdev_t *vd, *pvd, *ppvd; 5737209962Smm uint64_t guid, sguid, pguid, ppguid; 5738168404Spjd 5739209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5740168404Spjd 5741185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5742209962Smm pvd = vd->vdev_parent; 5743209962Smm ppvd = pvd->vdev_parent; 5744168404Spjd guid = vd->vdev_guid; 5745209962Smm pguid = pvd->vdev_guid; 5746209962Smm ppguid = ppvd->vdev_guid; 5747209962Smm sguid = 0; 5748168404Spjd /* 5749168404Spjd * If we have just finished replacing a hot spared device, then 5750168404Spjd * we need to detach the parent's first child (the original hot 5751168404Spjd * spare) as well. 
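		 * E.g. given spare(replacing(A, B), S), finishing the
		 * replace detaches A and then the original spare S as
		 * well, leaving just B.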
5752168404Spjd */ 5753219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5754219089Spjd ppvd->vdev_children == 2) { 5755168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5756209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 5757168404Spjd } 5758254112Sdelphij ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5759254112Sdelphij 5760209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5761209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5762168404Spjd return; 5763209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5764168404Spjd return; 5765209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5766168404Spjd } 5767168404Spjd 5768209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5769168404Spjd} 5770168404Spjd 5771168404Spjd/* 5772219089Spjd * Update the stored path or FRU for this vdev. 5773168404Spjd */ 5774168404Spjdint 5775209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5776209962Smm boolean_t ispath) 5777168404Spjd{ 5778185029Spjd vdev_t *vd; 5779219089Spjd boolean_t sync = B_FALSE; 5780168404Spjd 5781219089Spjd ASSERT(spa_writeable(spa)); 5782168404Spjd 5783219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 5784219089Spjd 5785209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5786219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5787168404Spjd 5788168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 5789219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5790168404Spjd 5791209962Smm if (ispath) { 5792219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 5793219089Spjd spa_strfree(vd->vdev_path); 5794219089Spjd vd->vdev_path = spa_strdup(value); 5795219089Spjd sync = B_TRUE; 5796219089Spjd } 5797209962Smm } else { 5798219089Spjd if (vd->vdev_fru == NULL) { 5799219089Spjd vd->vdev_fru = spa_strdup(value); 5800219089Spjd sync = B_TRUE; 5801219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 5802209962Smm spa_strfree(vd->vdev_fru); 5803219089Spjd vd->vdev_fru = spa_strdup(value); 5804219089Spjd sync = B_TRUE; 5805219089Spjd } 5806209962Smm } 5807168404Spjd 5808219089Spjd return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 5809168404Spjd} 5810168404Spjd 5811209962Smmint 5812209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5813209962Smm{ 5814209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5815209962Smm} 5816209962Smm 5817209962Smmint 5818209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5819209962Smm{ 5820209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5821209962Smm} 5822209962Smm 5823168404Spjd/* 5824168404Spjd * ========================================================================== 5825219089Spjd * SPA Scanning 5826168404Spjd * ========================================================================== 5827168404Spjd */ 5828168404Spjd 5829168404Spjdint 5830219089Spjdspa_scan_stop(spa_t *spa) 5831168404Spjd{ 5832185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5833219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5834249195Smm return (SET_ERROR(EBUSY)); 5835219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 5836219089Spjd} 5837168404Spjd 5838219089Spjdint 5839219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 5840219089Spjd{ 5841219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5842219089Spjd 5843219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5844249195Smm return (SET_ERROR(ENOTSUP)); 5845168404Spjd 5846168404Spjd /* 5847185029Spjd * If a resilver was requested, but there is no DTL on a 5848185029Spjd * writeable leaf device, we have nothing to do. 5849168404Spjd */ 5850219089Spjd if (func == POOL_SCAN_RESILVER && 5851185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5852185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5853168404Spjd return (0); 5854168404Spjd } 5855168404Spjd 5856219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 5857168404Spjd} 5858168404Spjd 5859168404Spjd/* 5860168404Spjd * ========================================================================== 5861168404Spjd * SPA async task processing 5862168404Spjd * ========================================================================== 5863168404Spjd */ 5864168404Spjd 5865168404Spjdstatic void 5866185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 5867168404Spjd{ 5868185029Spjd if (vd->vdev_remove_wanted) { 5869219089Spjd vd->vdev_remove_wanted = B_FALSE; 5870219089Spjd vd->vdev_delayed_close = B_FALSE; 5871185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5872209962Smm 5873209962Smm /* 5874209962Smm * We want to clear the stats, but we don't want to do a full 5875209962Smm * vdev_clear() as that will cause us to throw away 5876209962Smm * degraded/faulted state as well as attempt to reopen the 5877209962Smm * device, all of which is a waste. 
5878209962Smm */ 5879209962Smm vd->vdev_stat.vs_read_errors = 0; 5880209962Smm vd->vdev_stat.vs_write_errors = 0; 5881209962Smm vd->vdev_stat.vs_checksum_errors = 0; 5882209962Smm 5883185029Spjd vdev_state_dirty(vd->vdev_top); 5884185029Spjd } 5885168404Spjd 5886185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5887185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 5888185029Spjd} 5889168404Spjd 5890185029Spjdstatic void 5891185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 5892185029Spjd{ 5893185029Spjd if (vd->vdev_probe_wanted) { 5894219089Spjd vd->vdev_probe_wanted = B_FALSE; 5895185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 5896168404Spjd } 5897168404Spjd 5898185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5899185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 5900168404Spjd} 5901168404Spjd 5902168404Spjdstatic void 5903219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 5904219089Spjd{ 5905219089Spjd sysevent_id_t eid; 5906219089Spjd nvlist_t *attr; 5907219089Spjd char *physpath; 5908219089Spjd 5909219089Spjd if (!spa->spa_autoexpand) 5910219089Spjd return; 5911219089Spjd 5912219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5913219089Spjd vdev_t *cvd = vd->vdev_child[c]; 5914219089Spjd spa_async_autoexpand(spa, cvd); 5915219089Spjd } 5916219089Spjd 5917219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5918219089Spjd return; 5919219089Spjd 5920219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5921219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5922219089Spjd 5923219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5924219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5925219089Spjd 5926219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5927219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5928219089Spjd 5929219089Spjd nvlist_free(attr); 5930219089Spjd kmem_free(physpath, MAXPATHLEN); 5931219089Spjd} 5932219089Spjd 5933219089Spjdstatic void 5934168404Spjdspa_async_thread(void *arg) 5935168404Spjd{ 5936168404Spjd spa_t *spa = arg; 5937168404Spjd int tasks; 5938168404Spjd 5939168404Spjd ASSERT(spa->spa_sync_on); 5940168404Spjd 5941168404Spjd mutex_enter(&spa->spa_async_lock); 5942168404Spjd tasks = spa->spa_async_tasks; 5943253990Smav spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 5944168404Spjd mutex_exit(&spa->spa_async_lock); 5945168404Spjd 5946168404Spjd /* 5947168404Spjd * See if the config needs to be updated. 5948168404Spjd */ 5949168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5950219089Spjd uint64_t old_space, new_space; 5951219089Spjd 5952168404Spjd mutex_enter(&spa_namespace_lock); 5953219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 5954168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5955219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 5956168404Spjd mutex_exit(&spa_namespace_lock); 5957219089Spjd 5958219089Spjd /* 5959219089Spjd * If the pool grew as a result of the config update, 5960219089Spjd * then log an internal history event. 
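 * As a worked example (hypothetical name and sizes): growing a 100GiB
 * pool named "tank" to 150GiB would be logged roughly as
 * "pool 'tank' size: 161061273600(+53687091200)", i.e. the new size
 * in bytes followed by the delta in bytes, per the format string below.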
5961219089Spjd */ 5962219089Spjd if (new_space != old_space) { 5963248571Smm spa_history_log_internal(spa, "vdev online", NULL, 5964219089Spjd "pool '%s' size: %llu(+%llu)", 5965219089Spjd spa_name(spa), new_space, new_space - old_space); 5966219089Spjd } 5967168404Spjd } 5968168404Spjd 5969219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5970219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5971219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 5972219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 5973219089Spjd } 5974219089Spjd 5975168404Spjd /* 5976185029Spjd * See if any devices need to be probed. 5977168404Spjd */ 5978185029Spjd if (tasks & SPA_ASYNC_PROBE) { 5979219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 5980185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 5981185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 5982185029Spjd } 5983168404Spjd 5984168404Spjd /* 5985185029Spjd * If any devices are done replacing, detach them. 5986168404Spjd */ 5987185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 5988185029Spjd spa_vdev_resilver_done(spa); 5989168404Spjd 5990168404Spjd /* 5991168404Spjd * Kick off a resilver. 5992168404Spjd */ 5993168404Spjd if (tasks & SPA_ASYNC_RESILVER) 5994219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 5995168404Spjd 5996168404Spjd /* 5997168404Spjd * Let the world know that we're done. 5998168404Spjd */ 5999168404Spjd mutex_enter(&spa->spa_async_lock); 6000168404Spjd spa->spa_async_thread = NULL; 6001168404Spjd cv_broadcast(&spa->spa_async_cv); 6002168404Spjd mutex_exit(&spa->spa_async_lock); 6003168404Spjd thread_exit(); 6004168404Spjd} 6005168404Spjd 6006253990Smavstatic void 6007253990Smavspa_async_thread_vd(void *arg) 6008253990Smav{ 6009253990Smav spa_t *spa = arg; 6010253990Smav int tasks; 6011253990Smav 6012253990Smav ASSERT(spa->spa_sync_on); 6013253990Smav 6014253990Smav mutex_enter(&spa->spa_async_lock); 6015253990Smav tasks = spa->spa_async_tasks; 6016253990Smavretry: 6017253990Smav spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 6018253990Smav mutex_exit(&spa->spa_async_lock); 6019253990Smav 6020253990Smav /* 6021253990Smav * See if any devices need to be marked REMOVED. 6022253990Smav */ 6023253990Smav if (tasks & SPA_ASYNC_REMOVE) { 6024253990Smav spa_vdev_state_enter(spa, SCL_NONE); 6025253990Smav spa_async_remove(spa, spa->spa_root_vdev); 6026253990Smav for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 6027253990Smav spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 6028253990Smav for (int i = 0; i < spa->spa_spares.sav_count; i++) 6029253990Smav spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 6030253990Smav (void) spa_vdev_state_exit(spa, NULL, 0); 6031253990Smav } 6032253990Smav 6033253990Smav /* 6034253990Smav * Let the world know that we're done. 
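 * Before tearing down, re-check spa_async_tasks under the lock: if
 * another SPA_ASYNC_REMOVE request raced in while we were scanning
 * the vdev trees, jump back to the retry label instead of exiting.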
6035253990Smav */ 6036253990Smav mutex_enter(&spa->spa_async_lock); 6037253990Smav tasks = spa->spa_async_tasks; 6038253990Smav if ((tasks & SPA_ASYNC_REMOVE) != 0) 6039253990Smav goto retry; 6040253990Smav spa->spa_async_thread_vd = NULL; 6041253990Smav cv_broadcast(&spa->spa_async_cv); 6042253990Smav mutex_exit(&spa->spa_async_lock); 6043253990Smav thread_exit(); 6044253990Smav} 6045253990Smav 6046168404Spjdvoid 6047168404Spjdspa_async_suspend(spa_t *spa) 6048168404Spjd{ 6049168404Spjd mutex_enter(&spa->spa_async_lock); 6050168404Spjd spa->spa_async_suspended++; 6051253990Smav while (spa->spa_async_thread != NULL || 6052253990Smav spa->spa_async_thread_vd != NULL) 6053168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 6054168404Spjd mutex_exit(&spa->spa_async_lock); 6055168404Spjd} 6056168404Spjd 6057168404Spjdvoid 6058168404Spjdspa_async_resume(spa_t *spa) 6059168404Spjd{ 6060168404Spjd mutex_enter(&spa->spa_async_lock); 6061168404Spjd ASSERT(spa->spa_async_suspended != 0); 6062168404Spjd spa->spa_async_suspended--; 6063168404Spjd mutex_exit(&spa->spa_async_lock); 6064168404Spjd} 6065168404Spjd 6066251636Sdelphijstatic boolean_t 6067251636Sdelphijspa_async_tasks_pending(spa_t *spa) 6068251636Sdelphij{ 6069251636Sdelphij uint_t non_config_tasks; 6070251636Sdelphij uint_t config_task; 6071251636Sdelphij boolean_t config_task_suspended; 6072251636Sdelphij 6073253990Smav non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 6074253990Smav SPA_ASYNC_REMOVE); 6075251636Sdelphij config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 6076251636Sdelphij if (spa->spa_ccw_fail_time == 0) { 6077251636Sdelphij config_task_suspended = B_FALSE; 6078251636Sdelphij } else { 6079251636Sdelphij config_task_suspended = 6080251636Sdelphij (gethrtime() - spa->spa_ccw_fail_time) < 6081251636Sdelphij (zfs_ccw_retry_interval * NANOSEC); 6082251636Sdelphij } 6083251636Sdelphij 6084251636Sdelphij return (non_config_tasks || (config_task && !config_task_suspended)); 6085251636Sdelphij} 6086251636Sdelphij 6087168404Spjdstatic void 6088168404Spjdspa_async_dispatch(spa_t *spa) 6089168404Spjd{ 6090168404Spjd mutex_enter(&spa->spa_async_lock); 6091251636Sdelphij if (spa_async_tasks_pending(spa) && 6092251636Sdelphij !spa->spa_async_suspended && 6093168404Spjd spa->spa_async_thread == NULL && 6094251636Sdelphij rootdir != NULL) 6095168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 6096168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 6097168404Spjd mutex_exit(&spa->spa_async_lock); 6098168404Spjd} 6099168404Spjd 6100253990Smavstatic void 6101253990Smavspa_async_dispatch_vd(spa_t *spa) 6102253990Smav{ 6103253990Smav mutex_enter(&spa->spa_async_lock); 6104253990Smav if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 6105253990Smav !spa->spa_async_suspended && 6106253990Smav spa->spa_async_thread_vd == NULL && 6107253990Smav rootdir != NULL) 6108253990Smav spa->spa_async_thread_vd = thread_create(NULL, 0, 6109253990Smav spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); 6110253990Smav mutex_exit(&spa->spa_async_lock); 6111253990Smav} 6112253990Smav 6113168404Spjdvoid 6114168404Spjdspa_async_request(spa_t *spa, int task) 6115168404Spjd{ 6116219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 6117168404Spjd mutex_enter(&spa->spa_async_lock); 6118168404Spjd spa->spa_async_tasks |= task; 6119168404Spjd mutex_exit(&spa->spa_async_lock); 6120253990Smav spa_async_dispatch_vd(spa); 6121168404Spjd} 6122168404Spjd 6123168404Spjd/* 6124168404Spjd 
* ========================================================================== 6125168404Spjd * SPA syncing routines 6126168404Spjd * ========================================================================== 6127168404Spjd */ 6128168404Spjd 6129219089Spjdstatic int 6130219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6131168404Spjd{ 6132219089Spjd bpobj_t *bpo = arg; 6133219089Spjd bpobj_enqueue(bpo, bp, tx); 6134219089Spjd return (0); 6135219089Spjd} 6136168404Spjd 6137219089Spjdstatic int 6138219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6139219089Spjd{ 6140219089Spjd zio_t *zio = arg; 6141168404Spjd 6142219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 6143240868Spjd BP_GET_PSIZE(bp), zio->io_flags)); 6144219089Spjd return (0); 6145168404Spjd} 6146168404Spjd 6147258632Savg/* 6148258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6149258632Savg * amount of time spent syncing frees. 6150258632Savg */ 6151168404Spjdstatic void 6152258632Savgspa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6153258632Savg{ 6154258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6155258632Savg bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6156258632Savg VERIFY(zio_wait(zio) == 0); 6157258632Savg} 6158258632Savg 6159258632Savg/* 6160258632Savg * Note: this simple function is not inlined to make it easier to dtrace the 6161258632Savg * amount of time spent syncing deferred frees. 6162258632Savg */ 6163258632Savgstatic void 6164258632Savgspa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6165258632Savg{ 6166258632Savg zio_t *zio = zio_root(spa, NULL, NULL, 0); 6167258632Savg VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6168258632Savg spa_free_sync_cb, zio, tx), ==, 0); 6169258632Savg VERIFY0(zio_wait(zio)); 6170258632Savg} 6171258632Savg 6172258632Savg 6173258632Savgstatic void 6174168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6175168404Spjd{ 6176168404Spjd char *packed = NULL; 6177185029Spjd size_t bufsize; 6178168404Spjd size_t nvsize = 0; 6179168404Spjd dmu_buf_t *db; 6180168404Spjd 6181168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6182168404Spjd 6183185029Spjd /* 6184185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6185260150Sdelphij * information. This avoids the dmu_buf_will_dirty() path and 6186185029Spjd * saves us a pre-read to get data we don't actually care about. 
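 * As a worked example (assuming SPA_CONFIG_BLOCKSIZE is 16K): a packed
 * nvlist of 9000 bytes gives P2ROUNDUP(9000, 16384) == 16384, so one
 * full block is allocated, the 7384 trailing bytes are bzero()ed, and
 * a single aligned dmu_write() covers the whole object.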
6187185029Spjd */ 6188236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6189185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 6190168404Spjd 6191168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6192168404Spjd KM_SLEEP) == 0); 6193185029Spjd bzero(packed + nvsize, bufsize - nvsize); 6194168404Spjd 6195185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6196168404Spjd 6197185029Spjd kmem_free(packed, bufsize); 6198168404Spjd 6199168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6200168404Spjd dmu_buf_will_dirty(db, tx); 6201168404Spjd *(uint64_t *)db->db_data = nvsize; 6202168404Spjd dmu_buf_rele(db, FTAG); 6203168404Spjd} 6204168404Spjd 6205168404Spjdstatic void 6206185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6207185029Spjd const char *config, const char *entry) 6208168404Spjd{ 6209168404Spjd nvlist_t *nvroot; 6210185029Spjd nvlist_t **list; 6211168404Spjd int i; 6212168404Spjd 6213185029Spjd if (!sav->sav_sync) 6214168404Spjd return; 6215168404Spjd 6216168404Spjd /* 6217185029Spjd * Update the MOS nvlist describing the list of available devices. 6218185029Spjd * spa_validate_aux() will have already made sure this nvlist is 6219185029Spjd * valid and the vdevs are labeled appropriately. 6220168404Spjd */ 6221185029Spjd if (sav->sav_object == 0) { 6222185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6223185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6224185029Spjd sizeof (uint64_t), tx); 6225168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 6226185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6227185029Spjd &sav->sav_object, tx) == 0); 6228168404Spjd } 6229168404Spjd 6230168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6231185029Spjd if (sav->sav_count == 0) { 6232185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6233168404Spjd } else { 6234185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6235185029Spjd for (i = 0; i < sav->sav_count; i++) 6236185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6237219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 6238185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6239185029Spjd sav->sav_count) == 0); 6240185029Spjd for (i = 0; i < sav->sav_count; i++) 6241185029Spjd nvlist_free(list[i]); 6242185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 6243168404Spjd } 6244168404Spjd 6245185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6246168404Spjd nvlist_free(nvroot); 6247168404Spjd 6248185029Spjd sav->sav_sync = B_FALSE; 6249168404Spjd} 6250168404Spjd 6251168404Spjdstatic void 6252168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6253168404Spjd{ 6254168404Spjd nvlist_t *config; 6255168404Spjd 6256185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) 6257168404Spjd return; 6258168404Spjd 6259185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6260168404Spjd 6261185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 6262185029Spjd dmu_tx_get_txg(tx), B_FALSE); 6263185029Spjd 6264243505Smm /* 6265243505Smm * If we're upgrading the spa version then make sure that 6266243505Smm * the config object gets updated with the correct version. 
6267243505Smm */ 6268243505Smm if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6269243505Smm fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6270243505Smm spa->spa_uberblock.ub_version); 6271243505Smm 6272185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6273185029Spjd 6274168404Spjd if (spa->spa_config_syncing) 6275168404Spjd nvlist_free(spa->spa_config_syncing); 6276168404Spjd spa->spa_config_syncing = config; 6277168404Spjd 6278168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6279168404Spjd} 6280168404Spjd 6281236884Smmstatic void 6282248571Smmspa_sync_version(void *arg, dmu_tx_t *tx) 6283236884Smm{ 6284248571Smm uint64_t *versionp = arg; 6285248571Smm uint64_t version = *versionp; 6286248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6287236884Smm 6288236884Smm /* 6289236884Smm * Setting the version is special cased when first creating the pool. 6290236884Smm */ 6291236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 6292236884Smm 6293247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6294236884Smm ASSERT(version >= spa_version(spa)); 6295236884Smm 6296236884Smm spa->spa_uberblock.ub_version = version; 6297236884Smm vdev_config_dirty(spa->spa_root_vdev); 6298248571Smm spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6299236884Smm} 6300236884Smm 6301185029Spjd/* 6302185029Spjd * Set zpool properties. 6303185029Spjd */ 6304168404Spjdstatic void 6305248571Smmspa_sync_props(void *arg, dmu_tx_t *tx) 6306168404Spjd{ 6307248571Smm nvlist_t *nvp = arg; 6308248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6309185029Spjd objset_t *mos = spa->spa_meta_objset; 6310236884Smm nvpair_t *elem = NULL; 6311168404Spjd 6312168404Spjd mutex_enter(&spa->spa_props_lock); 6313168404Spjd 6314185029Spjd while ((elem = nvlist_next_nvpair(nvp, elem))) { 6315236884Smm uint64_t intval; 6316236884Smm char *strval, *fname; 6317236884Smm zpool_prop_t prop; 6318236884Smm const char *propname; 6319236884Smm zprop_type_t proptype; 6320259813Sdelphij spa_feature_t fid; 6321236884Smm 6322185029Spjd switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6323236884Smm case ZPROP_INVAL: 6324236884Smm /* 6325236884Smm * We checked this earlier in spa_prop_validate(). 6326236884Smm */ 6327236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 6328236884Smm 6329236884Smm fname = strchr(nvpair_name(elem), '@') + 1; 6330259813Sdelphij VERIFY0(zfeature_lookup_name(fname, &fid)); 6331236884Smm 6332259813Sdelphij spa_feature_enable(spa, fid, tx); 6333248571Smm spa_history_log_internal(spa, "set", tx, 6334248571Smm "%s=enabled", nvpair_name(elem)); 6335236884Smm break; 6336236884Smm 6337185029Spjd case ZPOOL_PROP_VERSION: 6338258717Savg intval = fnvpair_value_uint64(elem); 6339185029Spjd /* 6340236884Smm * The version is synced separately before other 6341236884Smm * properties and should be correct by now. 6342185029Spjd */ 6343236884Smm ASSERT3U(spa_version(spa), >=, intval); 6344185029Spjd break; 6345168404Spjd 6346185029Spjd case ZPOOL_PROP_ALTROOT: 6347185029Spjd /* 6348185029Spjd * 'altroot' is a non-persistent property. It should 6349185029Spjd * have been set temporarily at creation or import time. 6350185029Spjd */ 6351185029Spjd ASSERT(spa->spa_root != NULL); 6352185029Spjd break; 6353168404Spjd 6354219089Spjd case ZPOOL_PROP_READONLY: 6355185029Spjd case ZPOOL_PROP_CACHEFILE: 6356185029Spjd /* 6357219089Spjd * 'readonly' and 'cachefile' are also non-persistent 6358219089Spjd * properties. 
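 * Neither one is stored in the MOS, so there is nothing for this
 * sync task to write out; both cases simply fall through to break.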
6359185029Spjd */ 6360168404Spjd break; 6361228103Smm case ZPOOL_PROP_COMMENT: 6362258717Savg strval = fnvpair_value_string(elem); 6363228103Smm if (spa->spa_comment != NULL) 6364228103Smm spa_strfree(spa->spa_comment); 6365228103Smm spa->spa_comment = spa_strdup(strval); 6366228103Smm /* 6367228103Smm * We need to dirty the configuration on all the vdevs 6368228103Smm * so that their labels get updated. It's unnecessary 6369228103Smm * to do this for pool creation since the vdev's 6370228103Smm * configuration has already been dirtied. 6371228103Smm */ 6372228103Smm if (tx->tx_txg != TXG_INITIAL) 6373228103Smm vdev_config_dirty(spa->spa_root_vdev); 6374248571Smm spa_history_log_internal(spa, "set", tx, 6375248571Smm "%s=%s", nvpair_name(elem), strval); 6376228103Smm break; 6377185029Spjd default: 6378185029Spjd /* 6379185029Spjd * Set pool property values in the poolprops mos object. 6380185029Spjd */ 6381185029Spjd if (spa->spa_pool_props_object == 0) { 6382236884Smm spa->spa_pool_props_object = 6383236884Smm zap_create_link(mos, DMU_OT_POOL_PROPS, 6384185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6385236884Smm tx); 6386185029Spjd } 6387185029Spjd 6388185029Spjd /* normalize the property name */ 6389185029Spjd propname = zpool_prop_to_name(prop); 6390185029Spjd proptype = zpool_prop_get_type(prop); 6391185029Spjd 6392185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 6393185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 6394258717Savg strval = fnvpair_value_string(elem); 6395258717Savg VERIFY0(zap_update(mos, 6396185029Spjd spa->spa_pool_props_object, propname, 6397258717Savg 1, strlen(strval) + 1, strval, tx)); 6398248571Smm spa_history_log_internal(spa, "set", tx, 6399248571Smm "%s=%s", nvpair_name(elem), strval); 6400185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6401258717Savg intval = fnvpair_value_uint64(elem); 6402185029Spjd 6403185029Spjd if (proptype == PROP_TYPE_INDEX) { 6404185029Spjd const char *unused; 6405258717Savg VERIFY0(zpool_prop_index_to_string( 6406258717Savg prop, intval, &unused)); 6407185029Spjd } 6408258717Savg VERIFY0(zap_update(mos, 6409185029Spjd spa->spa_pool_props_object, propname, 6410258717Savg 8, 1, &intval, tx)); 6411248571Smm spa_history_log_internal(spa, "set", tx, 6412248571Smm "%s=%lld", nvpair_name(elem), intval); 6413185029Spjd } else { 6414185029Spjd ASSERT(0); /* not allowed */ 6415185029Spjd } 6416185029Spjd 6417185029Spjd switch (prop) { 6418185029Spjd case ZPOOL_PROP_DELEGATION: 6419185029Spjd spa->spa_delegation = intval; 6420185029Spjd break; 6421185029Spjd case ZPOOL_PROP_BOOTFS: 6422185029Spjd spa->spa_bootfs = intval; 6423185029Spjd break; 6424185029Spjd case ZPOOL_PROP_FAILUREMODE: 6425185029Spjd spa->spa_failmode = intval; 6426185029Spjd break; 6427219089Spjd case ZPOOL_PROP_AUTOEXPAND: 6428219089Spjd spa->spa_autoexpand = intval; 6429219089Spjd if (tx->tx_txg != TXG_INITIAL) 6430219089Spjd spa_async_request(spa, 6431219089Spjd SPA_ASYNC_AUTOEXPAND); 6432219089Spjd break; 6433219089Spjd case ZPOOL_PROP_DEDUPDITTO: 6434219089Spjd spa->spa_dedup_ditto = intval; 6435219089Spjd break; 6436185029Spjd default: 6437185029Spjd break; 6438185029Spjd } 6439168404Spjd } 6440185029Spjd 6441168404Spjd } 6442185029Spjd 6443185029Spjd mutex_exit(&spa->spa_props_lock); 6444168404Spjd} 6445168404Spjd 6446168404Spjd/* 6447219089Spjd * Perform one-time upgrade on-disk changes. 
spa_version() does not 6448219089Spjd * reflect the new version this txg, so there must be no changes this 6449219089Spjd * txg to anything that the upgrade code depends on after it executes. 6450219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync 6451219089Spjd * tasks. 6452219089Spjd */ 6453219089Spjdstatic void 6454219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6455219089Spjd{ 6456219089Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6457219089Spjd 6458219089Spjd ASSERT(spa->spa_sync_pass == 1); 6459219089Spjd 6460248571Smm rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 6461248571Smm 6462219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6463219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6464219089Spjd dsl_pool_create_origin(dp, tx); 6465219089Spjd 6466219089Spjd /* Keeping the origin open increases spa_minref */ 6467219089Spjd spa->spa_minref += 3; 6468219089Spjd } 6469219089Spjd 6470219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6471219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6472219089Spjd dsl_pool_upgrade_clones(dp, tx); 6473219089Spjd } 6474219089Spjd 6475219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6476219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6477219089Spjd dsl_pool_upgrade_dir_clones(dp, tx); 6478219089Spjd 6479219089Spjd /* Keeping the freedir open increases spa_minref */ 6480219089Spjd spa->spa_minref += 3; 6481219089Spjd } 6482236884Smm 6483236884Smm if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6484236884Smm spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6485236884Smm spa_feature_create_zap_objects(spa, tx); 6486236884Smm } 6487268126Sdelphij 6488268126Sdelphij /* 6489268126Sdelphij * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 6490268126Sdelphij * when the possibility to use lz4 compression for metadata was added. 6491268126Sdelphij * Old pools that have this feature enabled must be upgraded to have 6492268126Sdelphij * this feature active. 6493268126Sdelphij */ 6494268126Sdelphij if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6495268126Sdelphij boolean_t lz4_en = spa_feature_is_enabled(spa, 6496268126Sdelphij SPA_FEATURE_LZ4_COMPRESS); 6497268126Sdelphij boolean_t lz4_ac = spa_feature_is_active(spa, 6498268126Sdelphij SPA_FEATURE_LZ4_COMPRESS); 6499268126Sdelphij 6500268126Sdelphij if (lz4_en && !lz4_ac) 6501268126Sdelphij spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 6502268126Sdelphij } 6503248571Smm rrw_exit(&dp->dp_config_rwlock, FTAG); 6504219089Spjd} 6505219089Spjd 6506219089Spjd/* 6507168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 6508168404Spjd * part of the process, so we iterate until it converges. 6509168404Spjd */ 6510168404Spjdvoid 6511168404Spjdspa_sync(spa_t *spa, uint64_t txg) 6512168404Spjd{ 6513168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6514168404Spjd objset_t *mos = spa->spa_meta_objset; 6515219089Spjd bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6516168404Spjd vdev_t *rvd = spa->spa_root_vdev; 6517168404Spjd vdev_t *vd; 6518168404Spjd dmu_tx_t *tx; 6519185029Spjd int error; 6520168404Spjd 6521219089Spjd VERIFY(spa_writeable(spa)); 6522219089Spjd 6523168404Spjd /* 6524168404Spjd * Lock out configuration changes. 
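 * Taking SCL_CONFIG as reader should suffice here: operations that
 * alter the vdev configuration take it as writer, so they are held
 * off for the duration of this sync.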
6525168404Spjd */ 6526185029Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6527168404Spjd 6528168404Spjd spa->spa_syncing_txg = txg; 6529168404Spjd spa->spa_sync_pass = 0; 6530168404Spjd 6531185029Spjd /* 6532185029Spjd * If there are any pending vdev state changes, convert them 6533185029Spjd * into config changes that go out with this transaction group. 6534185029Spjd */ 6535185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6536209962Smm while (list_head(&spa->spa_state_dirty_list) != NULL) { 6537209962Smm /* 6538209962Smm * We need the write lock here because, for aux vdevs, 6539209962Smm * calling vdev_config_dirty() modifies sav_config. 6540209962Smm * This is ugly and will become unnecessary when we 6541209962Smm * eliminate the aux vdev wart by integrating all vdevs 6542209962Smm * into the root vdev tree. 6543209962Smm */ 6544209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6545209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6546209962Smm while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6547209962Smm vdev_state_clean(vd); 6548209962Smm vdev_config_dirty(vd); 6549209962Smm } 6550209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6551209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6552185029Spjd } 6553185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6554185029Spjd 6555168404Spjd tx = dmu_tx_create_assigned(dp, txg); 6556168404Spjd 6557247265Smm spa->spa_sync_starttime = gethrtime(); 6558247265Smm#ifdef illumos 6559247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6560247265Smm spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6561247265Smm#else /* FreeBSD */ 6562247265Smm#ifdef _KERNEL 6563247265Smm callout_reset(&spa->spa_deadman_cycid, 6564247265Smm hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa); 6565247265Smm#endif 6566247265Smm#endif 6567247265Smm 6568168404Spjd /* 6569185029Spjd * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6570168404Spjd * set spa_deflate if we have no raid-z vdevs. 6571168404Spjd */ 6572185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6573185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6574168404Spjd int i; 6575168404Spjd 6576168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 6577168404Spjd vd = rvd->vdev_child[i]; 6578168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6579168404Spjd break; 6580168404Spjd } 6581168404Spjd if (i == rvd->vdev_children) { 6582168404Spjd spa->spa_deflate = TRUE; 6583168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 6584168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6585168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6586168404Spjd } 6587168404Spjd } 6588168404Spjd 6589168404Spjd /* 6590219089Spjd * If anything has changed in this txg, or if someone is waiting 6591219089Spjd * for this txg to sync (eg, spa_vdev_remove()), push the 6592219089Spjd * deferred frees from the previous txg. If not, leave them 6593219089Spjd * alone so that we don't generate work on an otherwise idle 6594219089Spjd * system. 
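 * Note that the test below also treats an active scan (scrub or
 * resilver) or a waiting txg as "not idle", so deferred frees keep
 * draining while such work is in progress.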
6595168404Spjd */ 6596168404Spjd if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 6597168404Spjd !txg_list_empty(&dp->dp_dirty_dirs, txg) || 6598219089Spjd !txg_list_empty(&dp->dp_sync_tasks, txg) || 6599219089Spjd ((dsl_scan_active(dp->dp_scan) || 6600219089Spjd txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { 6601258632Savg spa_sync_deferred_frees(spa, tx); 6602219089Spjd } 6603168404Spjd 6604168404Spjd /* 6605168404Spjd * Iterate to convergence. 6606168404Spjd */ 6607168404Spjd do { 6608219089Spjd int pass = ++spa->spa_sync_pass; 6609168404Spjd 6610168404Spjd spa_sync_config_object(spa, tx); 6611185029Spjd spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6612185029Spjd ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6613185029Spjd spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6614185029Spjd ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6615168404Spjd spa_errlog_sync(spa, txg); 6616168404Spjd dsl_pool_sync(dp, txg); 6617168404Spjd 6618243503Smm if (pass < zfs_sync_pass_deferred_free) { 6619258632Savg spa_sync_frees(spa, free_bpl, tx); 6620219089Spjd } else { 6621219089Spjd bplist_iterate(free_bpl, bpobj_enqueue_cb, 6622258632Savg &spa->spa_deferred_bpobj, tx); 6623168404Spjd } 6624168404Spjd 6625219089Spjd ddt_sync(spa, txg); 6626219089Spjd dsl_scan_sync(dp, tx); 6627168404Spjd 6628219089Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6629219089Spjd vdev_sync(vd, txg); 6630168404Spjd 6631219089Spjd if (pass == 1) 6632219089Spjd spa_sync_upgrades(spa, tx); 6633168404Spjd 6634219089Spjd } while (dmu_objset_is_dirty(mos, txg)); 6635219089Spjd 6636168404Spjd /* 6637168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 6638168404Spjd * to commit the transaction group. 6639168404Spjd * 6640185029Spjd * If there are no dirty vdevs, we sync the uberblock to a few 6641185029Spjd * random top-level vdevs that are known to be visible in the 6642185029Spjd * config cache (see spa_vdev_add() for a complete description). 6643185029Spjd * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6644168404Spjd */ 6645185029Spjd for (;;) { 6646185029Spjd /* 6647185029Spjd * We hold SCL_STATE to prevent vdev open/close/etc. 6648185029Spjd * while we're attempting to write the vdev labels. 
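 * If the label writes fail even after the B_TRUE retry, the pool is
 * suspended via zio_suspend() at the bottom of this loop and we try
 * again once zio_resume_wait() sees the pool resume.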
6649185029Spjd */ 6650185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6651168404Spjd 6652185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) { 6653185029Spjd vdev_t *svd[SPA_DVAS_PER_BP]; 6654185029Spjd int svdcount = 0; 6655185029Spjd int children = rvd->vdev_children; 6656185029Spjd int c0 = spa_get_random(children); 6657185029Spjd 6658219089Spjd for (int c = 0; c < children; c++) { 6659185029Spjd vd = rvd->vdev_child[(c0 + c) % children]; 6660185029Spjd if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6661185029Spjd continue; 6662185029Spjd svd[svdcount++] = vd; 6663185029Spjd if (svdcount == SPA_DVAS_PER_BP) 6664185029Spjd break; 6665185029Spjd } 6666213198Smm error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 6667213198Smm if (error != 0) 6668213198Smm error = vdev_config_sync(svd, svdcount, txg, 6669213198Smm B_TRUE); 6670185029Spjd } else { 6671185029Spjd error = vdev_config_sync(rvd->vdev_child, 6672213198Smm rvd->vdev_children, txg, B_FALSE); 6673213198Smm if (error != 0) 6674213198Smm error = vdev_config_sync(rvd->vdev_child, 6675213198Smm rvd->vdev_children, txg, B_TRUE); 6676168404Spjd } 6677185029Spjd 6678239620Smm if (error == 0) 6679239620Smm spa->spa_last_synced_guid = rvd->vdev_guid; 6680239620Smm 6681185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6682185029Spjd 6683185029Spjd if (error == 0) 6684185029Spjd break; 6685185029Spjd zio_suspend(spa, NULL); 6686185029Spjd zio_resume_wait(spa); 6687168404Spjd } 6688168404Spjd dmu_tx_commit(tx); 6689168404Spjd 6690247265Smm#ifdef illumos 6691247265Smm VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6692247265Smm#else /* FreeBSD */ 6693247265Smm#ifdef _KERNEL 6694247265Smm callout_drain(&spa->spa_deadman_cycid); 6695247265Smm#endif 6696247265Smm#endif 6697247265Smm 6698168404Spjd /* 6699168404Spjd * Clear the dirty config list. 6700168404Spjd */ 6701185029Spjd while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6702168404Spjd vdev_config_clean(vd); 6703168404Spjd 6704168404Spjd /* 6705168404Spjd * Now that the new config has synced transactionally, 6706168404Spjd * let it become visible to the config cache. 6707168404Spjd */ 6708168404Spjd if (spa->spa_config_syncing != NULL) { 6709168404Spjd spa_config_set(spa, spa->spa_config_syncing); 6710168404Spjd spa->spa_config_txg = txg; 6711168404Spjd spa->spa_config_syncing = NULL; 6712168404Spjd } 6713168404Spjd 6714168404Spjd spa->spa_ubsync = spa->spa_uberblock; 6715168404Spjd 6716219089Spjd dsl_pool_sync_done(dp, txg); 6717168404Spjd 6718168404Spjd /* 6719168404Spjd * Update usable space statistics. 6720168404Spjd */ 6721168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6722168404Spjd vdev_sync_done(vd, txg); 6723168404Spjd 6724219089Spjd spa_update_dspace(spa); 6725219089Spjd 6726168404Spjd /* 6727168404Spjd * It had better be the case that we didn't dirty anything 6728168404Spjd * since vdev_config_sync(). 6729168404Spjd */ 6730168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6731168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6732168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6733168404Spjd 6734219089Spjd spa->spa_sync_pass = 0; 6735219089Spjd 6736185029Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 6737168404Spjd 6738219089Spjd spa_handle_ignored_writes(spa); 6739219089Spjd 6740168404Spjd /* 6741168404Spjd * If any async tasks have been requested, kick them off. 
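 * Two dispatchers run below: spa_async_dispatch() for config updates,
 * probes and resilvers, and spa_async_dispatch_vd() for the separate
 * SPA_ASYNC_REMOVE handling thread.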
6742168404Spjd */ 6743168404Spjd spa_async_dispatch(spa); 6744253990Smav spa_async_dispatch_vd(spa); 6745168404Spjd} 6746168404Spjd 6747168404Spjd/* 6748168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 6749168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 6750168404Spjd * sync. 6751168404Spjd */ 6752168404Spjdvoid 6753168404Spjdspa_sync_allpools(void) 6754168404Spjd{ 6755168404Spjd spa_t *spa = NULL; 6756168404Spjd mutex_enter(&spa_namespace_lock); 6757168404Spjd while ((spa = spa_next(spa)) != NULL) { 6758219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 6759219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 6760168404Spjd continue; 6761168404Spjd spa_open_ref(spa, FTAG); 6762168404Spjd mutex_exit(&spa_namespace_lock); 6763168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6764168404Spjd mutex_enter(&spa_namespace_lock); 6765168404Spjd spa_close(spa, FTAG); 6766168404Spjd } 6767168404Spjd mutex_exit(&spa_namespace_lock); 6768168404Spjd} 6769168404Spjd 6770168404Spjd/* 6771168404Spjd * ========================================================================== 6772168404Spjd * Miscellaneous routines 6773168404Spjd * ========================================================================== 6774168404Spjd */ 6775168404Spjd 6776168404Spjd/* 6777168404Spjd * Remove all pools in the system. 6778168404Spjd */ 6779168404Spjdvoid 6780168404Spjdspa_evict_all(void) 6781168404Spjd{ 6782168404Spjd spa_t *spa; 6783168404Spjd 6784168404Spjd /* 6785168404Spjd * Remove all cached state. All pools should be closed now, 6786168404Spjd * so every spa in the AVL tree should be unreferenced. 6787168404Spjd */ 6788168404Spjd mutex_enter(&spa_namespace_lock); 6789168404Spjd while ((spa = spa_next(NULL)) != NULL) { 6790168404Spjd /* 6791168404Spjd * Stop async tasks. The async thread may need to detach 6792168404Spjd * a device that's been replaced, which requires grabbing 6793168404Spjd * spa_namespace_lock, so we must drop it here. 
6794168404Spjd */ 6795168404Spjd spa_open_ref(spa, FTAG); 6796168404Spjd mutex_exit(&spa_namespace_lock); 6797168404Spjd spa_async_suspend(spa); 6798168404Spjd mutex_enter(&spa_namespace_lock); 6799168404Spjd spa_close(spa, FTAG); 6800168404Spjd 6801168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6802168404Spjd spa_unload(spa); 6803168404Spjd spa_deactivate(spa); 6804168404Spjd } 6805168404Spjd spa_remove(spa); 6806168404Spjd } 6807168404Spjd mutex_exit(&spa_namespace_lock); 6808168404Spjd} 6809168404Spjd 6810168404Spjdvdev_t * 6811209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6812168404Spjd{ 6813185029Spjd vdev_t *vd; 6814185029Spjd int i; 6815185029Spjd 6816185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6817185029Spjd return (vd); 6818185029Spjd 6819209962Smm if (aux) { 6820185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6821185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 6822185029Spjd if (vd->vdev_guid == guid) 6823185029Spjd return (vd); 6824185029Spjd } 6825209962Smm 6826209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 6827209962Smm vd = spa->spa_spares.sav_vdevs[i]; 6828209962Smm if (vd->vdev_guid == guid) 6829209962Smm return (vd); 6830209962Smm } 6831185029Spjd } 6832185029Spjd 6833185029Spjd return (NULL); 6834168404Spjd} 6835168404Spjd 6836168404Spjdvoid 6837185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 6838168404Spjd{ 6839219089Spjd ASSERT(spa_writeable(spa)); 6840219089Spjd 6841185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6842168404Spjd 6843168404Spjd /* 6844168404Spjd * This should only be called for a non-faulted pool, and since a 6845168404Spjd * future version would result in an unopenable pool, this shouldn't be 6846168404Spjd * possible. 6847168404Spjd */ 6848247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 6849268075Sdelphij ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 6850168404Spjd 6851185029Spjd spa->spa_uberblock.ub_version = version; 6852168404Spjd vdev_config_dirty(spa->spa_root_vdev); 6853168404Spjd 6854185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 6855168404Spjd 6856168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6857168404Spjd} 6858168404Spjd 6859168404Spjdboolean_t 6860168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 6861168404Spjd{ 6862168404Spjd int i; 6863168404Spjd uint64_t spareguid; 6864185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6865168404Spjd 6866185029Spjd for (i = 0; i < sav->sav_count; i++) 6867185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 6868168404Spjd return (B_TRUE); 6869168404Spjd 6870185029Spjd for (i = 0; i < sav->sav_npending; i++) { 6871185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6872185029Spjd &spareguid) == 0 && spareguid == guid) 6873168404Spjd return (B_TRUE); 6874168404Spjd } 6875168404Spjd 6876168404Spjd return (B_FALSE); 6877168404Spjd} 6878168404Spjd 6879185029Spjd/* 6880185029Spjd * Check if a pool has an active shared spare device. 
6881185029Spjd * Note: reference count of an active spare is 2, as a spare and as a replace 6882185029Spjd */ 6883185029Spjdstatic boolean_t 6884185029Spjdspa_has_active_shared_spare(spa_t *spa) 6885168404Spjd{ 6886185029Spjd int i, refcnt; 6887185029Spjd uint64_t pool; 6888185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6889185029Spjd 6890185029Spjd for (i = 0; i < sav->sav_count; i++) { 6891185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6892185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6893185029Spjd refcnt > 2) 6894185029Spjd return (B_TRUE); 6895185029Spjd } 6896185029Spjd 6897185029Spjd return (B_FALSE); 6898168404Spjd} 6899168404Spjd 6900185029Spjd/* 6901185029Spjd * Post a sysevent corresponding to the given event. The 'name' must be one of 6902185029Spjd * the event definitions in sys/sysevent/eventdefs.h. The payload will be 6903185029Spjd * filled in from the spa and (optionally) the vdev. This doesn't do anything 6904185029Spjd * in the userland libzpool, as we don't want consumers to misinterpret ztest 6905185029Spjd * or zdb as real changes. 6906185029Spjd */ 6907185029Spjdvoid 6908185029Spjdspa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 6909168404Spjd{ 6910185029Spjd#ifdef _KERNEL 6911185029Spjd sysevent_t *ev; 6912185029Spjd sysevent_attr_list_t *attr = NULL; 6913185029Spjd sysevent_value_t value; 6914185029Spjd sysevent_id_t eid; 6915168404Spjd 6916185029Spjd ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 6917185029Spjd SE_SLEEP); 6918168404Spjd 6919185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6920185029Spjd value.value.sv_string = spa_name(spa); 6921185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 6922185029Spjd goto done; 6923168404Spjd 6924185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6925185029Spjd value.value.sv_uint64 = spa_guid(spa); 6926185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 6927185029Spjd goto done; 6928168404Spjd 6929185029Spjd if (vd) { 6930185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6931185029Spjd value.value.sv_uint64 = vd->vdev_guid; 6932185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 6933185029Spjd SE_SLEEP) != 0) 6934185029Spjd goto done; 6935168404Spjd 6936185029Spjd if (vd->vdev_path) { 6937185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6938185029Spjd value.value.sv_string = vd->vdev_path; 6939185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6940185029Spjd &value, SE_SLEEP) != 0) 6941185029Spjd goto done; 6942168404Spjd } 6943168404Spjd } 6944168404Spjd 6945185029Spjd if (sysevent_attach_attributes(ev, attr) != 0) 6946185029Spjd goto done; 6947185029Spjd attr = NULL; 6948168404Spjd 6949185029Spjd (void) log_sysevent(ev, SE_SLEEP, &eid); 6950185029Spjd 6951185029Spjddone: 6952185029Spjd if (attr) 6953185029Spjd sysevent_free_attr(attr); 6954185029Spjd sysevent_free(ev); 6955185029Spjd#endif 6956168404Spjd} 6957