/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_batch,			/* cpu-intensive; value is ignored */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
} zti_modes_t;

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_FIX(100),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};
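
/*
 * Reading the table above (rows follow the I/O-type order given in the
 * comment): the WRITE row, for example, gets a batch "issue" taskq sized
 * as a percentage of online CPUs (zio_taskq_batch_pct), a 5-thread
 * "issue_high" taskq, an 8-thread "intr" taskq and a 5-thread "intr_high"
 * taskq, so bulk write issue can soak up idle CPUs while completion work
 * keeps a small, predictable footprint.
 */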

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size;
	uint64_t alloc;
	uint64_t space;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		space = 0;
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			space += tvd->vdev_max_asize - tvd->vdev_asize;
		}
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
		    src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

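		/*
		 * Capacity is an integer percentage, truncated: e.g.
		 * alloc = 3 out of size = 4 (in any unit) yields cap = 75.
		 * The size == 0 guard avoids a division by zero on a pool
		 * whose space totals are not populated yet.
		 */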
		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		dsl_dir_t *freedir = pool->dp_free_dir;

		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools from before this version, freedir will
		 * be NULL.
		 */
		if (freedir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    freedir->dd_phys->dd_used_bytes, src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
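
/*
 * Sketch of a typical consumer of spa_prop_get() (illustrative only;
 * real callers live in the zfs ioctl paths):
 *
 *	nvlist_t *props = NULL;
 *	if (spa_prop_get(spa, &props) == 0) {
 *		(walk props with nvlist_next_nvpair(); each pair is a
 *		nested nvlist holding ZPROP_VALUE and ZPROP_SOURCE)
 *		nvlist_free(props);
 *	}
 */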

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/* Must be ZPL and not gzip compressed. */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed.  This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked).  We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check.  For this kernel check, we merely
				 * check ASCII apart from DEL.  Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = SET_ERROR(E2BIG);
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}
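
/*
 * Sketch of a props nvlist that passes the validation above
 * (illustrative; any boolean pool property follows the same shape):
 *
 *	nvlist_t *props;
 *	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1) == 0);
 *	error = spa_prop_set(spa, props);
 *	nvlist_free(props);
 */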

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver, 6);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}
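
/*
 * The two callbacks above follow the usual dsl_sync_task() split: the
 * check callback validates preconditions and may fail with an errno,
 * aborting the task before anything is modified; the sync callback then
 * applies the change in syncing context and must succeed.
 */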

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = TASKQ_PREPOPULATE;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

#ifdef SYSDC
	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
#endif
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			spa->spa_zio_taskq[t][q] =
			    spa_taskq_create(spa, name, mode, value);
		}
	}
}
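
/*
 * The taskqs created above are named "<ziotype>_<qtype>", e.g.
 * "write_issue" or "read_intr_high", which is how they can be identified
 * in a kernel thread listing when diagnosing I/O scheduling issues.
 */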

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}
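
/*
 * Everything set up in spa_activate() (metaslab classes, taskqs or the
 * covering process, dirty lists, the error trees) is torn down again,
 * in roughly reverse order, by spa_deactivate() below.
 */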

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
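
/*
 * Sketch of the nvlist shape spa_config_parse() consumes (a mirror of
 * two disks; real trees are produced by vdev_config_generate()):
 *
 *	type = "root"
 *	children[0]:
 *		type = "mirror"
 *		children[0]: type = "disk", path = "/dev/ada0"
 *		children[1]: type = "disk", path = "/dev/ada1"
 */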

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}
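
/*
 * Note the teardown order in spa_unload(): async tasks and the syncer
 * are stopped, and outstanding async I/O is drained, before any vdev is
 * freed, so nothing can issue new I/O to a vdev that is going away.
 */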

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}
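
/*
 * spa_load_l2cache() below follows the same pattern as spa_load_spares():
 * parse sav_config into vdev_t's, open and validate them, then regenerate
 * the stashed nvlist with status information.  The extra wrinkle is that
 * cache devices which are already active are carried over rather than
 * closed and re-opened.
 */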

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}
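
/*
 * load_nvlist() mirrors the layout used when nvlists are synced to the
 * MOS: the object's bonus buffer holds the packed size, and the object
 * data holds the packed nvlist itself (see the write-side counterpart,
 * spa_sync_nvlist(), later in this file).
 */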

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    !vd->vdev_ishole) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd).  Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev, then use that vdev instead.
		 */
		if (tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) {

			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
				continue;

			/*
			 * Device-specific actions.
			 */
			if (mtvd->vdev_islog) {
				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else {
				/*
				 * XXX - once we have 'readonly' pool
				 * support we should be able to handle
				 * missing data devices by transitioning
				 * the pool to readonly.
				 */
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			spa_config_exit(spa, SCL_ALL, FTAG);
			vdev_load(mtvd);
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

			vdev_reopen(rvd);
		} else if (mtvd->vdev_islog) {
			/*
			 * Load the slog device's state from the MOS config
			 * since it's possible that the label does not
			 * contain the most up-to-date information.
			 */
			vdev_load_log_state(tvd, mtvd);
			vdev_reopen(tvd);
		}
	}
	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}
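/*
 * Illustrative sketch (compiled out): spa_config_valid() is consumed
 * during load roughly as below; a B_FALSE return is surfaced to the
 * caller as a bad vdev guid sum (see spa_load_impl() later in this file):
 */
#if 0
	if (!spa_config_valid(spa, nvconfig)) {
		nvlist_free(nvconfig);
		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
	}
#endif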
/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;

	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
		    NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}
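/*
 * Illustrative sketch (compiled out, and an assumption about the caller):
 * spa_passivate_log() and spa_activate_log() are intended to bracket an
 * attempt to evacuate the intent log, e.g. during log device removal; if
 * the evacuation fails, the log metaslab groups are reactivated:
 */
#if 0
	if (spa_passivate_log(spa)) {
		error = spa_offline_log(spa);	/* defined just below */
		if (error != 0)
			spa_activate_log(spa);
	}
#endif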
int
spa_offline_log(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	int i;

	for (i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_add_64(&sle->sle_meta_count, 1);
		else
			atomic_add_64(&sle->sle_data_count, 1);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);
}

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp != NULL) {
		zio_t *rio = arg;
		size_t size = BP_GET_PSIZE(bp);
		void *data = zio_data_buf_alloc(size);

		zio_nowait(zio_read(rio, spa, bp, data, size,
		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	}
	return (0);
}
static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa->spa_verify_min_txg,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		if (error != ENXIO && error != EIO)
			error = SET_ERROR(EIO);
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}

/*
 * Find a value in the pool directory object.
 */
static int
spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
{
	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    name, sizeof (uint64_t), 1, val));
}

static int
spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
{
	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
	return (err);
}
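/*
 * Illustrative sketch (compiled out): spa_dir_prop() lookups tolerate
 * ENOENT, since older pools legitimately lack many directory entries;
 * any other error marks the root vdev corrupt.  The load path below
 * repeats this pattern:
 */
#if 0
	uint64_t obj;

	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &obj);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
#endif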
/*
 * Fix up config after a partly-completed split.  This is done with the
 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
 * pool have that entry in their config, but only the splitting one contains
 * a list of all the guids of the vdevs that are being split off.
 *
 * This function determines what to do with that list: either rejoin
 * all the disks to the pool, or complete the splitting process.  To attempt
 * the rejoin, each disk that is offlined is marked online again, and
 * we do a reopen() call.  If the vdev label for every disk that was
 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
 * then we call vdev_split() on each disk, and complete the split.
 *
 * Otherwise we leave the config alone, with all the vdevs in place in
 * the original pool.
 */
static void
spa_try_repair(spa_t *spa, nvlist_t *config)
{
	uint_t extracted;
	uint64_t *glist;
	uint_t i, gcount;
	nvlist_t *nvl;
	vdev_t **vd;
	boolean_t attempt_reopen;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
		return;

	/* check that the config is complete */
	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    &glist, &gcount) != 0)
		return;

	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);

	/* attempt to online all the vdevs & validate */
	attempt_reopen = B_TRUE;
	for (i = 0; i < gcount; i++) {
		if (glist[i] == 0)	/* vdev is hole */
			continue;

		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
		if (vd[i] == NULL) {
			/*
			 * Don't bother attempting to reopen the disks;
			 * just do the split.
			 */
			attempt_reopen = B_FALSE;
		} else {
			/* attempt to re-online it */
			vd[i]->vdev_offline = B_FALSE;
		}
	}

	if (attempt_reopen) {
		vdev_reopen(spa->spa_root_vdev);

		/* check each device to see what state it's in */
		for (extracted = 0, i = 0; i < gcount; i++) {
			if (vd[i] != NULL &&
			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
				break;
			++extracted;
		}
	}

	/*
	 * If every disk has been moved to the new pool, or if we never
	 * even attempted to look at them, then we split them off for
	 * good.
	 */
	if (!attempt_reopen || gcount == extracted) {
		for (i = 0; i < gcount; i++)
			if (vd[i] != NULL)
				vdev_split(vd[i]);
		vdev_reopen(spa->spa_root_vdev);
	}

	kmem_free(vd, gcount * sizeof (vdev_t *));
}
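/*
 * Illustrative sketch (compiled out): the minimal read-side shape of the
 * split nvlist consumed above, mirroring the lookups in spa_try_repair();
 * a zero guid in the list denotes a hole vdev:
 */
#if 0
	nvlist_t *nvl;
	uint64_t *glist;
	uint_t gcount;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0 &&
	    nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    &glist, &gcount) == 0) {
		/* walk glist[0..gcount-1] */
	}
#endif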
static int
spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
    boolean_t mosconfig)
{
	nvlist_t *config = spa->spa_config;
	char *ereport = FM_EREPORT_ZFS_POOL;
	char *comment;
	int error;
	uint64_t pool_guid;
	nvlist_t *nvl;

	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (SET_ERROR(EINVAL));

	ASSERT(spa->spa_comment == NULL);
	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
		spa->spa_comment = spa_strdup(comment);

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &spa->spa_ubsync.ub_version) != 0)
		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = SET_ERROR(EEXIST);
	} else {
		spa->spa_config_guid = pool_guid;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
		    &nvl) == 0) {
			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
			    KM_SLEEP) == 0);
		}

		nvlist_free(spa->spa_load_info);
		spa->spa_load_info = fnvlist_alloc();

		gethrestime(&spa->spa_loaded_ts);
		error = spa_load_impl(spa, pool_guid, config, state, type,
		    mosconfig, &ereport);
	}

	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error) {
		if (error != EEXIST) {
			spa->spa_loaded_ts.tv_sec = 0;
			spa->spa_loaded_ts.tv_nsec = 0;
		}
		if (error != EBADF) {
			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
		}
	}
	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}
/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	nvlist_t *label;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t children, config_cache_txg = spa->spa_config_txg;
	int orig_mode = spa->spa_mode;
	int parse;
	uint64_t obj;
	boolean_t missing_feat_write = B_FALSE;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
		return (SET_ERROR(EINVAL));

	parse = (type == SPA_IMPORT_EXISTING ?
	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
		return (error);

	ASSERT(spa->spa_root_vdev == rvd);

	if (type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_guid(spa) == pool_guid);
	}

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		return (error);

	/*
	 * We need to validate the vdev labels against the configuration that
	 * we have in hand, which is dependent on the setting of mosconfig. If
	 * mosconfig is true then we're validating the vdev labels based on
	 * that config.  Otherwise, we're validating against the cached config
	 * (zpool.cache) that was read when we loaded the zfs module, and then
	 * later we will recursively call spa_load() and validate against
	 * the vdev config.
	 *
	 * If we're assembling a new pool that's been split off from an
	 * existing pool, the labels haven't yet been updated so we skip
	 * validation for now.
	 */
	if (type != SPA_IMPORT_ASSEMBLE) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		error = vdev_validate(rvd, mosconfig);
		spa_config_exit(spa, SCL_ALL, FTAG);

		if (error != 0)
			return (error);

		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
			return (SET_ERROR(ENXIO));
	}

	/*
	 * Find the best uberblock.
	 */
	vdev_uberblock_load(rvd, ub, &label);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		nvlist_free(label);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
	}

	/*
	 * If the pool has an unsupported version we can't open it.
	 */
	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
		nvlist_free(label);
		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
	}

	if (ub->ub_version >= SPA_VERSION_FEATURES) {
		nvlist_t *features;

		/*
		 * If we weren't able to find what's necessary for reading the
		 * MOS in the label, return failure.
		 */
		if (label == NULL || nvlist_lookup_nvlist(label,
		    ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
			nvlist_free(label);
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
			    ENXIO));
		}

		/*
		 * Update our in-core representation with the definitive values
		 * from the label.
		 */
		nvlist_free(spa->spa_label_features);
		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
	}

	nvlist_free(label);

	/*
	 * Look through entries in the label nvlist's features_for_read. If
	 * there is a feature listed there which we don't understand then we
	 * cannot open a pool.
	 */
	if (ub->ub_version >= SPA_VERSION_FEATURES) {
		nvlist_t *unsup_feat;

		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
		    0);

		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
		    NULL); nvp != NULL;
		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
			if (!zfeature_is_supported(nvpair_name(nvp))) {
				VERIFY(nvlist_add_string(unsup_feat,
				    nvpair_name(nvp), "") == 0);
			}
		}

		if (!nvlist_empty(unsup_feat)) {
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
			nvlist_free(unsup_feat);
			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
			    ENOTSUP));
		}

		nvlist_free(unsup_feat);
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.  We first check to see if the pool
	 * is aware of the complete config (i.e., ZPOOL_CONFIG_VDEV_CHILDREN).
	 * If it is, defer the vdev_guid_sum check till later so we
	 * can handle missing vdevs.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
	    rvd->vdev_guid_sum != ub->ub_guid_sum)
		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));

	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_try_repair(spa, config);
		spa_config_exit(spa, SCL_ALL, FTAG);
		nvlist_free(spa->spa_config_splitting);
		spa->spa_config_splitting = NULL;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
	spa->spa_claim_max_txg = spa->spa_first_txg;
	spa->spa_prev_software_version = ub->ub_software_version;

	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
		boolean_t missing_feat_read = B_FALSE;
		nvlist_t *unsup_feat, *enabled_feat;

		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
		    &spa->spa_feat_for_read_obj) != 0) {
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		}

		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
		    &spa->spa_feat_for_write_obj) != 0) {
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		}

		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
		    &spa->spa_feat_desc_obj) != 0) {
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		}

		enabled_feat = fnvlist_alloc();
		unsup_feat = fnvlist_alloc();

		if (!feature_is_supported(spa->spa_meta_objset,
		    spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
		    unsup_feat, enabled_feat))
			missing_feat_read = B_TRUE;

		if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
			if (!feature_is_supported(spa->spa_meta_objset,
			    spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
			    unsup_feat, enabled_feat)) {
				missing_feat_write = B_TRUE;
			}
		}

		fnvlist_add_nvlist(spa->spa_load_info,
		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);

		if (!nvlist_empty(unsup_feat)) {
			fnvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
		}

		fnvlist_free(enabled_feat);
		fnvlist_free(unsup_feat);

		if (!missing_feat_read) {
			fnvlist_add_boolean(spa->spa_load_info,
			    ZPOOL_CONFIG_CAN_RDONLY);
		}

		/*
		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
		 * twofold: to determine whether the pool is available for
		 * import in read-write mode and (if it is not) whether the
		 * pool is available for import in read-only mode. If the pool
		 * is available for import in read-write mode, it is displayed
		 * as available in userland; if it is not available for import
		 * in read-only mode, it is displayed as unavailable in
		 * userland. If the pool is available for import in read-only
		 * mode but not read-write mode, it is displayed as unavailable
		 * in userland with a special note that the pool is actually
		 * available for open in read-only mode.
		 *
		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
		 * missing a feature for write, we must first determine whether
		 * the pool can be opened read-only before returning to
		 * userland in order to know whether to display the
		 * abovementioned note.
		 */
		if (missing_feat_read || (missing_feat_write &&
		    spa_writeable(spa))) {
			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
			    ENOTSUP));
		}
	}

	spa->spa_is_initializing = B_TRUE;
	error = dsl_pool_open(spa->spa_dsl_pool);
	spa->spa_is_initializing = B_FALSE;
	if (error != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	if (!mosconfig) {
		uint64_t hostid;
		nvlist_t *policy = NULL, *nvconfig;

		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(nvconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

#ifdef _KERNEL
			myhostid = zone_get_hostid(NULL);
#else	/* _KERNEL */
			/*
			 * We're emulating the system's hostid in userland, so
			 * we can't use zone_get_hostid().
			 */
			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
#endif	/* _KERNEL */
			if (check_hostid && hostid != 0 && myhostid != 0 &&
			    hostid != myhostid) {
				nvlist_free(nvconfig);
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://illumos.org/msg/ZFS-8000-EY",
				    spa_name(spa), hostname,
				    (unsigned long)hostid);
				return (SET_ERROR(EBADF));
			}
		}
		if (nvlist_lookup_nvlist(spa->spa_config,
		    ZPOOL_REWIND_POLICY, &policy) == 0)
			VERIFY(nvlist_add_nvlist(nvconfig,
			    ZPOOL_REWIND_POLICY, policy) == 0);

		spa_config_set(spa, nvconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa, orig_mode);

		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
	}

	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
	if (error != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation). If we have an older pool, this will not
	 * be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
	    &spa->spa_creation_version);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the persistent error log. If we have an older pool, this will
	 * not be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
	    &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the history object. If we have an older pool, this
	 * will not be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * If we're assembling the pool from the split-off vdevs of
	 * an existing pool, we don't want to attach the spares & cache
	 * devices.
	 */

	/*
	 * Load any hot spares for this pool.
	 */
	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares.sav_object,
		    &spa->spa_spares.sav_config) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	} else if (error == 0) {
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Load any level 2 ARC devices for this pool.
	 */
	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	} else if (error == 0) {
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
	if (error && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	if (error == 0) {
		uint64_t autoreplace;

		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
		    &spa->spa_dedup_ditto);

		spa->spa_autoreplace = (autoreplace != 0);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices.  We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
		spa_check_removed(spa->spa_root_vdev);
		/*
		 * For the import case, this is done in spa_import(), because
		 * at this point we're using the spare definitions from
		 * the MOS config, not necessarily from the userland config.
		 */
		if (state != SPA_LOAD_IMPORT) {
			spa_aux_check_removed(&spa->spa_spares);
			spa_aux_check_removed(&spa->spa_l2cache);
		}
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Load the DDTs (dedup tables).
	 */
	error = ddt_load(spa);
	if (error != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	spa_update_dspace(spa);

	/*
	 * Validate the config, using the MOS config to fill in any
	 * information which might be missing.  If we fail to validate
	 * the config then declare the pool unfit for use. If we're
	 * assembling a pool from a split, the log is not transferred
	 * over.
	 */
	if (type != SPA_IMPORT_ASSEMBLE) {
		nvlist_t *nvconfig;

		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		if (!spa_config_valid(spa, nvconfig)) {
			nvlist_free(nvconfig);
			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
			    ENXIO));
		}
		nvlist_free(nvconfig);

		/*
		 * Now that we've validated the config, check the state of the
		 * root vdev.  If it can't be opened, it indicates one or
		 * more toplevel vdevs are faulted.
		 */
		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
			return (SET_ERROR(ENXIO));

		if (spa_check_logs(spa)) {
			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
		}
	}

	if (missing_feat_write) {
		ASSERT(state == SPA_LOAD_TRYIMPORT);

		/*
		 * At this point, we know that we can open the pool in
		 * read-only mode but not read-write mode. We now have enough
		 * information and can return to userland.
		 */
		return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
	}

	/*
	 * We've successfully opened the pool, verify that we're ready
	 * to start pushing transactions.
	 */
	if (state != SPA_LOAD_TRYIMPORT) {
		if ((error = spa_load_verify(spa)) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
			    error));
	}

	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
	    spa->spa_load_max_txg == UINT64_MAX)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
		 * invoked from zil_claim_log_block()'s i/o done callback.
		 * Price of rollback is that we abandon the log.
		 */
		spa->spa_claiming = B_TRUE;

		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_claiming = B_FALSE;

		spa_set_log_state(spa, SPA_LOG_GOOD);
		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.  We sync up to the highest
		 * claimed log block birth time so that claimed log blocks
		 * don't appear to be from the future.  spa_claim_max_txg
		 * will have been set for us by either zil_check_log_chain()
		 * (invoked from spa_check_logs()) or zil_claim() above.
		 */
		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 *
		 * If this is a verbatim import, trust the current
		 * in-core spa_config and update the disk labels.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT ||
		    state == SPA_LOAD_RECOVER ||
		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
		    vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		/*
		 * Log the fact that we booted up (so that we can detect if
		 * we rebooted in the middle of an operation).
		 */
		spa_history_log_version(spa, "open");

		/*
		 * Delete any inconsistent datasets.
		 */
		(void) dmu_objset_find(spa_name(spa),
		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

		/*
		 * Clean up any stale temporary dataset userrefs.
		 */
		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
	}

	return (0);
}

static int
spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
{
	int mode = spa->spa_mode;

	spa_unload(spa);
	spa_deactivate(spa);

	spa->spa_load_max_txg--;

	spa_activate(spa, mode);
	spa_async_suspend(spa);

	return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
}

/*
 * If spa_load() fails, this function will try loading prior txgs. If
 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
 * function will not rewind the pool and will return the same error as
 * spa_load().
 */
static int
spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
    uint64_t max_request, int rewind_flags)
{
	nvlist_t *loadinfo = NULL;
	nvlist_t *config = NULL;
	int load_error, rewind_error;
	uint64_t safe_rewind_txg;
	uint64_t min_txg;

	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
		spa->spa_load_max_txg = spa->spa_load_txg;
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		spa->spa_load_max_txg = max_request;
	}

	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
	    mosconfig);
	if (load_error == 0)
		return (0);

	if (spa->spa_root_vdev != NULL)
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

	if (rewind_flags & ZPOOL_NEVER_REWIND) {
		nvlist_free(config);
		return (load_error);
	}

	if (state == SPA_LOAD_RECOVER) {
		/* Price of rolling back is discarding txgs, including log */
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		/*
		 * If we aren't rolling back, save the load info from our
		 * first import attempt so that we can restore it after
		 * attempting to rewind.
		 */
		loadinfo = spa->spa_load_info;
		spa->spa_load_info = fnvlist_alloc();
	}

	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
	    TXG_INITIAL : safe_rewind_txg;

	/*
	 * Continue as long as we're finding errors, we're still within
	 * the acceptable rewind range, and we're still finding uberblocks.
	 */
	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
		if (spa->spa_load_max_txg < safe_rewind_txg)
			spa->spa_extreme_rewind = B_TRUE;
		rewind_error = spa_load_retry(spa, state, mosconfig);
	}

	spa->spa_extreme_rewind = B_FALSE;
	spa->spa_load_max_txg = UINT64_MAX;

	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
		spa_config_set(spa, config);

	if (state == SPA_LOAD_RECOVER) {
		ASSERT3P(loadinfo, ==, NULL);
		return (rewind_error);
	} else {
		/* Store the rewind info as part of the initial load info */
		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
		    spa->spa_load_info);

		/* Restore the initial load info */
		fnvlist_free(spa->spa_load_info);
		spa->spa_load_info = loadinfo;

		return (load_error);
	}
}
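/*
 * Illustrative sketch (compiled out): a recovery rewind is requested by
 * attaching a rewind policy to the config before load; the nvlist key
 * macros used here (ZPOOL_REWIND_REQUEST, ZPOOL_REWIND_REQUEST_TXG) are
 * an assumption -- see zpool_get_rewind_policy() and sys/fs/zfs.h for
 * the authoritative names.  zpool_get_rewind_policy() decodes the policy
 * into the zrp_request and zrp_txg fields consumed above.
 */
#if 0
	nvlist_t *policy;

	VERIFY(nvlist_alloc(&policy, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST,
	    ZPOOL_DO_REWIND) == 0);
	VERIFY(nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, txg) == 0);
	VERIFY(nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY, policy) == 0);
	nvlist_free(policy);
#endif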
/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time as opening the pool, without having to keep around the
 * spa_t in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
    nvlist_t **config)
{
	spa_t *spa;
	spa_load_state_t state = SPA_LOAD_OPEN;
	int error;
	int locked = B_FALSE;
	int firstopen = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENOENT));
	}

	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
		zpool_rewind_policy_t policy;

		firstopen = B_TRUE;

		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
		    &policy);
		if (policy.zrp_request & ZPOOL_DO_REWIND)
			state = SPA_LOAD_RECOVER;

		spa_activate(spa, spa_mode_global);

		if (state != SPA_LOAD_RECOVER)
			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
		    policy.zrp_request);

		if (error == EBADF) {
			/*
			 * If vdev_validate() fails with EBADF, it means that
			 * one of the vdevs indicates that the pool has been
			 * exported or destroyed.  If this is the case, the
			 * config cache is out of sync and we should remove
			 * the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(ENOENT));
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_config) {
				VERIFY(nvlist_dup(spa->spa_config, config,
				    KM_SLEEP) == 0);
				VERIFY(nvlist_add_nvlist(*config,
				    ZPOOL_CONFIG_LOAD_INFO,
				    spa->spa_load_info) == 0);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = error;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}
	}

	spa_open_ref(spa, tag);

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	/*
	 * If we've recovered the pool, pass back any information we
	 * gathered while doing the load.
	 */
	if (state == SPA_LOAD_RECOVER) {
		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
		    spa->spa_load_info) == 0);
	}

	if (locked) {
		spa->spa_last_open_failed = 0;
		spa->spa_last_ubsync_txg = 0;
		spa->spa_load_txg = 0;
		mutex_exit(&spa_namespace_lock);
#ifdef __FreeBSD__
#ifdef _KERNEL
		if (firstopen)
			zvol_create_minors(spa->spa_name);
#endif
#endif
	}

	*spapp = spa;

	return (0);
}
int
spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
    nvlist_t **config)
{
	return (spa_open_common(name, spapp, tag, policy, config));
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL, NULL));
}
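/*
 * Illustrative sketch (compiled out): callers pair spa_open() with
 * spa_close() under the same tag, as spa_get_stats() does below with
 * FTAG; "tank" is a hypothetical pool name.
 */
#if 0
	spa_t *spa;
	int error;

	if ((error = spa_open("tank", &spa, FTAG)) == 0) {
		/* ... use spa while holding the open ref ... */
		spa_close(spa, FTAG);
	}
#endif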
/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Add spare device information to the nvlist.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

/*
 * Add l2cache device information to the nvlist, including vdev stats.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_l2cache.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */
		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
			    == 0);
			vdev_get_stats(vd, vs);
		}
	}
}
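/*
 * Illustrative sketch (compiled out): a consumer of a config annotated by
 * the two functions above finds the aux arrays under the vdev tree; spared
 * spares show up with vs_state == VDEV_STATE_CANT_OPEN and
 * vs_aux == VDEV_AUX_SPARED:
 */
#if 0
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t nspares;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		/* inspect ZPOOL_CONFIG_VDEV_STATS of each entry */
	}
#endif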
*/ 3066192914Sed if (spa_suspended(spa)) 3067192914Sed goto out; 3068192914Sed 3069192914Sed if (spa->spa_feat_for_read_obj != 0) { 3070192830Sed for (zap_cursor_init(&zc, spa->spa_meta_objset, 3071192830Sed spa->spa_feat_for_read_obj); 3072192830Sed zap_cursor_retrieve(&zc, &za) == 0; 3073192830Sed zap_cursor_advance(&zc)) { 3074192830Sed ASSERT(za.za_integer_length == sizeof (uint64_t) && 3075192830Sed za.za_num_integers == 1); 3076192830Sed VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3077192830Sed za.za_first_integer)); 3078192830Sed } 3079192830Sed zap_cursor_fini(&zc); 3080192830Sed } 3081192830Sed 3082192830Sed if (spa->spa_feat_for_write_obj != 0) { 3083192830Sed for (zap_cursor_init(&zc, spa->spa_meta_objset, 3084192830Sed spa->spa_feat_for_write_obj); 3085192830Sed zap_cursor_retrieve(&zc, &za) == 0; 3086192830Sed zap_cursor_advance(&zc)) { 3087192830Sed ASSERT(za.za_integer_length == sizeof (uint64_t) && 3088192830Sed za.za_num_integers == 1); 3089192914Sed VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3090192830Sed za.za_first_integer)); 3091192830Sed } 3092192830Sed zap_cursor_fini(&zc); 3093192830Sed } 3094192830Sed 3095192830Sedout: 3096192830Sed VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3097192830Sed features) == 0); 3098192830Sed nvlist_free(features); 3099192830Sed} 3100192830Sed 3101192830Sedint 3102192830Sedspa_get_stats(const char *name, nvlist_t **config, 3103192830Sed char *altroot, size_t buflen) 3104192830Sed{ 3105192830Sed int error; 3106192830Sed spa_t *spa; 3107192830Sed 3108192830Sed *config = NULL; 3109192830Sed error = spa_open_common(name, &spa, FTAG, NULL, config); 3110192830Sed 3111192830Sed if (spa != NULL) { 3112192830Sed /* 3113192830Sed * This still leaves a window of inconsistency where the spares 3114192830Sed * or l2cache devices could change and the config would be 3115192830Sed * self-inconsistent. 3116192830Sed */ 3117192830Sed spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3118192830Sed 3119192830Sed if (*config != NULL) { 3120192830Sed uint64_t loadtimes[2]; 3121192830Sed 3122192830Sed loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3123192830Sed loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3124192830Sed VERIFY(nvlist_add_uint64_array(*config, 3125192830Sed ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3126192830Sed 3127192830Sed VERIFY(nvlist_add_uint64(*config, 3128192830Sed ZPOOL_CONFIG_ERRCOUNT, 3129192830Sed spa_get_errlog_size(spa)) == 0); 3130192830Sed 3131192830Sed if (spa_suspended(spa)) 3132192830Sed VERIFY(nvlist_add_uint64(*config, 3133192830Sed ZPOOL_CONFIG_SUSPENDED, 3134192830Sed spa->spa_failmode) == 0); 3135192830Sed 3136192830Sed spa_add_spares(spa, *config); 3137192830Sed spa_add_l2cache(spa, *config); 3138192830Sed spa_add_feature_stats(spa, *config); 3139192830Sed } 3140192830Sed } 3141192830Sed 3142192830Sed /* 3143192830Sed * We want to get the alternate root even for faulted pools, so we cheat 3144192830Sed * and call spa_lookup() directly. 
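	 * An illustrative caller-side sketch (the pool name and buffer are
	 * assumptions for illustration, not taken from this file):
	 *
	 *	nvlist_t *config;
	 *	char altroot[MAXPATHLEN];
	 *	int error = spa_get_stats("tank", &config, altroot,
	 *	    sizeof (altroot));
	 *	if (config != NULL)
	 *		nvlist_free(config);
	 *
	 * Note that *config may come back non-NULL even on error, which
	 * appears to be why the altroot handling below tolerates a faulted
	 * pool.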
3145192830Sed */ 3146192830Sed if (altroot) { 3147192830Sed if (spa == NULL) { 3148192830Sed mutex_enter(&spa_namespace_lock); 3149192830Sed spa = spa_lookup(name); 3150192830Sed if (spa) 3151192830Sed spa_altroot(spa, altroot, buflen); 3152192830Sed else 3153192830Sed altroot[0] = '\0'; 3154192830Sed spa = NULL; 3155192830Sed mutex_exit(&spa_namespace_lock); 3156192830Sed } else { 3157192830Sed spa_altroot(spa, altroot, buflen); 3158192830Sed } 3159192830Sed } 3160192830Sed 3161192914Sed if (spa != NULL) { 3162192830Sed spa_config_exit(spa, SCL_CONFIG, FTAG); 3163192830Sed spa_close(spa, FTAG); 3164192830Sed } 3165192830Sed 3166192830Sed return (error); 3167192830Sed} 3168192830Sed 3169192830Sed/* 3170192830Sed * Validate that the auxiliary device array is well formed. We must have an 3171192830Sed * array of nvlists, each which describes a valid leaf vdev. If this is an 3172192830Sed * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3173192830Sed * specified, as long as they are well-formed. 3174192830Sed */ 3175192830Sedstatic int 3176192830Sedspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3177192830Sed spa_aux_vdev_t *sav, const char *config, uint64_t version, 3178192830Sed vdev_labeltype_t label) 3179192830Sed{ 3180192830Sed nvlist_t **dev; 3181192830Sed uint_t i, ndev; 3182192830Sed vdev_t *vd; 3183192830Sed int error; 3184192830Sed 3185192830Sed ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3186192830Sed 3187192830Sed /* 3188192830Sed * It's acceptable to have no devs specified. 3189192830Sed */ 3190192830Sed if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3191192830Sed return (0); 3192192830Sed 3193192830Sed if (ndev == 0) 3194192830Sed return (SET_ERROR(EINVAL)); 3195192830Sed 3196192830Sed /* 3197192830Sed * Make sure the pool is formatted with a version that supports this 3198192830Sed * device type. 3199192830Sed */ 3200192914Sed if (spa_version(spa) < version) 3201192914Sed return (SET_ERROR(ENOTSUP)); 3202192914Sed 3203192830Sed /* 3204192830Sed * Set the pending device list so we correctly handle device in-use 3205192830Sed * checking. 3206192830Sed */ 3207192830Sed sav->sav_pending = dev; 3208192830Sed sav->sav_npending = ndev; 3209192830Sed 3210192830Sed for (i = 0; i < ndev; i++) { 3211192830Sed if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3212192830Sed mode)) != 0) 3213192830Sed goto out; 3214192830Sed 3215192830Sed if (!vd->vdev_ops->vdev_op_leaf) { 3216192830Sed vdev_free(vd); 3217192830Sed error = SET_ERROR(EINVAL); 3218192830Sed goto out; 3219192830Sed } 3220192830Sed 3221192830Sed /* 3222192830Sed * The L2ARC currently only supports disk devices in 3223192830Sed * kernel context. For user-level testing, we allow it. 
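		 * (The user-level allowance is presumably for test harnesses
		 * such as ztest, which exercise this code outside the
		 * kernel.)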
3224192830Sed		 */
3225192830Sed#ifdef _KERNEL
3226192830Sed		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3227192830Sed		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3228192830Sed			error = SET_ERROR(ENOTBLK);
3229192830Sed			vdev_free(vd);
3230192830Sed			goto out;
3231192830Sed		}
3232192830Sed#endif
3233192830Sed		vd->vdev_top = vd;
3234192830Sed
3235192830Sed		if ((error = vdev_open(vd)) == 0 &&
3236192830Sed		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
3237192830Sed			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3238192830Sed			    vd->vdev_guid) == 0);
3239192830Sed		}
3240192830Sed
3241192830Sed		vdev_free(vd);
3242192830Sed
3243192830Sed		if (error &&
3244192830Sed		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3245228627Sdim			goto out;
3246192830Sed		else
3247192830Sed			error = 0;
3248192830Sed	}
3249192830Sed
3250192830Sedout:
3251192830Sed	sav->sav_pending = NULL;
3252192830Sed	sav->sav_npending = 0;
3253192830Sed	return (error);
3254192830Sed}
3255192830Sed
3256192830Sedstatic int
3257192830Sedspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3258192830Sed{
3259192830Sed	int error;
3260192830Sed
3261192830Sed	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3262192830Sed
3263192830Sed	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3264192830Sed	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3265192830Sed	    VDEV_LABEL_SPARE)) != 0) {
3266192830Sed		return (error);
3267192830Sed	}
3268192830Sed
3269192830Sed	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3270192830Sed	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3271192830Sed	    VDEV_LABEL_L2CACHE));
3272192830Sed}
3273192830Sed
3274192830Sedstatic void
3275192830Sedspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3276192830Sed    const char *config)
3277192830Sed{
3278192830Sed	int i;
3279192830Sed
3280192830Sed	if (sav->sav_config != NULL) {
3281192830Sed		nvlist_t **olddevs;
3282192830Sed		uint_t oldndevs;
3283192830Sed		nvlist_t **newdevs;
3284192830Sed
3285192830Sed		/*
3286192830Sed		 * Generate a new dev list by concatenating with the
3287192830Sed		 * current dev list.
3288192830Sed		 */
3289192830Sed		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3290192830Sed		    &olddevs, &oldndevs) == 0);
3291192830Sed
3292192830Sed		newdevs = kmem_alloc(sizeof (void *) *
3293192830Sed		    (ndevs + oldndevs), KM_SLEEP);
3294192830Sed		for (i = 0; i < oldndevs; i++)
3295192830Sed			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3296192830Sed			    KM_SLEEP) == 0);
3297192830Sed		for (i = 0; i < ndevs; i++)
3298192830Sed			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3299192830Sed			    KM_SLEEP) == 0);
3300192830Sed
3301192830Sed		VERIFY(nvlist_remove(sav->sav_config, config,
3302192830Sed		    DATA_TYPE_NVLIST_ARRAY) == 0);
3303192830Sed
3304192830Sed		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3305192914Sed		    config, newdevs, ndevs + oldndevs) == 0);
3306192830Sed		for (i = 0; i < oldndevs + ndevs; i++)
3307192830Sed			nvlist_free(newdevs[i]);
3308192830Sed		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3309192830Sed	} else {
3310192830Sed		/*
3311192830Sed		 * Generate a new dev list.
3312192830Sed */ 3313192830Sed VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3314192830Sed KM_SLEEP) == 0); 3315192830Sed VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3316192830Sed devs, ndevs) == 0); 3317192830Sed } 3318192830Sed} 3319192830Sed 3320192830Sed/* 3321192830Sed * Stop and drop level 2 ARC devices 3322192830Sed */ 3323192830Sedvoid 3324192830Sedspa_l2cache_drop(spa_t *spa) 3325192830Sed{ 3326192830Sed vdev_t *vd; 3327192830Sed int i; 3328192830Sed spa_aux_vdev_t *sav = &spa->spa_l2cache; 3329192830Sed 3330192830Sed for (i = 0; i < sav->sav_count; i++) { 3331192830Sed uint64_t pool; 3332192830Sed 3333192830Sed vd = sav->sav_vdevs[i]; 3334192830Sed ASSERT(vd != NULL); 3335192830Sed 3336192830Sed if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3337192830Sed pool != 0ULL && l2arc_vdev_present(vd)) 3338192830Sed l2arc_remove_vdev(vd); 3339192830Sed } 3340192830Sed} 3341192830Sed 3342192830Sed/* 3343192830Sed * Pool Creation 3344192830Sed */ 3345192830Sedint 3346192830Sedspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3347192830Sed nvlist_t *zplprops) 3348192830Sed{ 3349192914Sed spa_t *spa; 3350192830Sed char *altroot = NULL; 3351192830Sed vdev_t *rvd; 3352192830Sed dsl_pool_t *dp; 3353192830Sed dmu_tx_t *tx; 3354192830Sed int error = 0; 3355192830Sed uint64_t txg = TXG_INITIAL; 3356192830Sed nvlist_t **spares, **l2cache; 3357192830Sed uint_t nspares, nl2cache; 3358192830Sed uint64_t version, obj; 3359192830Sed boolean_t has_features; 3360192830Sed 3361192830Sed /* 3362192830Sed * If this pool already exists, return failure. 3363192830Sed */ 3364192830Sed mutex_enter(&spa_namespace_lock); 3365192830Sed if (spa_lookup(pool) != NULL) { 3366192830Sed mutex_exit(&spa_namespace_lock); 3367192830Sed return (SET_ERROR(EEXIST)); 3368192830Sed } 3369192830Sed 3370192830Sed /* 3371192830Sed * Allocate a new spa_t structure. 3372192830Sed */ 3373192830Sed (void) nvlist_lookup_string(props, 3374192830Sed zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3375192830Sed spa = spa_add(pool, NULL, altroot); 3376192830Sed spa_activate(spa, spa_mode_global); 3377192830Sed 3378192830Sed if (props && (error = spa_prop_validate(spa, props))) { 3379192830Sed spa_deactivate(spa); 3380192830Sed spa_remove(spa); 3381192830Sed mutex_exit(&spa_namespace_lock); 3382192830Sed return (error); 3383192830Sed } 3384192830Sed 3385192830Sed has_features = B_FALSE; 3386192830Sed for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3387192830Sed elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3388192830Sed if (zpool_prop_feature(nvpair_name(elem))) 3389192830Sed has_features = B_TRUE; 3390192830Sed } 3391192830Sed 3392192830Sed if (has_features || nvlist_lookup_uint64(props, 3393192830Sed zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3394192830Sed version = SPA_VERSION; 3395192830Sed } 3396192830Sed ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3397192830Sed 3398192830Sed spa->spa_first_txg = txg; 3399192830Sed spa->spa_uberblock.ub_txg = txg - 1; 3400192830Sed spa->spa_uberblock.ub_version = version; 3401192830Sed spa->spa_ubsync = spa->spa_uberblock; 3402192830Sed 3403192830Sed /* 3404192830Sed * Create "The Godfather" zio to hold all async IOs 3405192830Sed */ 3406192830Sed spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 3407192830Sed ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 3408192830Sed 3409192830Sed /* 3410192830Sed * Create the root vdev. 
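	 * A minimal sketch of the nvroot nvlist that spa_config_parse()
	 * consumes here (the leaf path is an assumption for illustration):
	 *
	 *	ZPOOL_CONFIG_TYPE     = VDEV_TYPE_ROOT
	 *	ZPOOL_CONFIG_CHILDREN = [
	 *		{ ZPOOL_CONFIG_TYPE = "mirror",
	 *		  ZPOOL_CONFIG_CHILDREN = [
	 *			{ ZPOOL_CONFIG_TYPE = VDEV_TYPE_DISK,
	 *			  ZPOOL_CONFIG_PATH = "/dev/ada0" },
	 *			... ] },
	 *		... ]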
3411192830Sed */ 3412192830Sed spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3413192830Sed 3414192830Sed error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3415192830Sed 3416192830Sed ASSERT(error != 0 || rvd != NULL); 3417192830Sed ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3418192830Sed 3419192830Sed if (error == 0 && !zfs_allocatable_devs(nvroot)) 3420192830Sed error = SET_ERROR(EINVAL); 3421192830Sed 3422192830Sed if (error == 0 && 3423192830Sed (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3424192830Sed (error = spa_validate_aux(spa, nvroot, txg, 3425192830Sed VDEV_ALLOC_ADD)) == 0) { 3426192830Sed for (int c = 0; c < rvd->vdev_children; c++) { 3427192830Sed vdev_ashift_optimize(rvd->vdev_child[c]); 3428192830Sed vdev_metaslab_set_size(rvd->vdev_child[c]); 3429192830Sed vdev_expand(rvd->vdev_child[c], txg); 3430192830Sed } 3431192914Sed } 3432192914Sed 3433192914Sed spa_config_exit(spa, SCL_ALL, FTAG); 3434192914Sed 3435192830Sed if (error != 0) { 3436196818Sache spa_unload(spa); 3437192830Sed spa_deactivate(spa); 3438196818Sache spa_remove(spa); 3439192830Sed mutex_exit(&spa_namespace_lock); 3440192830Sed return (error); 3441192830Sed } 3442196818Sache 3443192830Sed /* 3444192830Sed * Get the list of spares, if specified. 3445192830Sed */ 3446192830Sed if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3447192830Sed &spares, &nspares) == 0) { 3448192830Sed VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3449192830Sed KM_SLEEP) == 0); 3450192830Sed VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3451192830Sed ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3452192830Sed spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3453192830Sed spa_load_spares(spa); 3454192830Sed spa_config_exit(spa, SCL_ALL, FTAG); 3455192830Sed spa->spa_spares.sav_sync = B_TRUE; 3456192830Sed } 3457192830Sed 3458192830Sed /* 3459192830Sed * Get the list of level 2 cache devices, if specified. 3460192830Sed */ 3461192830Sed if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3462192830Sed &l2cache, &nl2cache) == 0) { 3463192830Sed VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3464192830Sed NV_UNIQUE_NAME, KM_SLEEP) == 0); 3465192830Sed VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3466192830Sed ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3467192830Sed spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3468192830Sed spa_load_l2cache(spa); 3469192830Sed spa_config_exit(spa, SCL_ALL, FTAG); 3470192830Sed spa->spa_l2cache.sav_sync = B_TRUE; 3471192830Sed } 3472192830Sed 3473192830Sed spa->spa_is_initializing = B_TRUE; 3474192830Sed spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3475192830Sed spa->spa_meta_objset = dp->dp_meta_objset; 3476192830Sed spa->spa_is_initializing = B_FALSE; 3477192830Sed 3478192830Sed /* 3479192830Sed * Create DDTs (dedup tables). 3480192830Sed */ 3481192830Sed ddt_create(spa); 3482192830Sed 3483192830Sed spa_update_dspace(spa); 3484192830Sed 3485192830Sed tx = dmu_tx_create_assigned(dp, txg); 3486192830Sed 3487192830Sed /* 3488192830Sed * Create the pool config object. 
3489192830Sed */ 3490192830Sed spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3491192830Sed DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3492192830Sed DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3493192830Sed 3494192830Sed if (zap_add(spa->spa_meta_objset, 3495192830Sed DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3496192830Sed sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3497192830Sed cmn_err(CE_PANIC, "failed to add pool config"); 3498192830Sed } 3499192830Sed 3500192830Sed if (spa_version(spa) >= SPA_VERSION_FEATURES) 3501192830Sed spa_feature_create_zap_objects(spa, tx); 3502192830Sed 3503192830Sed if (zap_add(spa->spa_meta_objset, 3504192830Sed DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3505192830Sed sizeof (uint64_t), 1, &version, tx) != 0) { 3506192830Sed cmn_err(CE_PANIC, "failed to add pool version"); 3507192830Sed } 3508192830Sed 3509192830Sed /* Newly created pools with the right version are always deflated. */ 3510192830Sed if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3511192830Sed spa->spa_deflate = TRUE; 3512192830Sed if (zap_add(spa->spa_meta_objset, 3513192830Sed DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3514192830Sed sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3515192830Sed cmn_err(CE_PANIC, "failed to add deflate"); 3516192830Sed } 3517192830Sed } 3518192830Sed 3519192830Sed /* 3520192830Sed * Create the deferred-free bpobj. Turn off compression 3521192830Sed * because sync-to-convergence takes longer if the blocksize 3522192830Sed * keeps changing. 3523192830Sed */ 3524192830Sed obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3525192830Sed dmu_object_set_compress(spa->spa_meta_objset, obj, 3526192830Sed ZIO_COMPRESS_OFF, tx); 3527192830Sed if (zap_add(spa->spa_meta_objset, 3528192830Sed DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3529192830Sed sizeof (uint64_t), 1, &obj, tx) != 0) { 3530192830Sed cmn_err(CE_PANIC, "failed to add bpobj"); 3531192830Sed } 3532192830Sed VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3533192830Sed spa->spa_meta_objset, obj)); 3534192830Sed 3535192830Sed /* 3536192830Sed * Create the pool's history object. 3537192830Sed */ 3538192830Sed if (version >= SPA_VERSION_ZPOOL_HISTORY) 3539192830Sed spa_history_create_obj(spa, tx); 3540192830Sed 3541192830Sed /* 3542192830Sed * Set pool properties. 3543192830Sed */ 3544192830Sed spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3545192830Sed spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3546192830Sed spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3547192830Sed spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3548192830Sed 3549192830Sed if (props != NULL) { 3550192830Sed spa_configfile_set(spa, props, B_FALSE); 3551192830Sed spa_sync_props(props, tx); 3552192830Sed } 3553192830Sed 3554192830Sed dmu_tx_commit(tx); 3555192830Sed 3556192830Sed spa->spa_sync_on = B_TRUE; 3557192830Sed txg_sync_start(spa->spa_dsl_pool); 3558192830Sed 3559192830Sed /* 3560192830Sed * We explicitly wait for the first transaction to complete so that our 3561192830Sed * bean counters are appropriately updated. 
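	 * (txg_wait_synced() below returns only once that first txg is on
	 * disk, so space accounting queried right after creation should
	 * already be accurate.)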
3562192830Sed */ 3563192830Sed txg_wait_synced(spa->spa_dsl_pool, txg); 3564192830Sed 3565192830Sed spa_config_sync(spa, B_FALSE, B_TRUE); 3566192830Sed 3567192830Sed spa_history_log_version(spa, "create"); 3568192830Sed 3569192830Sed spa->spa_minref = refcount_count(&spa->spa_refcount); 3570192830Sed 3571192830Sed mutex_exit(&spa_namespace_lock); 3572192830Sed 3573192830Sed return (0); 3574192830Sed} 3575192830Sed 3576192830Sed#ifdef _KERNEL 3577192830Sed#if defined(sun) 3578192830Sed/* 3579192830Sed * Get the root pool information from the root disk, then import the root pool 3580192830Sed * during the system boot up time. 3581192830Sed */ 3582192830Sedextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3583192830Sed 3584192830Sedstatic nvlist_t * 3585192830Sedspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3586192830Sed{ 3587192830Sed nvlist_t *config; 3588192830Sed nvlist_t *nvtop, *nvroot; 3589192830Sed uint64_t pgid; 3590192830Sed 3591192830Sed if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3592192830Sed return (NULL); 3593192830Sed 3594192830Sed /* 3595192830Sed * Add this top-level vdev to the child array. 3596192830Sed */ 3597192830Sed VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3598192830Sed &nvtop) == 0); 3599192830Sed VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3600192830Sed &pgid) == 0); 3601192830Sed VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3602192830Sed 3603192830Sed /* 3604192830Sed * Put this pool's top-level vdevs into a root vdev. 3605192830Sed */ 3606192830Sed VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3607192830Sed VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3608192830Sed VDEV_TYPE_ROOT) == 0); 3609192830Sed VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3610192830Sed VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3611192830Sed VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3612192830Sed &nvtop, 1) == 0); 3613192830Sed 3614192830Sed /* 3615192830Sed * Replace the existing vdev_tree with the new root vdev in 3616192830Sed * this pool's configuration (remove the old, add the new). 3617192830Sed */ 3618192830Sed VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3619192830Sed nvlist_free(nvroot); 3620192830Sed return (config); 3621192830Sed} 3622192830Sed 3623192830Sed/* 3624192830Sed * Walk the vdev tree and see if we can find a device with "better" 3625192830Sed * configuration. A configuration is "better" if the label on that 3626192830Sed * device has a more recent txg. 3627192830Sed */ 3628192830Sedstatic void 3629192830Sedspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3630192830Sed{ 3631192830Sed for (int c = 0; c < vd->vdev_children; c++) 3632192830Sed spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3633192830Sed 3634192830Sed if (vd->vdev_ops->vdev_op_leaf) { 3635192830Sed nvlist_t *label; 3636192830Sed uint64_t label_txg; 3637192830Sed 3638192830Sed if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3639192830Sed &label) != 0) 3640192830Sed return; 3641192830Sed 3642192830Sed VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3643192830Sed &label_txg) == 0); 3644192830Sed 3645192830Sed /* 3646192830Sed * Do we have a better boot device? 
3647192830Sed	 */
3648192830Sed		if (label_txg > *txg) {
3649192830Sed			*txg = label_txg;
3650192830Sed			*avd = vd;
3651192830Sed		}
3652192830Sed		nvlist_free(label);
3653192830Sed	}
3654192830Sed}
3655192830Sed
3656192830Sed/*
3657192830Sed * Import a root pool.
3658192830Sed *
3659192830Sed * For x86, devpath_list will consist of the devid and/or physpath name of
3660192830Sed * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3661192830Sed * The GRUB "findroot" command will return the vdev we should boot.
3662192830Sed *
3663192830Sed * For SPARC, devpath_list consists of the physpath name of the booting
3664192830Sed * device, whether the root pool is a single-device pool or a mirrored pool,
3665192830Sed * e.g.
3666192830Sed *	"/pci@1f,0/ide@d/disk@0,0:a"
3667192830Sed */
3668192830Sedint
3669192830Sedspa_import_rootpool(char *devpath, char *devid)
3670192830Sed{
3671192830Sed	spa_t *spa;
3672192830Sed	vdev_t *rvd, *bvd, *avd = NULL;
3673192830Sed	nvlist_t *config, *nvtop;
3674192830Sed	uint64_t guid, txg;
3675192830Sed	char *pname;
3676192914Sed	int error;
3677192914Sed
3678192830Sed	/*
3679192830Sed	 * Read the label from the boot device and generate a configuration.
3680192830Sed	 */
3681192830Sed	config = spa_generate_rootconf(devpath, devid, &guid);
3682192830Sed#if defined(_OBP) && defined(_KERNEL)
3683192830Sed	if (config == NULL) {
3684192830Sed		if (strstr(devpath, "/iscsi/ssd") != NULL) {
3685192830Sed			/* iscsi boot */
3686192830Sed			get_iscsi_bootpath_phy(devpath);
3687192830Sed			config = spa_generate_rootconf(devpath, devid, &guid);
3688192830Sed		}
3689192830Sed	}
3690192830Sed#endif
3691192830Sed	if (config == NULL) {
3692192830Sed		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3693192830Sed		    devpath);
3694192830Sed		return (SET_ERROR(EIO));
3695192830Sed	}
3696192830Sed
3697192830Sed	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3698192830Sed	    &pname) == 0);
3699192830Sed	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3700192830Sed
3701192830Sed	mutex_enter(&spa_namespace_lock);
3702192830Sed	if ((spa = spa_lookup(pname)) != NULL) {
3703192830Sed		/*
3704192830Sed		 * Remove the existing root pool from the namespace so that we
3705192914Sed		 * can replace it with the correct config we just read in.
3706192914Sed		 */
3707192914Sed		spa_remove(spa);
3708192914Sed	}
3709192914Sed
3710192830Sed	spa = spa_add(pname, config, NULL);
3711192830Sed	spa->spa_is_root = B_TRUE;
3712192830Sed	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3713192830Sed
3714192830Sed	/*
3715192830Sed	 * Build up a vdev tree based on the boot device's label config.
3716192830Sed	 */
3717192830Sed	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3718192830Sed	    &nvtop) == 0);
3719192830Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3720192830Sed	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3721192830Sed	    VDEV_ALLOC_ROOTPOOL);
3722192830Sed	spa_config_exit(spa, SCL_ALL, FTAG);
3723192830Sed	if (error) {
3724192830Sed		mutex_exit(&spa_namespace_lock);
3725192830Sed		nvlist_free(config);
3726192830Sed		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3727192830Sed		    pname);
3728192830Sed		return (error);
3729192830Sed	}
3730192830Sed
3731192830Sed	/*
3732192830Sed	 * Get the boot vdev.
3733192830Sed */ 3734192830Sed if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3735192830Sed cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3736192830Sed (u_longlong_t)guid); 3737192830Sed error = SET_ERROR(ENOENT); 3738192830Sed goto out; 3739192830Sed } 3740192830Sed 3741192830Sed /* 3742192830Sed * Determine if there is a better boot device. 3743192914Sed */ 3744192830Sed avd = bvd; 3745192830Sed spa_alt_rootvdev(rvd, &avd, &txg); 3746192830Sed if (avd != bvd) { 3747192830Sed cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3748192830Sed "try booting from '%s'", avd->vdev_path); 3749192830Sed error = SET_ERROR(EINVAL); 3750192830Sed goto out; 3751192830Sed } 3752192830Sed 3753192830Sed /* 3754192830Sed * If the boot device is part of a spare vdev then ensure that 3755192830Sed * we're booting off the active spare. 3756192830Sed */ 3757192830Sed if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3758192830Sed !bvd->vdev_isspare) { 3759192830Sed cmn_err(CE_NOTE, "The boot device is currently spared. Please " 3760192830Sed "try booting from '%s'", 3761192830Sed bvd->vdev_parent-> 3762192830Sed vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3763192830Sed error = SET_ERROR(EINVAL); 3764192830Sed goto out; 3765192830Sed } 3766192830Sed 3767192830Sed error = 0; 3768192830Sedout: 3769192830Sed spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3770192830Sed vdev_free(rvd); 3771192830Sed spa_config_exit(spa, SCL_ALL, FTAG); 3772192830Sed mutex_exit(&spa_namespace_lock); 3773192830Sed 3774192830Sed nvlist_free(config); 3775192830Sed return (error); 3776192830Sed} 3777192830Sed 3778192830Sed#else 3779192856Sed 3780192830Sedextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 3781192830Sed uint64_t *count); 3782192830Sed 3783192830Sedstatic nvlist_t * 3784192830Sedspa_generate_rootconf(const char *name) 3785192830Sed{ 3786192830Sed nvlist_t **configs, **tops; 3787192830Sed nvlist_t *config; 3788192830Sed nvlist_t *best_cfg, *nvtop, *nvroot; 3789192830Sed uint64_t *holes; 3790192830Sed uint64_t best_txg; 3791192830Sed uint64_t nchildren; 3792192830Sed uint64_t pgid; 3793192830Sed uint64_t count; 3794192830Sed uint64_t i; 3795192830Sed uint_t nholes; 3796192914Sed 3797192830Sed if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 3798192914Sed return (NULL); 3799192830Sed 3800192830Sed ASSERT3U(count, !=, 0); 3801192830Sed best_txg = 0; 3802192830Sed for (i = 0; i < count; i++) { 3803192830Sed uint64_t txg; 3804192830Sed 3805192830Sed VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 3806192830Sed &txg) == 0); 3807192830Sed if (txg > best_txg) { 3808192830Sed best_txg = txg; 3809192830Sed best_cfg = configs[i]; 3810192830Sed } 3811192830Sed } 3812192830Sed 3813192830Sed /* 3814192830Sed * Multi-vdev root pool configuration discovery is not supported yet. 
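	 * The loops below therefore rebuild the children array from the
	 * labels that were found.  A sketch with assumed values: for
	 * nchildren = 3, a label found only for id 0 and a hole recorded
	 * at id 1, tops[] ends up as
	 *
	 *	tops[0] = vdev tree from configs[0]
	 *	tops[1] = { type = VDEV_TYPE_HOLE,    id = 1, guid = 0 }
	 *	tops[2] = { type = VDEV_TYPE_MISSING, id = 2, guid = 0 }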
3815192830Sed */ 3816192856Sed nchildren = 1; 3817192830Sed nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 3818192830Sed holes = NULL; 3819192830Sed nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 3820192830Sed &holes, &nholes); 3821192830Sed 3822192830Sed tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 3823192830Sed for (i = 0; i < nchildren; i++) { 3824192830Sed if (i >= count) 3825192830Sed break; 3826192830Sed if (configs[i] == NULL) 3827192830Sed continue; 3828192830Sed VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 3829192830Sed &nvtop) == 0); 3830192830Sed nvlist_dup(nvtop, &tops[i], KM_SLEEP); 3831192830Sed } 3832192830Sed for (i = 0; holes != NULL && i < nholes; i++) { 3833192914Sed if (i >= nchildren) 3834192830Sed continue; 3835192830Sed if (tops[holes[i]] != NULL) 3836192830Sed continue; 3837192830Sed nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 3838192830Sed VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 3839192830Sed VDEV_TYPE_HOLE) == 0); 3840192830Sed VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 3841192830Sed holes[i]) == 0); 3842192830Sed VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 3843192830Sed 0) == 0); 3844192830Sed } 3845192830Sed for (i = 0; i < nchildren; i++) { 3846192830Sed if (tops[i] != NULL) 3847192830Sed continue; 3848192830Sed nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 3849192830Sed VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 3850192830Sed VDEV_TYPE_MISSING) == 0); 3851192830Sed VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 3852192830Sed i) == 0); 3853192830Sed VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 3854192830Sed 0) == 0); 3855192830Sed } 3856192830Sed 3857192830Sed /* 3858192830Sed * Create pool config based on the best vdev config. 3859192830Sed */ 3860192830Sed nvlist_dup(best_cfg, &config, KM_SLEEP); 3861192830Sed 3862192830Sed /* 3863192830Sed * Put this pool's top-level vdevs into a root vdev. 3864192830Sed */ 3865192830Sed VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3866192830Sed &pgid) == 0); 3867192830Sed VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3868192830Sed VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3869192830Sed VDEV_TYPE_ROOT) == 0); 3870192830Sed VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3871192830Sed VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3872192830Sed VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3873192830Sed tops, nchildren) == 0); 3874192830Sed 3875192830Sed /* 3876192830Sed * Replace the existing vdev_tree with the new root vdev in 3877192830Sed * this pool's configuration (remove the old, add the new). 3878192830Sed */ 3879192830Sed VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3880192830Sed 3881192830Sed /* 3882192830Sed * Drop vdev config elements that should not be present at pool level. 
3883192830Sed */ 3884192830Sed nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 3885192830Sed nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 3886192830Sed 3887192830Sed for (i = 0; i < count; i++) 3888192830Sed nvlist_free(configs[i]); 3889192830Sed kmem_free(configs, count * sizeof(void *)); 3890192830Sed for (i = 0; i < nchildren; i++) 3891192830Sed nvlist_free(tops[i]); 3892192830Sed kmem_free(tops, nchildren * sizeof(void *)); 3893192830Sed nvlist_free(nvroot); 3894192830Sed return (config); 3895192830Sed} 3896192830Sed 3897192830Sedint 3898192830Sedspa_import_rootpool(const char *name) 3899192830Sed{ 3900192830Sed spa_t *spa; 3901192830Sed vdev_t *rvd, *bvd, *avd = NULL; 3902192830Sed nvlist_t *config, *nvtop; 3903192830Sed uint64_t txg; 3904192830Sed char *pname; 3905192830Sed int error; 3906192830Sed 3907192830Sed /* 3908192830Sed * Read the label from the boot device and generate a configuration. 3909192830Sed */ 3910192830Sed config = spa_generate_rootconf(name); 3911192830Sed 3912192830Sed mutex_enter(&spa_namespace_lock); 3913192830Sed if (config != NULL) { 3914192830Sed VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3915192830Sed &pname) == 0 && strcmp(name, pname) == 0); 3916192830Sed VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 3917192830Sed == 0); 3918192830Sed 3919192830Sed if ((spa = spa_lookup(pname)) != NULL) { 3920192830Sed /* 3921192830Sed * Remove the existing root pool from the namespace so 3922192830Sed * that we can replace it with the correct config 3923192830Sed * we just read in. 3924192830Sed */ 3925192830Sed spa_remove(spa); 3926192830Sed } 3927192830Sed spa = spa_add(pname, config, NULL); 3928192830Sed 3929192830Sed /* 3930192830Sed * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 3931192830Sed * via spa_version(). 3932192830Sed */ 3933192830Sed if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3934192830Sed &spa->spa_ubsync.ub_version) != 0) 3935192830Sed spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3936192830Sed } else if ((spa = spa_lookup(name)) == NULL) { 3937192830Sed cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 3938192830Sed name); 3939192830Sed return (EIO); 3940192830Sed } else { 3941192830Sed VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 3942192830Sed } 3943192830Sed spa->spa_is_root = B_TRUE; 3944192830Sed spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3945192830Sed 3946192830Sed /* 3947192830Sed * Build up a vdev tree based on the boot device's label config. 
3948192830Sed */ 3949192830Sed VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3950192830Sed &nvtop) == 0); 3951192830Sed spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3952192830Sed error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3953192830Sed VDEV_ALLOC_ROOTPOOL); 3954192830Sed spa_config_exit(spa, SCL_ALL, FTAG); 3955192830Sed if (error) { 3956192830Sed mutex_exit(&spa_namespace_lock); 3957192830Sed nvlist_free(config); 3958192830Sed cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3959192914Sed pname); 3960192830Sed return (error); 3961192830Sed } 3962192830Sed 3963192830Sed spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3964192830Sed vdev_free(rvd); 3965192830Sed spa_config_exit(spa, SCL_ALL, FTAG); 3966192914Sed mutex_exit(&spa_namespace_lock); 3967192830Sed 3968192830Sed nvlist_free(config); 3969192830Sed return (0); 3970192830Sed} 3971192830Sed 3972192830Sed#endif /* sun */ 3973192830Sed#endif 3974192830Sed 3975192830Sed/* 3976192830Sed * Import a non-root pool into the system. 3977192830Sed */ 3978192830Sedint 3979192830Sedspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3980192830Sed{ 3981192830Sed spa_t *spa; 3982192830Sed char *altroot = NULL; 3983192830Sed spa_load_state_t state = SPA_LOAD_IMPORT; 3984192830Sed zpool_rewind_policy_t policy; 3985192830Sed uint64_t mode = spa_mode_global; 3986192830Sed uint64_t readonly = B_FALSE; 3987192830Sed int error; 3988192830Sed nvlist_t *nvroot; 3989192830Sed nvlist_t **spares, **l2cache; 3990192830Sed uint_t nspares, nl2cache; 3991192830Sed 3992192830Sed /* 3993192830Sed * If a pool with this name exists, return failure. 3994192830Sed */ 3995192830Sed mutex_enter(&spa_namespace_lock); 3996192830Sed if (spa_lookup(pool) != NULL) { 3997192830Sed mutex_exit(&spa_namespace_lock); 3998192830Sed return (SET_ERROR(EEXIST)); 3999192830Sed } 4000192830Sed 4001192830Sed /* 4002192830Sed * Create and initialize the spa structure. 4003192830Sed */ 4004192830Sed (void) nvlist_lookup_string(props, 4005192830Sed zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4006192830Sed (void) nvlist_lookup_uint64(props, 4007192830Sed zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4008192830Sed if (readonly) 4009192830Sed mode = FREAD; 4010192830Sed spa = spa_add(pool, config, altroot); 4011192830Sed spa->spa_import_flags = flags; 4012192830Sed 4013192830Sed /* 4014192830Sed * Verbatim import - Take a pool and insert it into the namespace 4015192830Sed * as if it had been loaded at boot. 4016192830Sed */ 4017192830Sed if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4018192830Sed if (props != NULL) 4019192830Sed spa_configfile_set(spa, props, B_FALSE); 4020192830Sed 4021192830Sed spa_config_sync(spa, B_FALSE, B_TRUE); 4022192830Sed 4023192830Sed mutex_exit(&spa_namespace_lock); 4024192830Sed spa_history_log_version(spa, "import"); 4025192830Sed 4026192830Sed return (0); 4027192830Sed } 4028192830Sed 4029192830Sed spa_activate(spa, mode); 4030192830Sed 4031192830Sed /* 4032192830Sed * Don't start async tasks until we know everything is healthy. 4033192830Sed */ 4034192830Sed spa_async_suspend(spa); 4035192830Sed 4036192830Sed zpool_get_rewind_policy(config, &policy); 4037192830Sed if (policy.zrp_request & ZPOOL_DO_REWIND) 4038192830Sed state = SPA_LOAD_RECOVER; 4039192830Sed 4040192830Sed /* 4041192830Sed * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4042192830Sed * because the user-supplied config is actually the one to trust when 4043192830Sed * doing an import. 
4044192830Sed	 */
4045192830Sed	if (state != SPA_LOAD_RECOVER)
4046192830Sed		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4047192830Sed
4048192830Sed	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
4049192830Sed	    policy.zrp_request);
4050192830Sed
4051192830Sed	/*
4052192830Sed	 * Propagate anything learned while loading the pool and pass it
4053192830Sed	 * back to the caller (e.g. rewind info, missing devices, etc.).
4054192830Sed	 */
4055192830Sed	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4056192830Sed	    spa->spa_load_info) == 0);
4057192830Sed
4058192830Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4059192830Sed	/*
4060192830Sed	 * Toss any existing sparelist, as it doesn't have any validity
4061192830Sed	 * anymore, and conflicts with spa_has_spare().
4062192830Sed	 */
4063192830Sed	if (spa->spa_spares.sav_config) {
4064192830Sed		nvlist_free(spa->spa_spares.sav_config);
4065192830Sed		spa->spa_spares.sav_config = NULL;
4066192830Sed		spa_load_spares(spa);
4067192830Sed	}
4068192830Sed	if (spa->spa_l2cache.sav_config) {
4069192830Sed		nvlist_free(spa->spa_l2cache.sav_config);
4070192830Sed		spa->spa_l2cache.sav_config = NULL;
4071192830Sed		spa_load_l2cache(spa);
4072192830Sed	}
4073192830Sed
4074192830Sed	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4075192830Sed	    &nvroot) == 0);
4076192830Sed	if (error == 0)
4077192830Sed		error = spa_validate_aux(spa, nvroot, -1ULL,
4078192830Sed		    VDEV_ALLOC_SPARE);
4079192830Sed	if (error == 0)
4080192830Sed		error = spa_validate_aux(spa, nvroot, -1ULL,
4081192830Sed		    VDEV_ALLOC_L2CACHE);
4082192830Sed	spa_config_exit(spa, SCL_ALL, FTAG);
4083192830Sed
4084192830Sed	if (props != NULL)
4085192830Sed		spa_configfile_set(spa, props, B_FALSE);
4086192830Sed
4087192830Sed	if (error != 0 || (props && spa_writeable(spa) &&
4088192830Sed	    (error = spa_prop_set(spa, props)))) {
4089192830Sed		spa_unload(spa);
4090192830Sed		spa_deactivate(spa);
4091192830Sed		spa_remove(spa);
4092192830Sed		mutex_exit(&spa_namespace_lock);
4093192830Sed		return (error);
4094192830Sed	}
4095192830Sed
4096192830Sed	spa_async_resume(spa);
4097192830Sed
4098192830Sed	/*
4099192830Sed	 * Override any spares and level 2 cache devices as specified by
4100192830Sed	 * the user, as these may have correct device names/devids, etc.
4101192830Sed */ 4102192830Sed if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4103192830Sed &spares, &nspares) == 0) { 4104192830Sed if (spa->spa_spares.sav_config) 4105192830Sed VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4106192830Sed ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4107192830Sed else 4108192830Sed VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4109192830Sed NV_UNIQUE_NAME, KM_SLEEP) == 0); 4110192830Sed VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4111192830Sed ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4112192830Sed spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4113192830Sed spa_load_spares(spa); 4114192830Sed spa_config_exit(spa, SCL_ALL, FTAG); 4115192830Sed spa->spa_spares.sav_sync = B_TRUE; 4116192830Sed } 4117192830Sed if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4118192830Sed &l2cache, &nl2cache) == 0) { 4119192830Sed if (spa->spa_l2cache.sav_config) 4120192830Sed VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4121192830Sed ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4122192830Sed else 4123192830Sed VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4124192830Sed NV_UNIQUE_NAME, KM_SLEEP) == 0); 4125192856Sed VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4126192830Sed ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4127192830Sed spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4128192830Sed spa_load_l2cache(spa); 4129192830Sed spa_config_exit(spa, SCL_ALL, FTAG); 4130192830Sed spa->spa_l2cache.sav_sync = B_TRUE; 4131192830Sed } 4132192830Sed 4133192830Sed /* 4134192830Sed * Check for any removed devices. 4135192830Sed */ 4136192830Sed if (spa->spa_autoreplace) { 4137192830Sed spa_aux_check_removed(&spa->spa_spares); 4138192830Sed spa_aux_check_removed(&spa->spa_l2cache); 4139192830Sed } 4140192830Sed 4141192830Sed if (spa_writeable(spa)) { 4142192914Sed /* 4143192914Sed * Update the config cache to include the newly-imported pool. 4144192830Sed */ 4145192830Sed spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4146192830Sed } 4147192830Sed 4148192830Sed /* 4149192830Sed * It's possible that the pool was expanded while it was exported. 4150192830Sed * We kick off an async task to handle this for us. 4151192830Sed */ 4152192830Sed spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4153192830Sed 4154192830Sed mutex_exit(&spa_namespace_lock); 4155192830Sed spa_history_log_version(spa, "import"); 4156192830Sed 4157192830Sed#ifdef __FreeBSD__ 4158192830Sed#ifdef _KERNEL 4159192830Sed zvol_create_minors(pool); 4160192914Sed#endif 4161192830Sed#endif 4162192830Sed return (0); 4163192830Sed} 4164192830Sed 4165192830Sednvlist_t * 4166192830Sedspa_tryimport(nvlist_t *tryconfig) 4167192830Sed{ 4168192830Sed nvlist_t *config = NULL; 4169192830Sed char *poolname; 4170192830Sed spa_t *spa; 4171192830Sed uint64_t state; 4172192830Sed int error; 4173192830Sed 4174192830Sed if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4175192830Sed return (NULL); 4176192830Sed 4177192830Sed if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4178192830Sed return (NULL); 4179192830Sed 4180192830Sed /* 4181192830Sed * Create and initialize the spa structure. 4182192830Sed */ 4183192830Sed mutex_enter(&spa_namespace_lock); 4184192830Sed spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4185192830Sed spa_activate(spa, FREAD); 4186192830Sed 4187192830Sed /* 4188192830Sed * Pass off the heavy lifting to spa_load(). 
4189192830Sed * Pass TRUE for mosconfig because the user-supplied config 4190192830Sed * is actually the one to trust when doing an import. 4191192914Sed */ 4192192830Sed error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4193192830Sed 4194192830Sed /* 4195192830Sed * If 'tryconfig' was at least parsable, return the current config. 4196192830Sed */ 4197192830Sed if (spa->spa_root_vdev != NULL) { 4198192830Sed config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4199192830Sed VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4200192830Sed poolname) == 0); 4201192830Sed VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4202192830Sed state) == 0); 4203192830Sed VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4204192830Sed spa->spa_uberblock.ub_timestamp) == 0); 4205192830Sed VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4206192830Sed spa->spa_load_info) == 0); 4207192830Sed 4208192830Sed /* 4209192830Sed * If the bootfs property exists on this pool then we 4210192830Sed * copy it out so that external consumers can tell which 4211192830Sed * pools are bootable. 4212192830Sed */ 4213192830Sed if ((!error || error == EEXIST) && spa->spa_bootfs) { 4214192830Sed char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4215192830Sed 4216192830Sed /* 4217192830Sed * We have to play games with the name since the 4218192830Sed * pool was opened as TRYIMPORT_NAME. 4219192830Sed */ 4220192830Sed if (dsl_dsobj_to_dsname(spa_name(spa), 4221192830Sed spa->spa_bootfs, tmpname) == 0) { 4222192830Sed char *cp; 4223192830Sed char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4224192830Sed 4225192830Sed cp = strchr(tmpname, '/'); 4226192830Sed if (cp == NULL) { 4227192830Sed (void) strlcpy(dsname, tmpname, 4228192830Sed MAXPATHLEN); 4229192830Sed } else { 4230192830Sed (void) snprintf(dsname, MAXPATHLEN, 4231192830Sed "%s/%s", poolname, ++cp); 4232192830Sed } 4233192830Sed VERIFY(nvlist_add_string(config, 4234192830Sed ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4235192830Sed kmem_free(dsname, MAXPATHLEN); 4236192830Sed } 4237192830Sed kmem_free(tmpname, MAXPATHLEN); 4238192830Sed } 4239192830Sed 4240192830Sed /* 4241192830Sed * Add the list of hot spares and level 2 cache devices. 4242192830Sed */ 4243192830Sed spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4244192830Sed spa_add_spares(spa, config); 4245192830Sed spa_add_l2cache(spa, config); 4246192830Sed spa_config_exit(spa, SCL_CONFIG, FTAG); 4247192830Sed } 4248192830Sed 4249192830Sed spa_unload(spa); 4250192830Sed spa_deactivate(spa); 4251192830Sed spa_remove(spa); 4252192830Sed mutex_exit(&spa_namespace_lock); 4253192830Sed 4254192830Sed return (config); 4255192830Sed} 4256192830Sed 4257192830Sed/* 4258192830Sed * Pool export/destroy 4259192830Sed * 4260192830Sed * The act of destroying or exporting a pool is very simple. We make sure there 4261192830Sed * is no more pending I/O and any references to the pool are gone. Then, we 4262192830Sed * update the pool state and sync all the labels to disk, removing the 4263192830Sed * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4264192830Sed * we don't sync the labels or remove the configuration cache. 
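 *
 * The entry points further down map onto this common routine as follows
 * (a summary of the code below, not additional behavior):
 *
 *	spa_destroy(pool) -> spa_export_common(pool, POOL_STATE_DESTROYED, ...)
 *	spa_export(pool)  -> spa_export_common(pool, POOL_STATE_EXPORTED, ...)
 *	spa_reset(pool)   -> spa_export_common(pool, POOL_STATE_UNINITIALIZED, ...)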
4265192830Sed */
4266192830Sedstatic int
4267192830Sedspa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
4268192830Sed    boolean_t force, boolean_t hardforce)
4269192830Sed{
4270192830Sed	spa_t *spa;
4271192830Sed
4272192830Sed	if (oldconfig)
4273192830Sed		*oldconfig = NULL;
4274192830Sed
4275192830Sed	if (!(spa_mode_global & FWRITE))
4276192830Sed		return (SET_ERROR(EROFS));
4277192830Sed
4278192830Sed	mutex_enter(&spa_namespace_lock);
4279192830Sed	if ((spa = spa_lookup(pool)) == NULL) {
4280192830Sed		mutex_exit(&spa_namespace_lock);
4281192830Sed		return (SET_ERROR(ENOENT));
4282192830Sed	}
4283192830Sed
4284192830Sed	/*
4285192830Sed	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
4286192830Sed	 * reacquire the namespace lock, and see if we can export.
4287192830Sed	 */
4288192830Sed	spa_open_ref(spa, FTAG);
4289192830Sed	mutex_exit(&spa_namespace_lock);
4290192830Sed	spa_async_suspend(spa);
4291192830Sed	mutex_enter(&spa_namespace_lock);
4292192830Sed	spa_close(spa, FTAG);
4293192830Sed
4294192830Sed	/*
4295192830Sed	 * The pool will be in core if it's openable,
4296192830Sed	 * in which case we can modify its state.
4297192830Sed	 */
4298192830Sed	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
4299192830Sed		/*
4300192830Sed		 * Objsets may be open only because they're dirty, so we
4301192830Sed		 * have to force it to sync before checking spa_refcnt.
4302192830Sed		 */
4303192830Sed		txg_wait_synced(spa->spa_dsl_pool, 0);
4304192830Sed
4305192830Sed		/*
4306192830Sed		 * A pool cannot be exported or destroyed if there are active
4307192830Sed		 * references.  If we are resetting a pool, allow references by
4308192830Sed		 * fault injection handlers.
4309192830Sed		 */
4310192830Sed		if (!spa_refcount_zero(spa) ||
4311192830Sed		    (spa->spa_inject_ref != 0 &&
4312192830Sed		    new_state != POOL_STATE_UNINITIALIZED)) {
4313192830Sed			spa_async_resume(spa);
4314192830Sed			mutex_exit(&spa_namespace_lock);
4315192914Sed			return (SET_ERROR(EBUSY));
4316192830Sed		}
4317192830Sed
4318192830Sed		/*
4319192830Sed		 * A pool cannot be exported if it has an active shared spare.
4320192830Sed		 * This is to prevent other pools stealing the active spare
4321192830Sed		 * from an exported pool.  If the user so chooses, such a pool
4322192830Sed		 * can still be forcibly exported.
4323192830Sed		 */
4324192830Sed		if (!force && new_state == POOL_STATE_EXPORTED &&
4325192830Sed		    spa_has_active_shared_spare(spa)) {
4326192830Sed			spa_async_resume(spa);
4327192830Sed			mutex_exit(&spa_namespace_lock);
4328192830Sed			return (SET_ERROR(EXDEV));
4329192830Sed		}
4330192830Sed
4331192830Sed		/*
4332192830Sed		 * We want this to be reflected on every label,
4333192830Sed		 * so mark them all dirty.  spa_unload() will do the
4334192830Sed		 * final sync that pushes these changes out.
4335192830Sed */ 4336192830Sed if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4337192830Sed spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4338192830Sed spa->spa_state = new_state; 4339192830Sed spa->spa_final_txg = spa_last_synced_txg(spa) + 4340192830Sed TXG_DEFER_SIZE + 1; 4341192830Sed vdev_config_dirty(spa->spa_root_vdev); 4342192830Sed spa_config_exit(spa, SCL_ALL, FTAG); 4343192830Sed } 4344192830Sed } 4345192830Sed 4346192830Sed spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4347192830Sed 4348192830Sed if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4349192830Sed spa_unload(spa); 4350192830Sed spa_deactivate(spa); 4351192830Sed } 4352192830Sed 4353192830Sed if (oldconfig && spa->spa_config) 4354192830Sed VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4355192830Sed 4356192830Sed if (new_state != POOL_STATE_UNINITIALIZED) { 4357192830Sed if (!hardforce) 4358192830Sed spa_config_sync(spa, B_TRUE, B_TRUE); 4359192830Sed spa_remove(spa); 4360192830Sed } 4361192830Sed mutex_exit(&spa_namespace_lock); 4362192914Sed 4363192830Sed return (0); 4364192830Sed} 4365192830Sed 4366192830Sed/* 4367192830Sed * Destroy a storage pool. 4368192830Sed */ 4369192830Sedint 4370192830Sedspa_destroy(char *pool) 4371192830Sed{ 4372192830Sed return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4373192830Sed B_FALSE, B_FALSE)); 4374192830Sed} 4375192830Sed 4376192830Sed/* 4377192830Sed * Export a storage pool. 4378192830Sed */ 4379192830Sedint 4380192830Sedspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4381192830Sed boolean_t hardforce) 4382192830Sed{ 4383192830Sed return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4384192830Sed force, hardforce)); 4385192830Sed} 4386192830Sed 4387192830Sed/* 4388192830Sed * Similar to spa_export(), this unloads the spa_t without actually removing it 4389192830Sed * from the namespace in any way. 4390192830Sed */ 4391192830Sedint 4392192830Sedspa_reset(char *pool) 4393192830Sed{ 4394192830Sed return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4395192830Sed B_FALSE, B_FALSE)); 4396192830Sed} 4397192830Sed 4398192830Sed/* 4399192830Sed * ========================================================================== 4400192830Sed * Device manipulation 4401192830Sed * ========================================================================== 4402192830Sed */ 4403192830Sed 4404192830Sed/* 4405192830Sed * Add a device to a storage pool. 
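 *
 * An illustrative caller-side sketch (build_vdev_nvlist() is a
 * hypothetical helper; in the real call path the nvlist typically
 * arrives from userland via the ZFS ioctl layer):
 *
 *	nvlist_t *nvroot = build_vdev_nvlist(...);
 *	error = spa_vdev_add(spa, nvroot);
 *
 * nvroot may carry new top-level children and/or ZPOOL_CONFIG_SPARES /
 * ZPOOL_CONFIG_L2CACHE arrays; all three cases are handled below.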
4406192830Sed */ 4407192830Sedint 4408192830Sedspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4409192830Sed{ 4410192830Sed uint64_t txg, id; 4411192830Sed int error; 4412192830Sed vdev_t *rvd = spa->spa_root_vdev; 4413192830Sed vdev_t *vd, *tvd; 4414192830Sed nvlist_t **spares, **l2cache; 4415192830Sed uint_t nspares, nl2cache; 4416192830Sed 4417192830Sed ASSERT(spa_writeable(spa)); 4418192830Sed 4419192830Sed txg = spa_vdev_enter(spa); 4420192830Sed 4421192830Sed if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4422192830Sed VDEV_ALLOC_ADD)) != 0) 4423192830Sed return (spa_vdev_exit(spa, NULL, txg, error)); 4424192830Sed 4425192830Sed spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4426192830Sed 4427192830Sed if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4428192830Sed &nspares) != 0) 4429192856Sed nspares = 0; 4430192830Sed 4431192856Sed if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4432192830Sed &nl2cache) != 0) 4433192830Sed nl2cache = 0; 4434192830Sed 4435192830Sed if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4436192830Sed return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4437192856Sed 4438192914Sed if (vd->vdev_children != 0 && 4439192856Sed (error = vdev_create(vd, txg, B_FALSE)) != 0) 4440192856Sed return (spa_vdev_exit(spa, vd, txg, error)); 4441192856Sed 4442192856Sed /* 4443192856Sed * We must validate the spares and l2cache devices after checking the 4444192856Sed * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4445192856Sed */ 4446192856Sed if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4447192830Sed return (spa_vdev_exit(spa, vd, txg, error)); 4448192830Sed 4449192830Sed /* 4450192830Sed * Transfer each new top-level vdev from vd to rvd. 4451192830Sed */ 4452192830Sed for (int c = 0; c < vd->vdev_children; c++) { 4453192830Sed 4454192830Sed /* 4455192830Sed * Set the vdev id to the first hole, if one exists. 4456192830Sed */ 4457192830Sed for (id = 0; id < rvd->vdev_children; id++) { 4458192830Sed if (rvd->vdev_child[id]->vdev_ishole) { 4459192830Sed vdev_free(rvd->vdev_child[id]); 4460192830Sed break; 4461192830Sed } 4462192830Sed } 4463192830Sed tvd = vd->vdev_child[c]; 4464192830Sed vdev_remove_child(vd, tvd); 4465192830Sed tvd->vdev_id = id; 4466192830Sed vdev_add_child(rvd, tvd); 4467192830Sed vdev_config_dirty(tvd); 4468192830Sed } 4469192914Sed 4470192830Sed if (nspares != 0) { 4471192830Sed spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4472192830Sed ZPOOL_CONFIG_SPARES); 4473192830Sed spa_load_spares(spa); 4474192830Sed spa->spa_spares.sav_sync = B_TRUE; 4475192830Sed } 4476192830Sed 4477192830Sed if (nl2cache != 0) { 4478192914Sed spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4479192830Sed ZPOOL_CONFIG_L2CACHE); 4480192830Sed spa_load_l2cache(spa); 4481192830Sed spa->spa_l2cache.sav_sync = B_TRUE; 4482192914Sed } 4483192830Sed 4484192830Sed /* 4485192830Sed * We have to be careful when adding new vdevs to an existing pool. 4486192830Sed * If other threads start allocating from these vdevs before we 4487192914Sed * sync the config cache, and we lose power, then upon reboot we may 4488192830Sed * fail to open the pool because there are DVAs that the config cache 4489192830Sed * can't translate. Therefore, we first add the vdevs without 4490192830Sed * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4491192830Sed * and then let spa_config_update() initialize the new metaslabs. 
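	 *
	 * To summarize the resulting order of operations:
	 *
	 *	1. add the new vdevs without initializing their metaslabs;
	 *	2. spa_vdev_exit() syncs the config cache;
	 *	3. spa_config_update() initializes the new metaslabs.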
4492192830Sed * 4493192830Sed * spa_load() checks for added-but-not-initialized vdevs, so that 4494192830Sed * if we lose power at any point in this sequence, the remaining 4495192830Sed * steps will be completed the next time we load the pool. 4496192830Sed */ 4497192830Sed (void) spa_vdev_exit(spa, vd, txg, 0); 4498192830Sed 4499192830Sed mutex_enter(&spa_namespace_lock); 4500192830Sed spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4501192830Sed mutex_exit(&spa_namespace_lock); 4502192830Sed 4503192830Sed return (0); 4504192830Sed} 4505192830Sed 4506192830Sed/* 4507192830Sed * Attach a device to a mirror. The arguments are the path to any device 4508192830Sed * in the mirror, and the nvroot for the new device. If the path specifies 4509192830Sed * a device that is not mirrored, we automatically insert the mirror vdev. 4510192830Sed * 4511192830Sed * If 'replacing' is specified, the new device is intended to replace the 4512192830Sed * existing device; in this case the two devices are made into their own 4513192830Sed * mirror using the 'replacing' vdev, which is functionally identical to 4514192830Sed * the mirror vdev (it actually reuses all the same ops) but has a few 4515192830Sed * extra rules: you can't attach to it after it's been created, and upon 4516192830Sed * completion of resilvering, the first disk (the one being replaced) 4517192830Sed * is automatically detached. 4518192830Sed */ 4519192830Sedint 4520192830Sedspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4521192830Sed{ 4522192830Sed uint64_t txg, dtl_max_txg; 4523192830Sed vdev_t *rvd = spa->spa_root_vdev; 4524192830Sed vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4525192830Sed vdev_ops_t *pvops; 4526192830Sed char *oldvdpath, *newvdpath; 4527192830Sed int newvd_isspare; 4528192830Sed int error; 4529192830Sed 4530192830Sed ASSERT(spa_writeable(spa)); 4531192830Sed 4532192830Sed txg = spa_vdev_enter(spa); 4533192830Sed 4534192830Sed oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4535192830Sed 4536192830Sed if (oldvd == NULL) 4537192830Sed return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4538192914Sed 4539192830Sed if (!oldvd->vdev_ops->vdev_op_leaf) 4540192830Sed return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4541192830Sed 4542192830Sed pvd = oldvd->vdev_parent; 4543192830Sed 4544192830Sed if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4545192830Sed VDEV_ALLOC_ATTACH)) != 0) 4546192830Sed return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4547192830Sed 4548192830Sed if (newrootvd->vdev_children != 1) 4549192830Sed return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4550192830Sed 4551192830Sed newvd = newrootvd->vdev_child[0]; 4552192830Sed 4553192914Sed if (!newvd->vdev_ops->vdev_op_leaf) 4554192830Sed return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4555192830Sed 4556192830Sed if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4557192830Sed return (spa_vdev_exit(spa, newrootvd, txg, error)); 4558192830Sed 4559192830Sed /* 4560192914Sed * Spares can't replace logs 4561192830Sed */ 4562192830Sed if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4563192830Sed return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4564192830Sed 4565192830Sed if (!replacing) { 4566192830Sed /* 4567192830Sed * For attach, the only allowable parent is a mirror or the root 4568192830Sed * vdev. 
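		 * E.g. (shapes assumed for illustration): attaching a new
		 * disk N to a plain disk D inserts a mirror above it,
		 *
		 *	root -> D    becomes    root -> mirror(D, N)
		 *
		 * while a replacing attach instead yields
		 *
		 *	root -> replacing(D, N)
		 *
		 * where D is detached automatically once resilvering
		 * completes.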
4569192830Sed */ 4570192830Sed if (pvd->vdev_ops != &vdev_mirror_ops && 4571192830Sed pvd->vdev_ops != &vdev_root_ops) 4572192830Sed return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4573192830Sed 4574192830Sed pvops = &vdev_mirror_ops; 4575192830Sed } else { 4576192830Sed /* 4577192830Sed * Active hot spares can only be replaced by inactive hot 4578192830Sed * spares. 4579192830Sed */ 4580192830Sed if (pvd->vdev_ops == &vdev_spare_ops && 4581192830Sed oldvd->vdev_isspare && 4582192830Sed !spa_has_spare(spa, newvd->vdev_guid)) 4583192830Sed return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4584192830Sed 4585192830Sed /* 4586192830Sed * If the source is a hot spare, and the parent isn't already a 4587192830Sed * spare, then we want to create a new hot spare. Otherwise, we 4588192830Sed * want to create a replacing vdev. The user is not allowed to 4589192830Sed * attach to a spared vdev child unless the 'isspare' state is 4590192830Sed * the same (spare replaces spare, non-spare replaces 4591192830Sed * non-spare). 4592192830Sed */ 4593192830Sed if (pvd->vdev_ops == &vdev_replacing_ops && 4594192830Sed spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4595192830Sed return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4596192830Sed } else if (pvd->vdev_ops == &vdev_spare_ops && 4597192830Sed newvd->vdev_isspare != oldvd->vdev_isspare) { 4598192830Sed return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4599192830Sed } 4600192830Sed 4601192830Sed if (newvd->vdev_isspare) 4602192830Sed pvops = &vdev_spare_ops; 4603192830Sed else 4604192830Sed pvops = &vdev_replacing_ops; 4605192830Sed } 4606192830Sed 4607192830Sed /* 4608192830Sed * Make sure the new device is big enough. 4609192830Sed */ 4610192830Sed if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4611192830Sed return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4612192830Sed 4613192830Sed /* 4614192830Sed * The new device cannot have a higher alignment requirement 4615192830Sed * than the top-level vdev. 4616192830Sed */ 4617192830Sed if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4618192830Sed return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4619192830Sed 4620192830Sed /* 4621192830Sed * If this is an in-place replacement, update oldvd's path and devid 4622192830Sed * to make it distinguishable from newvd, and unopenable from now on. 4623192830Sed */ 4624192830Sed if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4625192830Sed spa_strfree(oldvd->vdev_path); 4626192830Sed oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4627192830Sed KM_SLEEP); 4628192830Sed (void) sprintf(oldvd->vdev_path, "%s/%s", 4629192830Sed newvd->vdev_path, "old"); 4630192830Sed if (oldvd->vdev_devid != NULL) { 4631192830Sed spa_strfree(oldvd->vdev_devid); 4632192830Sed oldvd->vdev_devid = NULL; 4633192830Sed } 4634192830Sed } 4635192830Sed 4636192830Sed /* mark the device being resilvered */ 4637192830Sed newvd->vdev_resilver_txg = txg; 4638192830Sed 4639192830Sed /* 4640192830Sed * If the parent is not a mirror, or if we're replacing, insert the new 4641192830Sed * mirror/replacing/spare vdev above oldvd. 4642192830Sed */ 4643192830Sed if (pvd->vdev_ops != pvops) 4644192830Sed pvd = vdev_add_parent(oldvd, pvops); 4645192830Sed 4646192830Sed ASSERT(pvd->vdev_top->vdev_parent == rvd); 4647192830Sed ASSERT(pvd->vdev_ops == pvops); 4648192830Sed ASSERT(oldvd->vdev_parent == pvd); 4649192830Sed 4650192830Sed /* 4651192830Sed * Extract the new device from its root and add it to pvd. 
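 * newvd gets the next free vdev_id, so it always sorts after the
 * existing children; spa_vdev_resilver_done_hunt() relies on this
 * oldest-first, newest-last ordering when picking a vdev to detach.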
4652192830Sed */ 4653192830Sed vdev_remove_child(newrootvd, newvd); 4654192830Sed newvd->vdev_id = pvd->vdev_children; 4655192830Sed newvd->vdev_crtxg = oldvd->vdev_crtxg; 4656192830Sed vdev_add_child(pvd, newvd); 4657192830Sed 4658192830Sed tvd = newvd->vdev_top; 4659192830Sed ASSERT(pvd->vdev_top == tvd); 4660192830Sed ASSERT(tvd->vdev_parent == rvd); 4661192830Sed 4662192830Sed vdev_config_dirty(tvd); 4663192830Sed 4664192830Sed /* 4665192830Sed * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4666192830Sed * for any dmu_sync-ed blocks. It will propagate upward when 4667192830Sed * spa_vdev_exit() calls vdev_dtl_reassess(). 4668192830Sed */ 4669192830Sed dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4670192830Sed 4671192830Sed vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4672192830Sed dtl_max_txg - TXG_INITIAL); 4673192830Sed 4674192830Sed if (newvd->vdev_isspare) { 4675192830Sed spa_spare_activate(newvd); 4676192830Sed spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4677192830Sed } 4678192830Sed 4679192830Sed oldvdpath = spa_strdup(oldvd->vdev_path); 4680192830Sed newvdpath = spa_strdup(newvd->vdev_path); 4681192830Sed newvd_isspare = newvd->vdev_isspare; 4682192830Sed 4683192830Sed /* 4684192830Sed * Mark newvd's DTL dirty in this txg. 4685192830Sed */ 4686192830Sed vdev_dirty(tvd, VDD_DTL, newvd, txg); 4687192830Sed 4688192830Sed /* 4689192830Sed * Restart the resilver 4690192830Sed */ 4691192830Sed dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4692192830Sed 4693192830Sed /* 4694192830Sed * Commit the config 4695192830Sed */ 4696192830Sed (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4697192830Sed 4698192830Sed spa_history_log_internal(spa, "vdev attach", NULL, 4699192830Sed "%s vdev=%s %s vdev=%s", 4700192830Sed replacing && newvd_isspare ? "spare in" : 4701192830Sed replacing ? "replace" : "attach", newvdpath, 4702192830Sed replacing ? "for" : "to", oldvdpath); 4703192830Sed 4704192830Sed spa_strfree(oldvdpath); 4705192830Sed spa_strfree(newvdpath); 4706192830Sed 4707192830Sed if (spa->spa_bootfs) 4708192830Sed spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4709192830Sed 4710192830Sed return (0); 4711192830Sed} 4712192830Sed 4713192830Sed/* 4714192830Sed * Detach a device from a mirror or replacing vdev. 4715192830Sed * 4716192830Sed * If 'replace_done' is specified, only detach if the parent 4717192830Sed * is a replacing vdev. 4718192830Sed */ 4719192830Sedint 4720192830Sedspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4721192830Sed{ 4722192830Sed uint64_t txg; 4723192830Sed int error; 4724192830Sed vdev_t *rvd = spa->spa_root_vdev; 4725192830Sed vdev_t *vd, *pvd, *cvd, *tvd; 4726192830Sed boolean_t unspare = B_FALSE; 4727192830Sed uint64_t unspare_guid = 0; 4728192830Sed char *vdpath; 4729192830Sed 4730192830Sed ASSERT(spa_writeable(spa)); 4731192830Sed 4732192830Sed txg = spa_vdev_enter(spa); 4733192830Sed 4734192830Sed vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4735192830Sed 4736192830Sed if (vd == NULL) 4737192830Sed return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4738192830Sed 4739192830Sed if (!vd->vdev_ops->vdev_op_leaf) 4740192830Sed return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4741192830Sed 4742192830Sed pvd = vd->vdev_parent; 4743192830Sed 4744192830Sed /* 4745192830Sed * If the parent/child relationship is not as expected, don't do it. 4746192830Sed * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4747192830Sed * vdev that's replacing B with C. 
The user's intent in replacing 4748192830Sed * is to go from M(A,B) to M(A,C). If the user decides to cancel 4749192830Sed * the replace by detaching C, the expected behavior is to end up 4750192830Sed * M(A,B). But suppose that right after deciding to detach C, 4751192830Sed * the replacement of B completes. We would have M(A,C), and then 4752192830Sed * ask to detach C, which would leave us with just A -- not what 4753192830Sed * the user wanted. To prevent this, we make sure that the 4754192830Sed * parent/child relationship hasn't changed -- in this example, 4755192830Sed * that C's parent is still the replacing vdev R. 4756192830Sed */ 4757192830Sed if (pvd->vdev_guid != pguid && pguid != 0) 4758192830Sed return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4759192830Sed 4760192830Sed /* 4761192830Sed * Only 'replacing' or 'spare' vdevs can be replaced. 4762192830Sed */ 4763192830Sed if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4764192830Sed pvd->vdev_ops != &vdev_spare_ops) 4765192830Sed return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4766192830Sed 4767192830Sed ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4768192830Sed spa_version(spa) >= SPA_VERSION_SPARES); 4769192830Sed 4770192830Sed /* 4771192830Sed * Only mirror, replacing, and spare vdevs support detach. 4772192830Sed */ 4773192830Sed if (pvd->vdev_ops != &vdev_replacing_ops && 4774192830Sed pvd->vdev_ops != &vdev_mirror_ops && 4775192830Sed pvd->vdev_ops != &vdev_spare_ops) 4776192830Sed return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4777192830Sed 4778192830Sed /* 4779192830Sed * If this device has the only valid copy of some data, 4780192830Sed * we cannot safely detach it. 4781192830Sed */ 4782192830Sed if (vdev_dtl_required(vd)) 4783192830Sed return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4784192830Sed 4785192830Sed ASSERT(pvd->vdev_children >= 2); 4786192830Sed 4787192830Sed /* 4788192830Sed * If we are detaching the second disk from a replacing vdev, then 4789192830Sed * check to see if we changed the original vdev's path to have "/old" 4790192830Sed * at the end in spa_vdev_attach(). If so, undo that change now. 4791192830Sed */ 4792192830Sed if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4793192830Sed vd->vdev_path != NULL) { 4794192830Sed size_t len = strlen(vd->vdev_path); 4795192830Sed 4796192830Sed for (int c = 0; c < pvd->vdev_children; c++) { 4797192830Sed cvd = pvd->vdev_child[c]; 4798192830Sed 4799192830Sed if (cvd == vd || cvd->vdev_path == NULL) 4800192830Sed continue; 4801192830Sed 4802192830Sed if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4803192830Sed strcmp(cvd->vdev_path + len, "/old") == 0) { 4804192830Sed spa_strfree(cvd->vdev_path); 4805192830Sed cvd->vdev_path = spa_strdup(vd->vdev_path); 4806192830Sed break; 4807192830Sed } 4808192830Sed } 4809192830Sed } 4810192830Sed 4811192830Sed /* 4812192830Sed * If we are detaching the original disk from a spare, then it implies 4813192830Sed * that the spare should become a real disk, and be removed from the 4814192830Sed * active spare list for the pool. 4815192830Sed */ 4816192830Sed if (pvd->vdev_ops == &vdev_spare_ops && 4817192830Sed vd->vdev_id == 0 && 4818192830Sed pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4819192830Sed unspare = B_TRUE; 4820192830Sed 4821192830Sed /* 4822192830Sed * Erase the disk labels so the disk can be used for other things. 4823192830Sed * This must be done after all other error cases are handled, 4824192830Sed * but before we disembowel vd (so we can still do I/O to it). 
4825192830Sed * But if we can't do it, don't treat the error as fatal -- 4826192830Sed * it may be that the unwritability of the disk is the reason 4827192830Sed * it's being detached! 4828192830Sed */ 4829192830Sed error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4830192830Sed 4831192830Sed /* 4832192830Sed * Remove vd from its parent and compact the parent's children. 4833192830Sed */ 4834192830Sed vdev_remove_child(pvd, vd); 4835192830Sed vdev_compact_children(pvd); 4836192830Sed 4837192830Sed /* 4838192830Sed * Remember one of the remaining children so we can get tvd below. 4839192830Sed */ 4840192830Sed cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4841192830Sed 4842192830Sed /* 4843192830Sed * If we need to remove the remaining child from the list of hot spares, 4844192830Sed * do it now, marking the vdev as no longer a spare in the process. 4845192830Sed * We must do this before vdev_remove_parent(), because that can 4846192830Sed * change the GUID if it creates a new toplevel GUID. For a similar 4847192830Sed * reason, we must remove the spare now, in the same txg as the detach; 4848192830Sed * otherwise someone could attach a new sibling, change the GUID, and 4849192830Sed * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4850192830Sed */ 4851192830Sed if (unspare) { 4852192830Sed ASSERT(cvd->vdev_isspare); 4853192830Sed spa_spare_remove(cvd); 4854192830Sed unspare_guid = cvd->vdev_guid; 4855192830Sed (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4856192830Sed cvd->vdev_unspare = B_TRUE; 4857192830Sed } 4858192830Sed 4859192830Sed /* 4860192830Sed * If the parent mirror/replacing vdev only has one child, 4861192830Sed * the parent is no longer needed. Remove it from the tree. 4862192914Sed */ 4863192830Sed if (pvd->vdev_children == 1) { 4864192914Sed if (pvd->vdev_ops == &vdev_spare_ops) 4865192830Sed cvd->vdev_unspare = B_FALSE; 4866192830Sed vdev_remove_parent(cvd); 4867192830Sed } 4868192830Sed 4869192830Sed 4870192830Sed /* 4871192830Sed * We don't set tvd until now because the parent we just removed 4872192830Sed * may have been the previous top-level vdev. 4873192830Sed */ 4874192830Sed tvd = cvd->vdev_top; 4875192830Sed ASSERT(tvd->vdev_parent == rvd); 4876192830Sed 4877192830Sed /* 4878192830Sed * Reevaluate the parent vdev state. 4879192830Sed */ 4880192830Sed vdev_propagate_state(cvd); 4881192830Sed 4882192830Sed /* 4883192830Sed * If the 'autoexpand' property is set on the pool then automatically 4884192830Sed * try to expand the size of the pool. For example if the device we 4885192830Sed * just detached was smaller than the others, it may be possible to 4886192830Sed * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4887192830Sed * first so that we can obtain the updated sizes of the leaf vdevs. 4888192830Sed */ 4889192830Sed if (spa->spa_autoexpand) { 4890192830Sed vdev_reopen(tvd); 4891192830Sed vdev_expand(tvd, txg); 4892192830Sed } 4893192830Sed 4894192830Sed vdev_config_dirty(tvd); 4895192830Sed 4896192830Sed /* 4897192830Sed * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4898192830Sed * vd->vdev_detached is set and free vd's DTL object in syncing context. 4899192830Sed * But first make sure we're not on any *other* txg's DTL list, to 4900192830Sed * prevent vd from being accessed after it's freed. 
4901192830Sed */ 4902192830Sed vdpath = spa_strdup(vd->vdev_path); 4903192830Sed for (int t = 0; t < TXG_SIZE; t++) 4904192830Sed (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4905192830Sed vd->vdev_detached = B_TRUE; 4906192830Sed vdev_dirty(tvd, VDD_DTL, vd, txg); 4907192914Sed 4908192830Sed spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4909192830Sed 4910192830Sed /* hang on to the spa before we release the lock */ 4911192830Sed spa_open_ref(spa, FTAG); 4912192830Sed 4913192830Sed error = spa_vdev_exit(spa, vd, txg, 0); 4914192830Sed 4915192830Sed spa_history_log_internal(spa, "detach", NULL, 4916192830Sed "vdev=%s", vdpath); 4917192830Sed spa_strfree(vdpath); 4918192830Sed 4919192830Sed /* 4920192830Sed * If this was the removal of the original device in a hot spare vdev, 4921192830Sed * then we want to go through and remove the device from the hot spare 4922192830Sed * list of every other pool. 4923192830Sed */ 4924192830Sed if (unspare) { 4925192830Sed spa_t *altspa = NULL; 4926192830Sed 4927192914Sed mutex_enter(&spa_namespace_lock); 4928192830Sed while ((altspa = spa_next(altspa)) != NULL) { 4929192830Sed if (altspa->spa_state != POOL_STATE_ACTIVE || 4930192914Sed altspa == spa) 4931192830Sed continue; 4932192830Sed 4933192830Sed spa_open_ref(altspa, FTAG); 4934192830Sed mutex_exit(&spa_namespace_lock); 4935192830Sed (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4936192830Sed mutex_enter(&spa_namespace_lock); 4937192830Sed spa_close(altspa, FTAG); 4938192830Sed } 4939192830Sed mutex_exit(&spa_namespace_lock); 4940192830Sed 4941192830Sed /* search the rest of the vdevs for spares to remove */ 4942192830Sed spa_vdev_resilver_done(spa); 4943192830Sed } 4944192830Sed 4945192830Sed /* all done with the spa; OK to release */ 4946192914Sed mutex_enter(&spa_namespace_lock); 4947192830Sed spa_close(spa, FTAG); 4948192830Sed mutex_exit(&spa_namespace_lock); 4949192830Sed 4950192830Sed return (error); 4951192830Sed} 4952192830Sed 4953192830Sed/* 4954192830Sed * Split a set of devices from their mirrors, and create a new pool from them. 
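 * The sequence below: quiesce and flush the intent log, verify that
 * every requested leaf is a healthy side of a top-level mirror, mark
 * those leaves offline, record the split GUID list in the pool
 * config while the operation is in flight, load the new pool from
 * the offlined disks, and finally detach them from the original
 * pool in a single txg.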
4955192830Sed */ 4956192830Sedint 4957192830Sedspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4958192830Sed nvlist_t *props, boolean_t exp) 4959192914Sed{ 4960192830Sed int error = 0; 4961192830Sed uint64_t txg, *glist; 4962192830Sed spa_t *newspa; 4963192830Sed uint_t c, children, lastlog; 4964192830Sed nvlist_t **child, *nvl, *tmp; 4965192830Sed dmu_tx_t *tx; 4966192830Sed char *altroot = NULL; 4967192830Sed vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4968192830Sed boolean_t activate_slog; 4969192914Sed 4970192830Sed ASSERT(spa_writeable(spa)); 4971192830Sed 4972192830Sed txg = spa_vdev_enter(spa); 4973192830Sed 4974192830Sed /* clear the log and flush everything up to now */ 4975192830Sed activate_slog = spa_passivate_log(spa); 4976192830Sed (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4977192830Sed error = spa_offline_log(spa); 4978192830Sed txg = spa_vdev_config_enter(spa); 4979192830Sed 4980192830Sed if (activate_slog) 4981192830Sed spa_activate_log(spa); 4982192830Sed 4983192830Sed if (error != 0) 4984192830Sed return (spa_vdev_exit(spa, NULL, txg, error)); 4985192830Sed 4986192830Sed /* check new spa name before going any further */ 4987192830Sed if (spa_lookup(newname) != NULL) 4988192830Sed return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4989192830Sed 4990192830Sed /* 4991192830Sed * scan through all the children to ensure they're all mirrors 4992192914Sed */ 4993192830Sed if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4994192830Sed nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4995192830Sed &children) != 0) 4996192830Sed return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4997192830Sed 4998192830Sed /* first, check to ensure we've got the right child count */ 4999192830Sed rvd = spa->spa_root_vdev; 5000192830Sed lastlog = 0; 5001192830Sed for (c = 0; c < rvd->vdev_children; c++) { 5002192830Sed vdev_t *vd = rvd->vdev_child[c]; 5003192830Sed 5004192830Sed /* don't count the holes & logs as children */ 5005192830Sed if (vd->vdev_islog || vd->vdev_ishole) { 5006192830Sed if (lastlog == 0) 5007192830Sed lastlog = c; 5008192830Sed continue; 5009192830Sed } 5010192830Sed 5011192830Sed lastlog = 0; 5012192830Sed } 5013192830Sed if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 5014192830Sed return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5015192830Sed 5016192830Sed /* next, ensure no spare or cache devices are part of the split */ 5017192830Sed if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5018192830Sed nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5019192830Sed return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5020192830Sed 5021192830Sed vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5022192830Sed glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5023192830Sed 5024192830Sed /* then, loop over each vdev and validate it */ 5025192830Sed for (c = 0; c < children; c++) { 5026192830Sed uint64_t is_hole = 0; 5027192830Sed 5028192830Sed (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5029192830Sed &is_hole); 5030192830Sed 5031192830Sed if (is_hole != 0) { 5032192830Sed if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5033192830Sed spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5034192830Sed continue; 5035192830Sed } else { 5036192830Sed error = SET_ERROR(EINVAL); 5037192830Sed break; 5038192830Sed } 5039192830Sed } 5040192830Sed 5041192830Sed /* which disk is going to be split? 
*/ 5042192830Sed if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5043192830Sed &glist[c]) != 0) { 5044192830Sed error = SET_ERROR(EINVAL); 5045192830Sed break; 5046192830Sed } 5047192830Sed 5048192830Sed /* look it up in the spa */ 5049192830Sed vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5050192830Sed if (vml[c] == NULL) { 5051192830Sed error = SET_ERROR(ENODEV); 5052192830Sed break; 5053192830Sed } 5054192830Sed 5055192830Sed /* make sure there's nothing stopping the split */ 5056192830Sed if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5057192830Sed vml[c]->vdev_islog || 5058192830Sed vml[c]->vdev_ishole || 5059192830Sed vml[c]->vdev_isspare || 5060192830Sed vml[c]->vdev_isl2cache || 5061192830Sed !vdev_writeable(vml[c]) || 5062192830Sed vml[c]->vdev_children != 0 || 5063192830Sed vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5064192830Sed c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5065192830Sed error = SET_ERROR(EINVAL); 5066192830Sed break; 5067192830Sed } 5068192830Sed 5069192830Sed if (vdev_dtl_required(vml[c])) { 5070192830Sed error = SET_ERROR(EBUSY); 5071192830Sed break; 5072192830Sed } 5073192830Sed 5074192830Sed /* we need certain info from the top level */ 5075192830Sed VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5076192830Sed vml[c]->vdev_top->vdev_ms_array) == 0); 5077192830Sed VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5078192830Sed vml[c]->vdev_top->vdev_ms_shift) == 0); 5079192830Sed VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5080192830Sed vml[c]->vdev_top->vdev_asize) == 0); 5081192830Sed VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5082196750Sache vml[c]->vdev_top->vdev_ashift) == 0); 5083192830Sed } 5084192856Sed 5085192830Sed if (error != 0) { 5086192830Sed kmem_free(vml, children * sizeof (vdev_t *)); 5087192830Sed kmem_free(glist, children * sizeof (uint64_t)); 5088192830Sed return (spa_vdev_exit(spa, NULL, txg, error)); 5089192830Sed } 5090192830Sed 5091192830Sed /* stop writers from using the disks */ 5092192830Sed for (c = 0; c < children; c++) { 5093192830Sed if (vml[c] != NULL) 5094192830Sed vml[c]->vdev_offline = B_TRUE; 5095192830Sed } 5096192830Sed vdev_reopen(spa->spa_root_vdev); 5097192830Sed 5098192830Sed /* 5099192830Sed * Temporarily record the splitting vdevs in the spa config. This 5100192830Sed * will disappear once the config is regenerated. 5101192830Sed */ 5102192830Sed VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5103192830Sed VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5104192830Sed glist, children) == 0); 5105192830Sed kmem_free(glist, children * sizeof (uint64_t)); 5106192830Sed 5107192830Sed mutex_enter(&spa->spa_props_lock); 5108192830Sed VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5109192830Sed nvl) == 0); 5110192830Sed mutex_exit(&spa->spa_props_lock); 5111192830Sed spa->spa_config_splitting = nvl; 5112192830Sed vdev_config_dirty(spa->spa_root_vdev); 5113192830Sed 5114192830Sed /* configure and create the new pool */ 5115192830Sed VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5116192830Sed VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5117192830Sed exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5118192830Sed VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5119192830Sed spa_version(spa)) == 0); 5120192830Sed VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5121192830Sed spa->spa_config_txg) == 0); 5122192830Sed VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5123192830Sed spa_generate_guid(NULL)) == 0); 5124192830Sed (void) nvlist_lookup_string(props, 5125192830Sed zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5126192830Sed 5127192830Sed /* add the new pool to the namespace */ 5128192830Sed newspa = spa_add(newname, config, altroot); 5129192830Sed newspa->spa_config_txg = spa->spa_config_txg; 5130192856Sed spa_set_log_state(newspa, SPA_LOG_CLEAR); 5131192830Sed 5132192830Sed /* release the spa config lock, retaining the namespace lock */ 5133192830Sed spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5134192830Sed 5135192830Sed if (zio_injection_enabled) 5136192830Sed zio_handle_panic_injection(spa, FTAG, 1); 5137192830Sed 5138192830Sed spa_activate(newspa, spa_mode_global); 5139192830Sed spa_async_suspend(newspa); 5140192830Sed 5141192830Sed#ifndef sun 5142192830Sed /* mark that we are creating new spa by splitting */ 5143192830Sed newspa->spa_splitting_newspa = B_TRUE; 5144192830Sed#endif 5145192830Sed /* create the new pool from the disks of the original pool */ 5146192830Sed error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5147192856Sed#ifndef sun 5148192830Sed newspa->spa_splitting_newspa = B_FALSE; 5149192830Sed#endif 5150192830Sed if (error) 5151192830Sed goto out; 5152192830Sed 5153192830Sed /* if that worked, generate a real config for the new pool */ 5154192830Sed if (newspa->spa_root_vdev != NULL) { 5155192830Sed VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5156192830Sed NV_UNIQUE_NAME, KM_SLEEP) == 0); 5157192830Sed VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5158192830Sed ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5159192830Sed spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5160192830Sed B_TRUE)); 5161192830Sed } 5162192830Sed 5163192830Sed /* set the props */ 5164192830Sed if (props != NULL) { 5165192830Sed spa_configfile_set(newspa, props, B_FALSE); 5166192830Sed error = spa_prop_set(newspa, props); 5167192830Sed if (error) 5168192830Sed goto out; 5169192830Sed } 5170192830Sed 5171192830Sed /* flush everything */ 5172192830Sed txg = spa_vdev_config_enter(newspa); 5173192830Sed vdev_config_dirty(newspa->spa_root_vdev); 5174192830Sed (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5175192830Sed 5176192830Sed if (zio_injection_enabled) 5177192830Sed zio_handle_panic_injection(spa, FTAG, 2); 5178192830Sed 5179192830Sed spa_async_resume(newspa); 5180192830Sed 5181192830Sed /* finally, update the original pool's config */ 5182192830Sed txg = spa_vdev_config_enter(spa); 5183192830Sed tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5184192830Sed error = dmu_tx_assign(tx, TXG_WAIT); 5185192830Sed if (error != 0) 5186192830Sed dmu_tx_abort(tx); 5187192830Sed for (c = 0; c < children; c++) { 5188192830Sed if (vml[c] != NULL) { 5189192914Sed vdev_split(vml[c]); 5190192830Sed if (error == 0) 5191192830Sed spa_history_log_internal(spa, "detach", tx, 5192192830Sed "vdev=%s", vml[c]->vdev_path); 5193192830Sed vdev_free(vml[c]); 5194192830Sed } 5195192830Sed } 5196192830Sed vdev_config_dirty(spa->spa_root_vdev); 5197192830Sed spa->spa_config_splitting = NULL; 5198192830Sed nvlist_free(nvl); 5199192830Sed if (error == 0) 5200192830Sed 
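 /*
 * Commit only if dmu_tx_assign() succeeded above; on failure the
 * tx was already aborted and no per-vdev "detach" history records
 * were logged.
 */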
dmu_tx_commit(tx); 5201192830Sed (void) spa_vdev_exit(spa, NULL, txg, 0); 5202192830Sed 5203192830Sed if (zio_injection_enabled) 5204192830Sed zio_handle_panic_injection(spa, FTAG, 3); 5205192830Sed 5206192830Sed /* split is complete; log a history record */ 5207192830Sed spa_history_log_internal(newspa, "split", NULL, 5208192830Sed "from pool %s", spa_name(spa)); 5209192830Sed 5210192830Sed kmem_free(vml, children * sizeof (vdev_t *)); 5211192830Sed 5212192830Sed /* if we're not going to mount the filesystems in userland, export */ 5213192830Sed if (exp) 5214192830Sed error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5215192830Sed B_FALSE, B_FALSE); 5216192830Sed 5217192830Sed return (error); 5218192830Sed 5219192830Sedout: 5220192830Sed spa_unload(newspa); 5221192830Sed spa_deactivate(newspa); 5222192830Sed spa_remove(newspa); 5223192830Sed 5224192830Sed txg = spa_vdev_config_enter(spa); 5225192830Sed 5226192830Sed /* re-online all offlined disks */ 5227192830Sed for (c = 0; c < children; c++) { 5228192830Sed if (vml[c] != NULL) 5229192830Sed vml[c]->vdev_offline = B_FALSE; 5230192830Sed } 5231192830Sed vdev_reopen(spa->spa_root_vdev); 5232192830Sed 5233192830Sed nvlist_free(spa->spa_config_splitting); 5234192830Sed spa->spa_config_splitting = NULL; 5235192830Sed (void) spa_vdev_exit(spa, NULL, txg, error); 5236192830Sed 5237192914Sed kmem_free(vml, children * sizeof (vdev_t *)); 5238192914Sed return (error); 5239192830Sed} 5240192830Sed 5241192830Sedstatic nvlist_t * 5242192830Sedspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5243192830Sed{ 5244192830Sed for (int i = 0; i < count; i++) { 5245192830Sed uint64_t guid; 5246192830Sed 5247192830Sed VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5248192830Sed &guid) == 0); 5249192830Sed 5250192830Sed if (guid == target_guid) 5251192830Sed return (nvpp[i]); 5252192830Sed } 5253192830Sed 5254192830Sed return (NULL); 5255192830Sed} 5256192830Sed 5257192914Sedstatic void 5258192914Sedspa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5259192914Sed nvlist_t *dev_to_remove) 5260192914Sed{ 5261192856Sed nvlist_t **newdev = NULL; 5262192830Sed 5263192830Sed if (count > 1) 5264192830Sed newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5265192830Sed 5266192830Sed for (int i = 0, j = 0; i < count; i++) { 5267192830Sed if (dev[i] == dev_to_remove) 5268192830Sed continue; 5269192830Sed VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5270192830Sed } 5271192830Sed 5272192830Sed VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5273192830Sed VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5274192830Sed 5275192830Sed for (int i = 0; i < count - 1; i++) 5276192830Sed nvlist_free(newdev[i]); 5277192830Sed 5278192830Sed if (count > 1) 5279192830Sed kmem_free(newdev, (count - 1) * sizeof (void *)); 5280192830Sed} 5281192830Sed 5282192830Sed/* 5283192830Sed * Evacuate the device. 5284192830Sed */ 5285192830Sedstatic int 5286192830Sedspa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5287192830Sed{ 5288192830Sed uint64_t txg; 5289192830Sed int error = 0; 5290192830Sed 5291192830Sed ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5292192830Sed ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5293192830Sed ASSERT(vd == vd->vdev_top); 5294192830Sed 5295192830Sed /* 5296192830Sed * Evacuate the device. 
We don't hold the config lock as writer 5297192830Sed * since we need to do I/O but we do keep the 5298192830Sed * spa_namespace_lock held. Once this completes the device 5299192830Sed * should no longer have any blocks allocated on it. 5300192830Sed */ 5301192830Sed if (vd->vdev_islog) { 5302192830Sed if (vd->vdev_stat.vs_alloc != 0) 5303192830Sed error = spa_offline_log(spa); 5304192830Sed } else { 5305192830Sed error = SET_ERROR(ENOTSUP); 5306192830Sed } 5307192830Sed 5308192830Sed if (error) 5309192830Sed return (error); 5310192830Sed 5311192830Sed /* 5312192830Sed * The evacuation succeeded. Remove any remaining MOS metadata 5313192830Sed * associated with this vdev, and wait for these changes to sync. 5314192830Sed */ 5315192830Sed ASSERT0(vd->vdev_stat.vs_alloc); 5316192830Sed txg = spa_vdev_config_enter(spa); 5317192830Sed vd->vdev_removing = B_TRUE; 5318192830Sed vdev_dirty(vd, 0, NULL, txg); 5319192830Sed vdev_config_dirty(vd); 5320192830Sed spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5321192830Sed 5322192830Sed return (0); 5323192830Sed} 5324192830Sed 5325192830Sed/* 5326192830Sed * Complete the removal by cleaning up the namespace. 5327192830Sed */ 5328192830Sedstatic void 5329192830Sedspa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5330192830Sed{ 5331192830Sed vdev_t *rvd = spa->spa_root_vdev; 5332192830Sed uint64_t id = vd->vdev_id; 5333192830Sed boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5334192830Sed 5335192830Sed ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5336192830Sed ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5337192830Sed ASSERT(vd == vd->vdev_top); 5338192830Sed 5339192830Sed /* 5340192830Sed * Only remove any devices which are empty. 5341192830Sed */ 5342192830Sed if (vd->vdev_stat.vs_alloc != 0) 5343192830Sed return; 5344192830Sed 5345192830Sed (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5346192830Sed 5347192830Sed if (list_link_active(&vd->vdev_state_dirty_node)) 5348192830Sed vdev_state_clean(vd); 5349 if (list_link_active(&vd->vdev_config_dirty_node)) 5350 vdev_config_clean(vd); 5351 5352 vdev_free(vd); 5353 5354 if (last_vdev) { 5355 vdev_compact_children(rvd); 5356 } else { 5357 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5358 vdev_add_child(rvd, vd); 5359 } 5360 vdev_config_dirty(rvd); 5361 5362 /* 5363 * Reassess the health of our root vdev. 5364 */ 5365 vdev_reopen(rvd); 5366} 5367 5368/* 5369 * Remove a device from the pool - 5370 * 5371 * Removing a device from the vdev namespace requires several steps 5372 * and can take a significant amount of time. As a result we use 5373 * the spa_vdev_config_[enter/exit] functions which allow us to 5374 * grab and release the spa_config_lock while still holding the namespace 5375 * lock. During each step the configuration is synced out. 5376 * 5377 * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5378 * devices. 
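 * Ordinary top-level data vdevs still fail with ENOTSUP (see the
 * bp-rewrite note below): without a way to rewrite the block
 * pointers that reference a device, its allocated space cannot be
 * evacuated.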
5379 */ 5380int 5381spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5382{ 5383 vdev_t *vd; 5384 metaslab_group_t *mg; 5385 nvlist_t **spares, **l2cache, *nv; 5386 uint64_t txg = 0; 5387 uint_t nspares, nl2cache; 5388 int error = 0; 5389 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5390 5391 ASSERT(spa_writeable(spa)); 5392 5393 if (!locked) 5394 txg = spa_vdev_enter(spa); 5395 5396 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5397 5398 if (spa->spa_spares.sav_vdevs != NULL && 5399 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5400 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5401 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5402 /* 5403 * Only remove the hot spare if it's not currently in use 5404 * in this pool. 5405 */ 5406 if (vd == NULL || unspare) { 5407 spa_vdev_remove_aux(spa->spa_spares.sav_config, 5408 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5409 spa_load_spares(spa); 5410 spa->spa_spares.sav_sync = B_TRUE; 5411 } else { 5412 error = SET_ERROR(EBUSY); 5413 } 5414 } else if (spa->spa_l2cache.sav_vdevs != NULL && 5415 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5416 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5417 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5418 /* 5419 * Cache devices can always be removed. 5420 */ 5421 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5422 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5423 spa_load_l2cache(spa); 5424 spa->spa_l2cache.sav_sync = B_TRUE; 5425 } else if (vd != NULL && vd->vdev_islog) { 5426 ASSERT(!locked); 5427 ASSERT(vd == vd->vdev_top); 5428 5429 /* 5430 * XXX - Once we have bp-rewrite this should 5431 * become the common case. 5432 */ 5433 5434 mg = vd->vdev_mg; 5435 5436 /* 5437 * Stop allocating from this vdev. 5438 */ 5439 metaslab_group_passivate(mg); 5440 5441 /* 5442 * Wait for the youngest allocations and frees to sync, 5443 * and then wait for the deferral of those frees to finish. 5444 */ 5445 spa_vdev_config_exit(spa, NULL, 5446 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5447 5448 /* 5449 * Attempt to evacuate the vdev. 5450 */ 5451 error = spa_vdev_remove_evacuate(spa, vd); 5452 5453 txg = spa_vdev_config_enter(spa); 5454 5455 /* 5456 * If we couldn't evacuate the vdev, unwind. 5457 */ 5458 if (error) { 5459 metaslab_group_activate(mg); 5460 return (spa_vdev_exit(spa, NULL, txg, error)); 5461 } 5462 5463 /* 5464 * Clean up the vdev namespace. 5465 */ 5466 spa_vdev_remove_from_namespace(spa, vd); 5467 5468 } else if (vd != NULL) { 5469 /* 5470 * Normal vdevs cannot be removed (yet). 5471 */ 5472 error = SET_ERROR(ENOTSUP); 5473 } else { 5474 /* 5475 * There is no vdev of any kind with the specified guid. 5476 */ 5477 error = SET_ERROR(ENOENT); 5478 } 5479 5480 if (!locked) 5481 return (spa_vdev_exit(spa, NULL, txg, error)); 5482 5483 return (error); 5484} 5485 5486/* 5487 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5488 * currently spared, so we can detach it. 5489 */ 5490static vdev_t * 5491spa_vdev_resilver_done_hunt(vdev_t *vd) 5492{ 5493 vdev_t *newvd, *oldvd; 5494 5495 for (int c = 0; c < vd->vdev_children; c++) { 5496 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5497 if (oldvd != NULL) 5498 return (oldvd); 5499 } 5500 5501 /* 5502 * Check for a completed replacement. We always consider the first 5503 * vdev in the list to be the oldest vdev, and the last one to be 5504 * the newest (see spa_vdev_attach() for how that works). 
In 5505 * the case where the newest vdev is faulted, we will not automatically 5506 * remove it after a resilver completes. This is OK as it will require 5507 * user intervention to determine which disk the admin wishes to keep. 5508 */ 5509 if (vd->vdev_ops == &vdev_replacing_ops) { 5510 ASSERT(vd->vdev_children > 1); 5511 5512 newvd = vd->vdev_child[vd->vdev_children - 1]; 5513 oldvd = vd->vdev_child[0]; 5514 5515 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5516 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5517 !vdev_dtl_required(oldvd)) 5518 return (oldvd); 5519 } 5520 5521 /* 5522 * Check for a completed resilver with the 'unspare' flag set. 5523 */ 5524 if (vd->vdev_ops == &vdev_spare_ops) { 5525 vdev_t *first = vd->vdev_child[0]; 5526 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5527 5528 if (last->vdev_unspare) { 5529 oldvd = first; 5530 newvd = last; 5531 } else if (first->vdev_unspare) { 5532 oldvd = last; 5533 newvd = first; 5534 } else { 5535 oldvd = NULL; 5536 } 5537 5538 if (oldvd != NULL && 5539 vdev_dtl_empty(newvd, DTL_MISSING) && 5540 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5541 !vdev_dtl_required(oldvd)) 5542 return (oldvd); 5543 5544 /* 5545 * If there are more than two spares attached to a disk, 5546 * and those spares are not required, then we want to 5547 * attempt to free them up now so that they can be used 5548 * by other pools. Once we're back down to a single 5549 * disk+spare, we stop removing them. 5550 */ 5551 if (vd->vdev_children > 2) { 5552 newvd = vd->vdev_child[1]; 5553 5554 if (newvd->vdev_isspare && last->vdev_isspare && 5555 vdev_dtl_empty(last, DTL_MISSING) && 5556 vdev_dtl_empty(last, DTL_OUTAGE) && 5557 !vdev_dtl_required(newvd)) 5558 return (newvd); 5559 } 5560 } 5561 5562 return (NULL); 5563} 5564 5565static void 5566spa_vdev_resilver_done(spa_t *spa) 5567{ 5568 vdev_t *vd, *pvd, *ppvd; 5569 uint64_t guid, sguid, pguid, ppguid; 5570 5571 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5572 5573 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5574 pvd = vd->vdev_parent; 5575 ppvd = pvd->vdev_parent; 5576 guid = vd->vdev_guid; 5577 pguid = pvd->vdev_guid; 5578 ppguid = ppvd->vdev_guid; 5579 sguid = 0; 5580 /* 5581 * If we have just finished replacing a hot spared device, then 5582 * we need to detach the parent's first child (the original hot 5583 * spare) as well. 5584 */ 5585 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5586 ppvd->vdev_children == 2) { 5587 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5588 sguid = ppvd->vdev_child[1]->vdev_guid; 5589 } 5590 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5591 5592 spa_config_exit(spa, SCL_ALL, FTAG); 5593 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5594 return; 5595 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5596 return; 5597 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5598 } 5599 5600 spa_config_exit(spa, SCL_ALL, FTAG); 5601} 5602 5603/* 5604 * Update the stored path or FRU for this vdev. 
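 * If the value actually changes, the vdev is handed back to
 * spa_vdev_state_exit() below so the updated config is synced out;
 * a no-op update returns without dirtying anything.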
5605 */ 5606int 5607spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5608 boolean_t ispath) 5609{ 5610 vdev_t *vd; 5611 boolean_t sync = B_FALSE; 5612 5613 ASSERT(spa_writeable(spa)); 5614 5615 spa_vdev_state_enter(spa, SCL_ALL); 5616 5617 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5618 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5619 5620 if (!vd->vdev_ops->vdev_op_leaf) 5621 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5622 5623 if (ispath) { 5624 if (strcmp(value, vd->vdev_path) != 0) { 5625 spa_strfree(vd->vdev_path); 5626 vd->vdev_path = spa_strdup(value); 5627 sync = B_TRUE; 5628 } 5629 } else { 5630 if (vd->vdev_fru == NULL) { 5631 vd->vdev_fru = spa_strdup(value); 5632 sync = B_TRUE; 5633 } else if (strcmp(value, vd->vdev_fru) != 0) { 5634 spa_strfree(vd->vdev_fru); 5635 vd->vdev_fru = spa_strdup(value); 5636 sync = B_TRUE; 5637 } 5638 } 5639 5640 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5641} 5642 5643int 5644spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5645{ 5646 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5647} 5648 5649int 5650spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5651{ 5652 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5653} 5654 5655/* 5656 * ========================================================================== 5657 * SPA Scanning 5658 * ========================================================================== 5659 */ 5660 5661int 5662spa_scan_stop(spa_t *spa) 5663{ 5664 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5665 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5666 return (SET_ERROR(EBUSY)); 5667 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5668} 5669 5670int 5671spa_scan(spa_t *spa, pool_scan_func_t func) 5672{ 5673 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5674 5675 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5676 return (SET_ERROR(ENOTSUP)); 5677 5678 /* 5679 * If a resilver was requested, but there is no DTL on a 5680 * writeable leaf device, we have nothing to do. 5681 */ 5682 if (func == POOL_SCAN_RESILVER && 5683 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5684 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5685 return (0); 5686 } 5687 5688 return (dsl_scan(spa->spa_dsl_pool, func)); 5689} 5690 5691/* 5692 * ========================================================================== 5693 * SPA async task processing 5694 * ========================================================================== 5695 */ 5696 5697static void 5698spa_async_remove(spa_t *spa, vdev_t *vd) 5699{ 5700 if (vd->vdev_remove_wanted) { 5701 vd->vdev_remove_wanted = B_FALSE; 5702 vd->vdev_delayed_close = B_FALSE; 5703 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5704 5705 /* 5706 * We want to clear the stats, but we don't want to do a full 5707 * vdev_clear() as that will cause us to throw away 5708 * degraded/faulted state as well as attempt to reopen the 5709 * device, all of which is a waste. 
5710 */ 5711 vd->vdev_stat.vs_read_errors = 0; 5712 vd->vdev_stat.vs_write_errors = 0; 5713 vd->vdev_stat.vs_checksum_errors = 0; 5714 5715 vdev_state_dirty(vd->vdev_top); 5716 } 5717 5718 for (int c = 0; c < vd->vdev_children; c++) 5719 spa_async_remove(spa, vd->vdev_child[c]); 5720} 5721 5722static void 5723spa_async_probe(spa_t *spa, vdev_t *vd) 5724{ 5725 if (vd->vdev_probe_wanted) { 5726 vd->vdev_probe_wanted = B_FALSE; 5727 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5728 } 5729 5730 for (int c = 0; c < vd->vdev_children; c++) 5731 spa_async_probe(spa, vd->vdev_child[c]); 5732} 5733 5734static void 5735spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5736{ 5737 sysevent_id_t eid; 5738 nvlist_t *attr; 5739 char *physpath; 5740 5741 if (!spa->spa_autoexpand) 5742 return; 5743 5744 for (int c = 0; c < vd->vdev_children; c++) { 5745 vdev_t *cvd = vd->vdev_child[c]; 5746 spa_async_autoexpand(spa, cvd); 5747 } 5748 5749 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5750 return; 5751 5752 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5753 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5754 5755 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5756 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5757 5758 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5759 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5760 5761 nvlist_free(attr); 5762 kmem_free(physpath, MAXPATHLEN); 5763} 5764 5765static void 5766spa_async_thread(void *arg) 5767{ 5768 spa_t *spa = arg; 5769 int tasks; 5770 5771 ASSERT(spa->spa_sync_on); 5772 5773 mutex_enter(&spa->spa_async_lock); 5774 tasks = spa->spa_async_tasks; 5775 spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 5776 mutex_exit(&spa->spa_async_lock); 5777 5778 /* 5779 * See if the config needs to be updated. 5780 */ 5781 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5782 uint64_t old_space, new_space; 5783 5784 mutex_enter(&spa_namespace_lock); 5785 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5786 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5787 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5788 mutex_exit(&spa_namespace_lock); 5789 5790 /* 5791 * If the pool grew as a result of the config update, 5792 * then log an internal history event. 5793 */ 5794 if (new_space != old_space) { 5795 spa_history_log_internal(spa, "vdev online", NULL, 5796 "pool '%s' size: %llu(+%llu)", 5797 spa_name(spa), new_space, new_space - old_space); 5798 } 5799 } 5800 5801 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5802 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5803 spa_async_autoexpand(spa, spa->spa_root_vdev); 5804 spa_config_exit(spa, SCL_CONFIG, FTAG); 5805 } 5806 5807 /* 5808 * See if any devices need to be probed. 5809 */ 5810 if (tasks & SPA_ASYNC_PROBE) { 5811 spa_vdev_state_enter(spa, SCL_NONE); 5812 spa_async_probe(spa, spa->spa_root_vdev); 5813 (void) spa_vdev_state_exit(spa, NULL, 0); 5814 } 5815 5816 /* 5817 * If any devices are done replacing, detach them. 5818 */ 5819 if (tasks & SPA_ASYNC_RESILVER_DONE) 5820 spa_vdev_resilver_done(spa); 5821 5822 /* 5823 * Kick off a resilver. 5824 */ 5825 if (tasks & SPA_ASYNC_RESILVER) 5826 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5827 5828 /* 5829 * Let the world know that we're done. 
 5830 */
 5831 mutex_enter(&spa->spa_async_lock);
 5832 spa->spa_async_thread = NULL;
 5833 cv_broadcast(&spa->spa_async_cv);
 5834 mutex_exit(&spa->spa_async_lock);
 5835 thread_exit();
 5836}
 5837
 5838static void
 5839spa_async_thread_vd(void *arg)
 5840{
 5841 spa_t *spa = arg;
 5842 int tasks;
 5843
 5844 ASSERT(spa->spa_sync_on);
 5845
 5846 mutex_enter(&spa->spa_async_lock);
 5847 tasks = spa->spa_async_tasks;
 5848retry:
 5849 spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
 5850 mutex_exit(&spa->spa_async_lock);
 5851
 5852 /*
 5853 * See if any devices need to be marked REMOVED.
 5854 */
 5855 if (tasks & SPA_ASYNC_REMOVE) {
 5856 spa_vdev_state_enter(spa, SCL_NONE);
 5857 spa_async_remove(spa, spa->spa_root_vdev);
 5858 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 5859 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
 5860 for (int i = 0; i < spa->spa_spares.sav_count; i++)
 5861 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
 5862 (void) spa_vdev_state_exit(spa, NULL, 0);
 5863 }
 5864
 5865 /*
 5866 * Let the world know that we're done.
 5867 */
 5868 mutex_enter(&spa->spa_async_lock);
 5869 tasks = spa->spa_async_tasks;
 5870 if ((tasks & SPA_ASYNC_REMOVE) != 0)
 5871 goto retry;
 5872 spa->spa_async_thread_vd = NULL;
 5873 cv_broadcast(&spa->spa_async_cv);
 5874 mutex_exit(&spa->spa_async_lock);
 5875 thread_exit();
 5876}
 5877
 5878void
 5879spa_async_suspend(spa_t *spa)
 5880{
 5881 mutex_enter(&spa->spa_async_lock);
 5882 spa->spa_async_suspended++;
 /*
 * Wait for BOTH worker threads to exit: either one may still be
 * running after the other has finished, so we must loop while
 * either pointer is non-NULL, not while both are.
 */
 5883 while (spa->spa_async_thread != NULL ||
 5884 spa->spa_async_thread_vd != NULL)
 5885 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 5886 mutex_exit(&spa->spa_async_lock);
 5887}
 5888
 5889void
 5890spa_async_resume(spa_t *spa)
 5891{
 5892 mutex_enter(&spa->spa_async_lock);
 5893 ASSERT(spa->spa_async_suspended != 0);
 5894 spa->spa_async_suspended--;
 5895 mutex_exit(&spa->spa_async_lock);
 5896}
 5897
 5898static boolean_t
 5899spa_async_tasks_pending(spa_t *spa)
 5900{
 5901 uint_t non_config_tasks;
 5902 uint_t config_task;
 5903 boolean_t config_task_suspended;
 5904
 5905 non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
 5906 SPA_ASYNC_REMOVE);
 5907 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
 5908 if (spa->spa_ccw_fail_time == 0) {
 5909 config_task_suspended = B_FALSE;
 5910 } else {
 5911 config_task_suspended =
 5912 (gethrtime() - spa->spa_ccw_fail_time) <
 5913 (zfs_ccw_retry_interval * NANOSEC);
 5914 }
 5915
 5916 return (non_config_tasks || (config_task && !config_task_suspended));
 5917}
 5918
 5919static void
 5920spa_async_dispatch(spa_t *spa)
 5921{
 5922 mutex_enter(&spa->spa_async_lock);
 5923 if (spa_async_tasks_pending(spa) &&
 5924 !spa->spa_async_suspended &&
 5925 spa->spa_async_thread == NULL &&
 5926 rootdir != NULL)
 5927 spa->spa_async_thread = thread_create(NULL, 0,
 5928 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
 5929 mutex_exit(&spa->spa_async_lock);
 5930}
 5931
 5932static void
 5933spa_async_dispatch_vd(spa_t *spa)
 5934{
 5935 mutex_enter(&spa->spa_async_lock);
 5936 if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
 5937 !spa->spa_async_suspended &&
 5938 spa->spa_async_thread_vd == NULL &&
 5939 rootdir != NULL)
 5940 spa->spa_async_thread_vd = thread_create(NULL, 0,
 5941 spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
 5942 mutex_exit(&spa->spa_async_lock);
 5943}
 5944
 5945void
 5946spa_async_request(spa_t *spa, int task)
 5947{
 5948 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
 5949 mutex_enter(&spa->spa_async_lock);
 5950 spa->spa_async_tasks |= task;
 5951 mutex_exit(&spa->spa_async_lock);
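 /*
 * Only the dedicated vdev-removal thread is kicked here; any other
 * requested tasks are picked up the next time spa_async_dispatch()
 * runs.
 */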
5952 spa_async_dispatch_vd(spa); 5953} 5954 5955/* 5956 * ========================================================================== 5957 * SPA syncing routines 5958 * ========================================================================== 5959 */ 5960 5961static int 5962bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5963{ 5964 bpobj_t *bpo = arg; 5965 bpobj_enqueue(bpo, bp, tx); 5966 return (0); 5967} 5968 5969static int 5970spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5971{ 5972 zio_t *zio = arg; 5973 5974 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5975 BP_GET_PSIZE(bp), zio->io_flags)); 5976 return (0); 5977} 5978 5979static void 5980spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5981{ 5982 char *packed = NULL; 5983 size_t bufsize; 5984 size_t nvsize = 0; 5985 dmu_buf_t *db; 5986 5987 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5988 5989 /* 5990 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5991 * information. This avoids the dbuf_will_dirty() path and 5992 * saves us a pre-read to get data we don't actually care about. 5993 */ 5994 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 5995 packed = kmem_alloc(bufsize, KM_SLEEP); 5996 5997 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5998 KM_SLEEP) == 0); 5999 bzero(packed + nvsize, bufsize - nvsize); 6000 6001 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6002 6003 kmem_free(packed, bufsize); 6004 6005 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6006 dmu_buf_will_dirty(db, tx); 6007 *(uint64_t *)db->db_data = nvsize; 6008 dmu_buf_rele(db, FTAG); 6009} 6010 6011static void 6012spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6013 const char *config, const char *entry) 6014{ 6015 nvlist_t *nvroot; 6016 nvlist_t **list; 6017 int i; 6018 6019 if (!sav->sav_sync) 6020 return; 6021 6022 /* 6023 * Update the MOS nvlist describing the list of available devices. 6024 * spa_validate_aux() will have already made sure this nvlist is 6025 * valid and the vdevs are labeled appropriately. 
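 * The list lives in a packed-nvlist object in the MOS, created on
 * first use below and linked into the pool directory ZAP under
 * 'entry' so it can be located again at import time.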
6026 */ 6027 if (sav->sav_object == 0) { 6028 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6029 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6030 sizeof (uint64_t), tx); 6031 VERIFY(zap_update(spa->spa_meta_objset, 6032 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6033 &sav->sav_object, tx) == 0); 6034 } 6035 6036 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6037 if (sav->sav_count == 0) { 6038 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6039 } else { 6040 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6041 for (i = 0; i < sav->sav_count; i++) 6042 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6043 B_FALSE, VDEV_CONFIG_L2CACHE); 6044 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6045 sav->sav_count) == 0); 6046 for (i = 0; i < sav->sav_count; i++) 6047 nvlist_free(list[i]); 6048 kmem_free(list, sav->sav_count * sizeof (void *)); 6049 } 6050 6051 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6052 nvlist_free(nvroot); 6053 6054 sav->sav_sync = B_FALSE; 6055} 6056 6057static void 6058spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6059{ 6060 nvlist_t *config; 6061 6062 if (list_is_empty(&spa->spa_config_dirty_list)) 6063 return; 6064 6065 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6066 6067 config = spa_config_generate(spa, spa->spa_root_vdev, 6068 dmu_tx_get_txg(tx), B_FALSE); 6069 6070 /* 6071 * If we're upgrading the spa version then make sure that 6072 * the config object gets updated with the correct version. 6073 */ 6074 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6075 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6076 spa->spa_uberblock.ub_version); 6077 6078 spa_config_exit(spa, SCL_STATE, FTAG); 6079 6080 if (spa->spa_config_syncing) 6081 nvlist_free(spa->spa_config_syncing); 6082 spa->spa_config_syncing = config; 6083 6084 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6085} 6086 6087static void 6088spa_sync_version(void *arg, dmu_tx_t *tx) 6089{ 6090 uint64_t *versionp = arg; 6091 uint64_t version = *versionp; 6092 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6093 6094 /* 6095 * Setting the version is special cased when first creating the pool. 6096 */ 6097 ASSERT(tx->tx_txg != TXG_INITIAL); 6098 6099 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6100 ASSERT(version >= spa_version(spa)); 6101 6102 spa->spa_uberblock.ub_version = version; 6103 vdev_config_dirty(spa->spa_root_vdev); 6104 spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6105} 6106 6107/* 6108 * Set zpool properties. 6109 */ 6110static void 6111spa_sync_props(void *arg, dmu_tx_t *tx) 6112{ 6113 nvlist_t *nvp = arg; 6114 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6115 objset_t *mos = spa->spa_meta_objset; 6116 nvpair_t *elem = NULL; 6117 6118 mutex_enter(&spa->spa_props_lock); 6119 6120 while ((elem = nvlist_next_nvpair(nvp, elem))) { 6121 uint64_t intval; 6122 char *strval, *fname; 6123 zpool_prop_t prop; 6124 const char *propname; 6125 zprop_type_t proptype; 6126 zfeature_info_t *feature; 6127 6128 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6129 case ZPROP_INVAL: 6130 /* 6131 * We checked this earlier in spa_prop_validate(). 
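 * ZPROP_INVAL is expected here only for "feature@..." properties;
 * the feature name is everything after the '@'.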
 6132 */
 6133 ASSERT(zpool_prop_feature(nvpair_name(elem)));
 6134
 6135 fname = strchr(nvpair_name(elem), '@') + 1;
 6136 VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
 6137
 6138 spa_feature_enable(spa, feature, tx);
 6139 spa_history_log_internal(spa, "set", tx,
 6140 "%s=enabled", nvpair_name(elem));
 6141 break;
 6142
 6143 case ZPOOL_PROP_VERSION:
 6144 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
 6145 /*
 6146 * The version is synced separately before other
 6147 * properties and should be correct by now.
 6148 */
 6149 ASSERT3U(spa_version(spa), >=, intval);
 6150 break;
 6151
 6152 case ZPOOL_PROP_ALTROOT:
 6153 /*
 6154 * 'altroot' is a non-persistent property. It should
 6155 * have been set temporarily at creation or import time.
 6156 */
 6157 ASSERT(spa->spa_root != NULL);
 6158 break;
 6159
 6160 case ZPOOL_PROP_READONLY:
 6161 case ZPOOL_PROP_CACHEFILE:
 6162 /*
 6163 * 'readonly' and 'cachefile' are also non-persistent
 6164 * properties.
 6165 */
 6166 break;
 6167 case ZPOOL_PROP_COMMENT:
 6168 VERIFY(nvpair_value_string(elem, &strval) == 0);
 6169 if (spa->spa_comment != NULL)
 6170 spa_strfree(spa->spa_comment);
 6171 spa->spa_comment = spa_strdup(strval);
 6172 /*
 6173 * We need to dirty the configuration on all the vdevs
 6174 * so that their labels get updated. It's unnecessary
 6175 * to do this for pool creation since the vdev's
 6176 * configuration has already been dirtied.
 6177 */
 6178 if (tx->tx_txg != TXG_INITIAL)
 6179 vdev_config_dirty(spa->spa_root_vdev);
 6180 spa_history_log_internal(spa, "set", tx,
 6181 "%s=%s", nvpair_name(elem), strval);
 6182 break;
 6183 default:
 6184 /*
 6185 * Set pool property values in the poolprops mos object.
 6186 */
 6187 if (spa->spa_pool_props_object == 0) {
 6188 spa->spa_pool_props_object =
 6189 zap_create_link(mos, DMU_OT_POOL_PROPS,
 6190 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
 6191 tx);
 6192 }
 6193
 6194 /* normalize the property name */
 6195 propname = zpool_prop_to_name(prop);
 6196 proptype = zpool_prop_get_type(prop);
 6197
 6198 if (nvpair_type(elem) == DATA_TYPE_STRING) {
 6199 ASSERT(proptype == PROP_TYPE_STRING);
 6200 VERIFY(nvpair_value_string(elem, &strval) == 0);
 6201 VERIFY(zap_update(mos,
 6202 spa->spa_pool_props_object, propname,
 6203 1, strlen(strval) + 1, strval, tx) == 0);
 6204 spa_history_log_internal(spa, "set", tx,
 6205 "%s=%s", nvpair_name(elem), strval);
 6206 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 6207 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
 6208
 6209 if (proptype == PROP_TYPE_INDEX) {
 6210 const char *unused;
 6211 VERIFY(zpool_prop_index_to_string(
 6212 prop, intval, &unused) == 0);
 6213 }
 6214 VERIFY(zap_update(mos,
 6215 spa->spa_pool_props_object, propname,
 6216 8, 1, &intval, tx) == 0);
 6217 spa_history_log_internal(spa, "set", tx,
 6218 "%s=%lld", nvpair_name(elem), intval);
 6219 } else {
 6220 ASSERT(0); /* not allowed */
 6221 }
 6222
 6223 switch (prop) {
 6224 case ZPOOL_PROP_DELEGATION:
 6225 spa->spa_delegation = intval;
 6226 break;
 6227 case ZPOOL_PROP_BOOTFS:
 6228 spa->spa_bootfs = intval;
 6229 break;
 6230 case ZPOOL_PROP_FAILUREMODE:
 6231 spa->spa_failmode = intval;
 6232 break;
 6233 case ZPOOL_PROP_AUTOEXPAND:
 6234 spa->spa_autoexpand = intval;
 6235 if (tx->tx_txg != TXG_INITIAL)
 6236 spa_async_request(spa,
 6237 SPA_ASYNC_AUTOEXPAND);
 6238 break;
 6239 case ZPOOL_PROP_DEDUPDITTO:
 6240 spa->spa_dedup_ditto = intval;
 6241 break;
 6242 default:
 6243 break;
 6244 }
 6245 }
 6246
 6247 }
 6248
 6249 mutex_exit(&spa->spa_props_lock);
 6250}
 6251
 6252/*
 6253 * Perform one-time upgrade on-disk changes. 

/*
 * Perform one-time upgrade on-disk changes.  spa_version() does not
 * reflect the new version this txg, so there must be no changes this
 * txg to anything that the upgrade code depends on after it executes.
 * Therefore this must be called after dsl_pool_sync() does the sync
 * tasks.
 */
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;

	ASSERT(spa->spa_sync_pass == 1);

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
		dsl_pool_upgrade_dir_clones(dp, tx);

		/* Keeping the freedir open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		spa_feature_create_zap_objects(spa, tx);
	}
	rrw_exit(&dp->dp_config_rwlock, FTAG);
}
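
/*
 * Illustrative note: each one-time upgrade above is guarded by the same
 * pattern, which fires exactly once, in the txg whose sync first crosses
 * the relevant version boundary:
 *
 *	if (spa->spa_ubsync.ub_version < SPA_VERSION_X &&
 *	    spa->spa_uberblock.ub_version >= SPA_VERSION_X)
 *		... perform the one-time upgrade ...
 *
 * where SPA_VERSION_X stands for any of the version constants above.
 */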

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;

	VERIFY(spa_writeable(spa));

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	tx = dmu_tx_create_assigned(dp, txg);

	spa->spa_sync_starttime = gethrtime();
#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
#else	/* FreeBSD */
#ifdef _KERNEL
	callout_reset(&spa->spa_deadman_cycid,
	    hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
#endif
#endif

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, or if someone is waiting
	 * for this txg to sync (e.g., spa_vdev_remove()), push the
	 * deferred frees from the previous txg.  If not, leave them
	 * alone so that we don't generate work on an otherwise idle
	 * system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
	    ((dsl_scan_active(dp->dp_scan) ||
	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
		zio_t *zio = zio_root(spa, NULL, NULL, 0);
		VERIFY3U(bpobj_iterate(defer_bpo,
		    spa_free_sync_cb, zio, tx), ==, 0);
		VERIFY0(zio_wait(zio));
	}

	/*
	 * Iterate to convergence.
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		if (pass < zfs_sync_pass_deferred_free) {
			zio_t *zio = zio_root(spa, NULL, NULL, 0);
			bplist_iterate(free_bpl, spa_free_sync_cb,
			    zio, tx);
			VERIFY(zio_wait(zio) == 0);
		} else {
			bplist_iterate(free_bpl, bpobj_enqueue_cb,
			    defer_bpo, tx);
		}

		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);

		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
		    txg)) != NULL)
			vdev_sync(vd, txg);

		if (pass == 1)
			spa_sync_upgrades(spa, tx);

	} while (dmu_objset_is_dirty(mos, txg));
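
	/*
	 * Illustrative note on the loop above: in passes 1 through
	 * zfs_sync_pass_deferred_free - 1, this txg's free list is freed
	 * immediately via spa_free_sync_cb(); later passes only enqueue
	 * the frees onto spa_deferred_bpobj, bounding the work done per
	 * pass so that the iteration can converge.
	 */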

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(svd, svdcount, txg,
				    B_TRUE);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(rvd->vdev_child,
				    rvd->vdev_children, txg, B_TRUE);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else	/* FreeBSD */
#ifdef _KERNEL
	callout_drain(&spa->spa_deadman_cycid);
#endif
#endif

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	spa->spa_sync_pass = 0;

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
	spa_async_dispatch_vd(spa);
}
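
/*
 * Illustrative sketch, not part of the build: spa_sync() is not called
 * directly but is driven by the txg machinery; conceptually the sync
 * thread performs one call per quiesced txg:
 *
 *	for (;;) {
 *		txg = ...;	(wait for the next quiesced txg)
 *		spa_sync(spa, txg);
 *	}
 *
 * See txg_sync_thread() in txg.c for the real loop.
 */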

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during
 * the sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE ||
		    !spa_writeable(spa) || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	ASSERT(spa_writeable(spa));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2: once as a spare and
 * once as a replacing vdev.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one
 * of the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
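
/*
 * Illustrative sketch, not part of the build: a typical notification uses
 * one of the ESC_ZFS_* names from sys/sysevent/eventdefs.h, e.g.:
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 */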