spa.c revision 168962
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23168404Spjd * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24168404Spjd * Use is subject to license terms. 25168404Spjd */ 26168404Spjd 27168404Spjd#pragma ident "%Z%%M% %I% %E% SMI" 28168404Spjd 29168404Spjd/* 30168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 31168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 32168404Spjd * pool. 
33168404Spjd */ 34168404Spjd 35168404Spjd#include <sys/zfs_context.h> 36168404Spjd#include <sys/fm/fs/zfs.h> 37168404Spjd#include <sys/spa_impl.h> 38168404Spjd#include <sys/zio.h> 39168404Spjd#include <sys/zio_checksum.h> 40168404Spjd#include <sys/zio_compress.h> 41168404Spjd#include <sys/dmu.h> 42168404Spjd#include <sys/dmu_tx.h> 43168404Spjd#include <sys/zap.h> 44168404Spjd#include <sys/zil.h> 45168404Spjd#include <sys/vdev_impl.h> 46168404Spjd#include <sys/metaslab.h> 47168404Spjd#include <sys/uberblock_impl.h> 48168404Spjd#include <sys/txg.h> 49168404Spjd#include <sys/avl.h> 50168404Spjd#include <sys/dmu_traverse.h> 51168404Spjd#include <sys/dmu_objset.h> 52168404Spjd#include <sys/unique.h> 53168404Spjd#include <sys/dsl_pool.h> 54168404Spjd#include <sys/dsl_dataset.h> 55168404Spjd#include <sys/dsl_dir.h> 56168404Spjd#include <sys/dsl_prop.h> 57168404Spjd#include <sys/dsl_synctask.h> 58168404Spjd#include <sys/fs/zfs.h> 59168404Spjd#include <sys/callb.h> 60168962Spjd#include <sys/sunddi.h> 61168404Spjd 62168712Spjdint zio_taskq_threads = 0; 63168712SpjdSYSCTL_DECL(_vfs_zfs); 64168712SpjdSYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); 65168712SpjdTUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads); 66168712SpjdSYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW, 67168712Spjd &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type"); 68168404Spjd 69168712Spjd 70168404Spjd/* 71168404Spjd * ========================================================================== 72168404Spjd * SPA state manipulation (open/create/destroy/import/export) 73168404Spjd * ========================================================================== 74168404Spjd */ 75168404Spjd 76168404Spjdstatic int 77168404Spjdspa_error_entry_compare(const void *a, const void *b) 78168404Spjd{ 79168404Spjd spa_error_entry_t *sa = (spa_error_entry_t *)a; 80168404Spjd spa_error_entry_t *sb = (spa_error_entry_t *)b; 81168404Spjd int ret; 82168404Spjd 83168404Spjd ret = 
bcmp(&sa->se_bookmark, &sb->se_bookmark, 84168404Spjd sizeof (zbookmark_t)); 85168404Spjd 86168404Spjd if (ret < 0) 87168404Spjd return (-1); 88168404Spjd else if (ret > 0) 89168404Spjd return (1); 90168404Spjd else 91168404Spjd return (0); 92168404Spjd} 93168404Spjd 94168404Spjd/* 95168404Spjd * Utility function which retrieves copies of the current logs and 96168404Spjd * re-initializes them in the process. 97168404Spjd */ 98168404Spjdvoid 99168404Spjdspa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 100168404Spjd{ 101168404Spjd ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 102168404Spjd 103168404Spjd bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 104168404Spjd bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 105168404Spjd 106168404Spjd avl_create(&spa->spa_errlist_scrub, 107168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 108168404Spjd offsetof(spa_error_entry_t, se_avl)); 109168404Spjd avl_create(&spa->spa_errlist_last, 110168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 111168404Spjd offsetof(spa_error_entry_t, se_avl)); 112168404Spjd} 113168404Spjd 114168404Spjd/* 115168404Spjd * Activate an uninitialized pool. 
 */
static void
spa_activate(spa_t *spa)
{
	int t;
	int nthreads = zio_taskq_threads;
	char name[32];

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	/* A tunable value of 0 means one ZIO thread per CPU. */
	if (nthreads == 0)
		nthreads = max_ncpus;
	/* Create one issue taskq and one interrupt taskq per ZIO type. */
	for (t = 0; t < ZIO_TYPES; t++) {
		snprintf(name, sizeof(name), "spa_zio_issue %d", t);
		spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads,
		    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
		snprintf(name, sizeof(name), "spa_zio_intr %d", t);
		spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads,
		    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().  Tears down the taskqs, locks, lists and
 * trees created there; the pool must already be unloaded (no sync thread,
 * no DSL pool, no vdev tree).
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	/* Destroy the per-ZIO-type taskqs created by spa_activate(). */
	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	rw_destroy(&spa->spa_traverse_lock);
	mutex_destroy(&spa->spa_uberblock_lock);
	mutex_destroy(&spa->spa_errlog_lock);
	mutex_destroy(&spa->spa_errlist_lock);
	mutex_destroy(&spa->spa_config_lock.scl_lock);
	cv_destroy(&spa->spa_config_lock.scl_cv);
	mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
	mutex_destroy(&spa->spa_history_lock);
	mutex_destroy(&spa->spa_props_lock);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	/* A leaf vdev has no children; nothing more to parse. */
	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	/* A non-leaf vdev with no child array is a malformed config. */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	/* Recursively construct each child; free the subtree on failure. */
	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/* Free the spare vdevs and the arrays/lists that track them. */
	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there is potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pool would think the spare
			 * is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

/*
 * Read a packed nvlist stored in the given object into *value; the
 * object's bonus buffer holds the packed size.  Returns 0 on success,
 * in which case the caller is responsible for freeing *value.
 */
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = ZFS_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	/* Refuse to import a pool whose guid is already in use. */
	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree. We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	error = vdev_open(rvd);
	if (error != 0)
		goto out;

	/*
	 * Validate the labels for all leaf vdevs. We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * If we were loading from a label-derived config, fetch the pool's
	 * own copy of the config from the MOS and restart the load with it
	 * (the recursive call below passes mosconfig == B_TRUE).
	 */
	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		/*
		 * hostid is set after the root file system is mounted, so
		 * ignore the check until it's done.
		 */
		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
		    &hostid) == 0 && root_mounted()) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
			if ((unsigned long)hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa->spa_name, hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation). If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log. If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object. If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		/* Best-effort: the bootfs property may simply be absent. */
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZFS_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asychronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	/* EBADF (hostid mismatch) is already reported via cmn_err above. */
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time open the pool, without having to keep around the spa_t in some
 * ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it indicates that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Standard pool open; no config nvlist is returned to the caller.
 */
int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

/*
 * Drop the inject reference taken by spa_inject_addref().
 */
void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Add the list of hot spares from spa_sparelist to the pool config nvlist,
 * marking any spare that is currently in use by another pool as SPARED.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_nspares == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

/*
 * Open the pool and return its config (with error count and spare list
 * added), plus the alternate root (if any) in 'altroot'.  The config and
 * altroot are returned even for pools that fail to open.
 */
int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the 'spares' array is well formed.  We must have an array of
 * nvlists, each which describes a valid leaf vdev.  If this is an import (mode
 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long
 * as they are well-formed.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	nvlist_t **spares;
	uint_t i, nspares;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no spares specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	if (nspares == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports hot
	 * spares.
	 */
	if (spa_version(spa) < ZFS_VERSION_SPARES)
		return (ENOTSUP);

	/*
	 * Set the pending spare list so we correctly handle device in-use
	 * checking.
	 */
	spa->spa_pending_spares = spares;
	spa->spa_pending_nspares = nspares;

	for (i = 0; i < nspares; i++) {
		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg,
		    VDEV_LABEL_SPARE)) == 0) {
			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		/* On import, a broken spare is tolerated (see above). */
		if (error && mode != VDEV_ALLOC_SPARE)
			goto out;
		else
			error = 0;
	}

out:
	spa->spa_pending_spares = NULL;
	spa->spa_pending_nspares = 0;
	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares;
	uint_t nspares;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = ZFS_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && rvd->vdev_children == 0)
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools are always deflated. */
	spa->spa_deflate = TRUE;
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add deflate");
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	spa_history_create_obj(spa, tx);

	dmu_tx_commit(tx);

	spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t nspares;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	spa_config_enter(spa, RW_WRITER, FTAG);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity anymore,
	 * and conflicts with spa_has_spare().
	 */
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
		spa_load_spares(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_spares(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Override any spares as specified by the user, as these may have
	 * correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_sparelist)
			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * Add the list of hot spares.
		 */
		spa_add_spares(spa, config);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}


/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares;
	uint_t i, nspares;

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		nspares = 0;

	if (vd->vdev_children == 0 && nspares == 0) {
		spa->spa_pending_vdev = NULL;
		return (spa_vdev_exit(spa, vd, txg, EINVAL));
	}

	if (vd->vdev_children != 0) {
		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
			spa->spa_pending_vdev = NULL;
			return (spa_vdev_exit(spa, vd, txg, error));
		}
	}

	/*
	 * We must validate the spares after checking the children.  Otherwise,
	 * vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) != 0) {
		spa->spa_pending_vdev = NULL;
		return (spa_vdev_exit(spa, vd, txg, error));
	}

	spa->spa_pending_vdev = NULL;

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		if (spa->spa_sparelist != NULL) {
			nvlist_t **oldspares;
			uint_t oldnspares;
			nvlist_t **newspares;

			/* Merge the new spares into the existing sparelist. */
			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);

			newspares = kmem_alloc(sizeof (void *) *
			    (nspares + oldnspares), KM_SLEEP);
			for (i = 0; i < oldnspares; i++)
				VERIFY(nvlist_dup(oldspares[i],
				    &newspares[i], KM_SLEEP) == 0);
			for (i = 0; i < nspares; i++)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[i + oldnspares],
				    KM_SLEEP) == 0);

			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);

			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, newspares,
			    nspares + oldnspares) == 0);
			for (i = 0; i < oldnspares + nspares; i++)
				nvlist_free(newspares[i]);
			kmem_free(newspares, (oldnspares + nspares) *
			    sizeof (void *));
		} else {
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		}

		spa_load_spares(spa);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
1676168404Spjd * 1677168404Spjd * If 'replacing' is specified, the new device is intended to replace the 1678168404Spjd * existing device; in this case the two devices are made into their own 1679168404Spjd * mirror using the 'replacing' vdev, which is functionally idendical to 1680168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 1681168404Spjd * extra rules: you can't attach to it after it's been created, and upon 1682168404Spjd * completion of resilvering, the first disk (the one being replaced) 1683168404Spjd * is automatically detached. 1684168404Spjd */ 1685168404Spjdint 1686168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1687168404Spjd{ 1688168404Spjd uint64_t txg, open_txg; 1689168404Spjd int error; 1690168404Spjd vdev_t *rvd = spa->spa_root_vdev; 1691168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1692168404Spjd vdev_ops_t *pvops; 1693168404Spjd 1694168404Spjd txg = spa_vdev_enter(spa); 1695168404Spjd 1696168404Spjd oldvd = vdev_lookup_by_guid(rvd, guid); 1697168404Spjd 1698168404Spjd if (oldvd == NULL) 1699168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1700168404Spjd 1701168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 1702168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1703168404Spjd 1704168404Spjd pvd = oldvd->vdev_parent; 1705168404Spjd 1706168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1707168404Spjd VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1708168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1709168404Spjd 1710168404Spjd newvd = newrootvd->vdev_child[0]; 1711168404Spjd 1712168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 1713168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1714168404Spjd 1715168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1716168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 1717168404Spjd 1718168404Spjd if (!replacing) { 
1719168404Spjd /* 1720168404Spjd * For attach, the only allowable parent is a mirror or the root 1721168404Spjd * vdev. 1722168404Spjd */ 1723168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 1724168404Spjd pvd->vdev_ops != &vdev_root_ops) 1725168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1726168404Spjd 1727168404Spjd pvops = &vdev_mirror_ops; 1728168404Spjd } else { 1729168404Spjd /* 1730168404Spjd * Active hot spares can only be replaced by inactive hot 1731168404Spjd * spares. 1732168404Spjd */ 1733168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 1734168404Spjd pvd->vdev_child[1] == oldvd && 1735168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 1736168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1737168404Spjd 1738168404Spjd /* 1739168404Spjd * If the source is a hot spare, and the parent isn't already a 1740168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 1741168404Spjd * want to create a replacing vdev. The user is not allowed to 1742168404Spjd * attach to a spared vdev child unless the 'isspare' state is 1743168404Spjd * the same (spare replaces spare, non-spare replaces 1744168404Spjd * non-spare). 1745168404Spjd */ 1746168404Spjd if (pvd->vdev_ops == &vdev_replacing_ops) 1747168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1748168404Spjd else if (pvd->vdev_ops == &vdev_spare_ops && 1749168404Spjd newvd->vdev_isspare != oldvd->vdev_isspare) 1750168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1751168404Spjd else if (pvd->vdev_ops != &vdev_spare_ops && 1752168404Spjd newvd->vdev_isspare) 1753168404Spjd pvops = &vdev_spare_ops; 1754168404Spjd else 1755168404Spjd pvops = &vdev_replacing_ops; 1756168404Spjd } 1757168404Spjd 1758168404Spjd /* 1759168404Spjd * Compare the new device size with the replaceable/attachable 1760168404Spjd * device size. 
1761168404Spjd */ 1762168404Spjd if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1763168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1764168404Spjd 1765168404Spjd /* 1766168404Spjd * The new device cannot have a higher alignment requirement 1767168404Spjd * than the top-level vdev. 1768168404Spjd */ 1769168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1770168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1771168404Spjd 1772168404Spjd /* 1773168404Spjd * If this is an in-place replacement, update oldvd's path and devid 1774168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 1775168404Spjd */ 1776168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1777168404Spjd spa_strfree(oldvd->vdev_path); 1778168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1779168404Spjd KM_SLEEP); 1780168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 1781168404Spjd newvd->vdev_path, "old"); 1782168404Spjd if (oldvd->vdev_devid != NULL) { 1783168404Spjd spa_strfree(oldvd->vdev_devid); 1784168404Spjd oldvd->vdev_devid = NULL; 1785168404Spjd } 1786168404Spjd } 1787168404Spjd 1788168404Spjd /* 1789168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 1790168404Spjd * mirror/replacing/spare vdev above oldvd. 1791168404Spjd */ 1792168404Spjd if (pvd->vdev_ops != pvops) 1793168404Spjd pvd = vdev_add_parent(oldvd, pvops); 1794168404Spjd 1795168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 1796168404Spjd ASSERT(pvd->vdev_ops == pvops); 1797168404Spjd ASSERT(oldvd->vdev_parent == pvd); 1798168404Spjd 1799168404Spjd /* 1800168404Spjd * Extract the new device from its root and add it to pvd. 
1801168404Spjd */ 1802168404Spjd vdev_remove_child(newrootvd, newvd); 1803168404Spjd newvd->vdev_id = pvd->vdev_children; 1804168404Spjd vdev_add_child(pvd, newvd); 1805168404Spjd 1806168404Spjd /* 1807168404Spjd * If newvd is smaller than oldvd, but larger than its rsize, 1808168404Spjd * the addition of newvd may have decreased our parent's asize. 1809168404Spjd */ 1810168404Spjd pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1811168404Spjd 1812168404Spjd tvd = newvd->vdev_top; 1813168404Spjd ASSERT(pvd->vdev_top == tvd); 1814168404Spjd ASSERT(tvd->vdev_parent == rvd); 1815168404Spjd 1816168404Spjd vdev_config_dirty(tvd); 1817168404Spjd 1818168404Spjd /* 1819168404Spjd * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1820168404Spjd * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1821168404Spjd */ 1822168404Spjd open_txg = txg + TXG_CONCURRENT_STATES - 1; 1823168404Spjd 1824168404Spjd mutex_enter(&newvd->vdev_dtl_lock); 1825168404Spjd space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1826168404Spjd open_txg - TXG_INITIAL + 1); 1827168404Spjd mutex_exit(&newvd->vdev_dtl_lock); 1828168404Spjd 1829168404Spjd if (newvd->vdev_isspare) 1830168404Spjd spa_spare_activate(newvd); 1831168404Spjd 1832168404Spjd /* 1833168404Spjd * Mark newvd's DTL dirty in this txg. 1834168404Spjd */ 1835168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 1836168404Spjd 1837168404Spjd (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1838168404Spjd 1839168404Spjd /* 1840168404Spjd * Kick off a resilver to update newvd. 1841168404Spjd */ 1842168404Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1843168404Spjd 1844168404Spjd return (0); 1845168404Spjd} 1846168404Spjd 1847168404Spjd/* 1848168404Spjd * Detach a device from a mirror or replacing vdev. 1849168404Spjd * If 'replace_done' is specified, only detach if the parent 1850168404Spjd * is a replacing vdev. 
1851168404Spjd */ 1852168404Spjdint 1853168404Spjdspa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1854168404Spjd{ 1855168404Spjd uint64_t txg; 1856168404Spjd int c, t, error; 1857168404Spjd vdev_t *rvd = spa->spa_root_vdev; 1858168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 1859168404Spjd boolean_t unspare = B_FALSE; 1860168404Spjd uint64_t unspare_guid; 1861168404Spjd 1862168404Spjd txg = spa_vdev_enter(spa); 1863168404Spjd 1864168404Spjd vd = vdev_lookup_by_guid(rvd, guid); 1865168404Spjd 1866168404Spjd if (vd == NULL) 1867168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1868168404Spjd 1869168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 1870168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1871168404Spjd 1872168404Spjd pvd = vd->vdev_parent; 1873168404Spjd 1874168404Spjd /* 1875168404Spjd * If replace_done is specified, only remove this device if it's 1876168404Spjd * the first child of a replacing vdev. For the 'spare' vdev, either 1877168404Spjd * disk can be removed. 1878168404Spjd */ 1879168404Spjd if (replace_done) { 1880168404Spjd if (pvd->vdev_ops == &vdev_replacing_ops) { 1881168404Spjd if (vd->vdev_id != 0) 1882168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1883168404Spjd } else if (pvd->vdev_ops != &vdev_spare_ops) { 1884168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1885168404Spjd } 1886168404Spjd } 1887168404Spjd 1888168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1889168404Spjd spa_version(spa) >= ZFS_VERSION_SPARES); 1890168404Spjd 1891168404Spjd /* 1892168404Spjd * Only mirror, replacing, and spare vdevs support detach. 1893168404Spjd */ 1894168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 1895168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 1896168404Spjd pvd->vdev_ops != &vdev_spare_ops) 1897168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1898168404Spjd 1899168404Spjd /* 1900168404Spjd * If there's only one replica, you can't detach it. 
1901168404Spjd */ 1902168404Spjd if (pvd->vdev_children <= 1) 1903168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1904168404Spjd 1905168404Spjd /* 1906168404Spjd * If all siblings have non-empty DTLs, this device may have the only 1907168404Spjd * valid copy of the data, which means we cannot safely detach it. 1908168404Spjd * 1909168404Spjd * XXX -- as in the vdev_offline() case, we really want a more 1910168404Spjd * precise DTL check. 1911168404Spjd */ 1912168404Spjd for (c = 0; c < pvd->vdev_children; c++) { 1913168404Spjd uint64_t dirty; 1914168404Spjd 1915168404Spjd cvd = pvd->vdev_child[c]; 1916168404Spjd if (cvd == vd) 1917168404Spjd continue; 1918168404Spjd if (vdev_is_dead(cvd)) 1919168404Spjd continue; 1920168404Spjd mutex_enter(&cvd->vdev_dtl_lock); 1921168404Spjd dirty = cvd->vdev_dtl_map.sm_space | 1922168404Spjd cvd->vdev_dtl_scrub.sm_space; 1923168404Spjd mutex_exit(&cvd->vdev_dtl_lock); 1924168404Spjd if (!dirty) 1925168404Spjd break; 1926168404Spjd } 1927168404Spjd 1928168404Spjd /* 1929168404Spjd * If we are a replacing or spare vdev, then we can always detach the 1930168404Spjd * latter child, as that is how one cancels the operation. 1931168404Spjd */ 1932168404Spjd if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1933168404Spjd c == pvd->vdev_children) 1934168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1935168404Spjd 1936168404Spjd /* 1937168404Spjd * If we are detaching the original disk from a spare, then it implies 1938168404Spjd * that the spare should become a real disk, and be removed from the 1939168404Spjd * active spare list for the pool. 1940168404Spjd */ 1941168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 1942168404Spjd vd->vdev_id == 0) 1943168404Spjd unspare = B_TRUE; 1944168404Spjd 1945168404Spjd /* 1946168404Spjd * Erase the disk labels so the disk can be used for other things. 
1947168404Spjd * This must be done after all other error cases are handled, 1948168404Spjd * but before we disembowel vd (so we can still do I/O to it). 1949168404Spjd * But if we can't do it, don't treat the error as fatal -- 1950168404Spjd * it may be that the unwritability of the disk is the reason 1951168404Spjd * it's being detached! 1952168404Spjd */ 1953168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1954168404Spjd 1955168404Spjd /* 1956168404Spjd * Remove vd from its parent and compact the parent's children. 1957168404Spjd */ 1958168404Spjd vdev_remove_child(pvd, vd); 1959168404Spjd vdev_compact_children(pvd); 1960168404Spjd 1961168404Spjd /* 1962168404Spjd * Remember one of the remaining children so we can get tvd below. 1963168404Spjd */ 1964168404Spjd cvd = pvd->vdev_child[0]; 1965168404Spjd 1966168404Spjd /* 1967168404Spjd * If we need to remove the remaining child from the list of hot spares, 1968168404Spjd * do it now, marking the vdev as no longer a spare in the process. We 1969168404Spjd * must do this before vdev_remove_parent(), because that can change the 1970168404Spjd * GUID if it creates a new toplevel GUID. 1971168404Spjd */ 1972168404Spjd if (unspare) { 1973168404Spjd ASSERT(cvd->vdev_isspare); 1974168404Spjd spa_spare_remove(cvd); 1975168404Spjd unspare_guid = cvd->vdev_guid; 1976168404Spjd } 1977168404Spjd 1978168404Spjd /* 1979168404Spjd * If the parent mirror/replacing vdev only has one child, 1980168404Spjd * the parent is no longer needed. Remove it from the tree. 1981168404Spjd */ 1982168404Spjd if (pvd->vdev_children == 1) 1983168404Spjd vdev_remove_parent(cvd); 1984168404Spjd 1985168404Spjd /* 1986168404Spjd * We don't set tvd until now because the parent we just removed 1987168404Spjd * may have been the previous top-level vdev. 1988168404Spjd */ 1989168404Spjd tvd = cvd->vdev_top; 1990168404Spjd ASSERT(tvd->vdev_parent == rvd); 1991168404Spjd 1992168404Spjd /* 1993168404Spjd * Reevaluate the parent vdev state. 
1994168404Spjd */ 1995168404Spjd vdev_propagate_state(cvd->vdev_parent); 1996168404Spjd 1997168404Spjd /* 1998168404Spjd * If the device we just detached was smaller than the others, it may be 1999168404Spjd * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 2000168404Spjd * can't fail because the existing metaslabs are already in core, so 2001168404Spjd * there's nothing to read from disk. 2002168404Spjd */ 2003168404Spjd VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2004168404Spjd 2005168404Spjd vdev_config_dirty(tvd); 2006168404Spjd 2007168404Spjd /* 2008168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 2009168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 2010168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 2011168404Spjd * prevent vd from being accessed after it's freed. 2012168404Spjd */ 2013168404Spjd for (t = 0; t < TXG_SIZE; t++) 2014168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 2015168404Spjd vd->vdev_detached = B_TRUE; 2016168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 2017168404Spjd 2018168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 2019168404Spjd 2020168404Spjd /* 2021168404Spjd * If this was the removal of the original device in a hot spare vdev, 2022168404Spjd * then we want to go through and remove the device from the hot spare 2023168404Spjd * list of every other pool. 
2024168404Spjd */ 2025168404Spjd if (unspare) { 2026168404Spjd spa = NULL; 2027168404Spjd mutex_enter(&spa_namespace_lock); 2028168404Spjd while ((spa = spa_next(spa)) != NULL) { 2029168404Spjd if (spa->spa_state != POOL_STATE_ACTIVE) 2030168404Spjd continue; 2031168404Spjd 2032168404Spjd (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2033168404Spjd } 2034168404Spjd mutex_exit(&spa_namespace_lock); 2035168404Spjd } 2036168404Spjd 2037168404Spjd return (error); 2038168404Spjd} 2039168404Spjd 2040168404Spjd/* 2041168404Spjd * Remove a device from the pool. Currently, this supports removing only hot 2042168404Spjd * spares. 2043168404Spjd */ 2044168404Spjdint 2045168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2046168404Spjd{ 2047168404Spjd vdev_t *vd; 2048168404Spjd nvlist_t **spares, *nv, **newspares; 2049168404Spjd uint_t i, j, nspares; 2050168404Spjd int ret = 0; 2051168404Spjd 2052168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 2053168404Spjd 2054168404Spjd vd = spa_lookup_by_guid(spa, guid); 2055168404Spjd 2056168404Spjd nv = NULL; 2057168404Spjd if (spa->spa_spares != NULL && 2058168404Spjd nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2059168404Spjd &spares, &nspares) == 0) { 2060168404Spjd for (i = 0; i < nspares; i++) { 2061168404Spjd uint64_t theguid; 2062168404Spjd 2063168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 2064168404Spjd ZPOOL_CONFIG_GUID, &theguid) == 0); 2065168404Spjd if (theguid == guid) { 2066168404Spjd nv = spares[i]; 2067168404Spjd break; 2068168404Spjd } 2069168404Spjd } 2070168404Spjd } 2071168404Spjd 2072168404Spjd /* 2073168404Spjd * We only support removing a hot spare, and only if it's not currently 2074168404Spjd * in use in this pool. 
2075168404Spjd */ 2076168404Spjd if (nv == NULL && vd == NULL) { 2077168404Spjd ret = ENOENT; 2078168404Spjd goto out; 2079168404Spjd } 2080168404Spjd 2081168404Spjd if (nv == NULL && vd != NULL) { 2082168404Spjd ret = ENOTSUP; 2083168404Spjd goto out; 2084168404Spjd } 2085168404Spjd 2086168404Spjd if (!unspare && nv != NULL && vd != NULL) { 2087168404Spjd ret = EBUSY; 2088168404Spjd goto out; 2089168404Spjd } 2090168404Spjd 2091168404Spjd if (nspares == 1) { 2092168404Spjd newspares = NULL; 2093168404Spjd } else { 2094168404Spjd newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2095168404Spjd KM_SLEEP); 2096168404Spjd for (i = 0, j = 0; i < nspares; i++) { 2097168404Spjd if (spares[i] != nv) 2098168404Spjd VERIFY(nvlist_dup(spares[i], 2099168404Spjd &newspares[j++], KM_SLEEP) == 0); 2100168404Spjd } 2101168404Spjd } 2102168404Spjd 2103168404Spjd VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2104168404Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 2105168404Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2106168404Spjd newspares, nspares - 1) == 0); 2107168404Spjd for (i = 0; i < nspares - 1; i++) 2108168404Spjd nvlist_free(newspares[i]); 2109168404Spjd kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2110168404Spjd spa_load_spares(spa); 2111168404Spjd spa->spa_sync_spares = B_TRUE; 2112168404Spjd 2113168404Spjdout: 2114168404Spjd spa_config_exit(spa, FTAG); 2115168404Spjd 2116168404Spjd return (ret); 2117168404Spjd} 2118168404Spjd 2119168404Spjd/* 2120168404Spjd * Find any device that's done replacing, so we can detach it. 
2121168404Spjd */ 2122168404Spjdstatic vdev_t * 2123168404Spjdspa_vdev_replace_done_hunt(vdev_t *vd) 2124168404Spjd{ 2125168404Spjd vdev_t *newvd, *oldvd; 2126168404Spjd int c; 2127168404Spjd 2128168404Spjd for (c = 0; c < vd->vdev_children; c++) { 2129168404Spjd oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 2130168404Spjd if (oldvd != NULL) 2131168404Spjd return (oldvd); 2132168404Spjd } 2133168404Spjd 2134168404Spjd if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2135168404Spjd oldvd = vd->vdev_child[0]; 2136168404Spjd newvd = vd->vdev_child[1]; 2137168404Spjd 2138168404Spjd mutex_enter(&newvd->vdev_dtl_lock); 2139168404Spjd if (newvd->vdev_dtl_map.sm_space == 0 && 2140168404Spjd newvd->vdev_dtl_scrub.sm_space == 0) { 2141168404Spjd mutex_exit(&newvd->vdev_dtl_lock); 2142168404Spjd return (oldvd); 2143168404Spjd } 2144168404Spjd mutex_exit(&newvd->vdev_dtl_lock); 2145168404Spjd } 2146168404Spjd 2147168404Spjd return (NULL); 2148168404Spjd} 2149168404Spjd 2150168404Spjdstatic void 2151168404Spjdspa_vdev_replace_done(spa_t *spa) 2152168404Spjd{ 2153168404Spjd vdev_t *vd; 2154168404Spjd vdev_t *pvd; 2155168404Spjd uint64_t guid; 2156168404Spjd uint64_t pguid = 0; 2157168404Spjd 2158168404Spjd spa_config_enter(spa, RW_READER, FTAG); 2159168404Spjd 2160168404Spjd while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2161168404Spjd guid = vd->vdev_guid; 2162168404Spjd /* 2163168404Spjd * If we have just finished replacing a hot spared device, then 2164168404Spjd * we need to detach the parent's first child (the original hot 2165168404Spjd * spare) as well. 
2166168404Spjd */ 2167168404Spjd pvd = vd->vdev_parent; 2168168404Spjd if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2169168404Spjd pvd->vdev_id == 0) { 2170168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2171168404Spjd ASSERT(pvd->vdev_parent->vdev_children == 2); 2172168404Spjd pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2173168404Spjd } 2174168404Spjd spa_config_exit(spa, FTAG); 2175168404Spjd if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2176168404Spjd return; 2177168404Spjd if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2178168404Spjd return; 2179168404Spjd spa_config_enter(spa, RW_READER, FTAG); 2180168404Spjd } 2181168404Spjd 2182168404Spjd spa_config_exit(spa, FTAG); 2183168404Spjd} 2184168404Spjd 2185168404Spjd/* 2186168404Spjd * Update the stored path for this vdev. Dirty the vdev configuration, relying 2187168404Spjd * on spa_vdev_enter/exit() to synchronize the labels and cache. 2188168404Spjd */ 2189168404Spjdint 2190168404Spjdspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2191168404Spjd{ 2192168404Spjd vdev_t *rvd, *vd; 2193168404Spjd uint64_t txg; 2194168404Spjd 2195168404Spjd rvd = spa->spa_root_vdev; 2196168404Spjd 2197168404Spjd txg = spa_vdev_enter(spa); 2198168404Spjd 2199168404Spjd if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2200168404Spjd /* 2201168404Spjd * Determine if this is a reference to a hot spare. In that 2202168404Spjd * case, update the path as stored in the spare list. 
2203168404Spjd */ 2204168404Spjd nvlist_t **spares; 2205168404Spjd uint_t i, nspares; 2206168404Spjd if (spa->spa_sparelist != NULL) { 2207168404Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2208168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2209168404Spjd for (i = 0; i < nspares; i++) { 2210168404Spjd uint64_t theguid; 2211168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 2212168404Spjd ZPOOL_CONFIG_GUID, &theguid) == 0); 2213168404Spjd if (theguid == guid) 2214168404Spjd break; 2215168404Spjd } 2216168404Spjd 2217168404Spjd if (i == nspares) 2218168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2219168404Spjd 2220168404Spjd VERIFY(nvlist_add_string(spares[i], 2221168404Spjd ZPOOL_CONFIG_PATH, newpath) == 0); 2222168404Spjd spa_load_spares(spa); 2223168404Spjd spa->spa_sync_spares = B_TRUE; 2224168404Spjd return (spa_vdev_exit(spa, NULL, txg, 0)); 2225168404Spjd } else { 2226168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2227168404Spjd } 2228168404Spjd } 2229168404Spjd 2230168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 2231168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2232168404Spjd 2233168404Spjd spa_strfree(vd->vdev_path); 2234168404Spjd vd->vdev_path = spa_strdup(newpath); 2235168404Spjd 2236168404Spjd vdev_config_dirty(vd->vdev_top); 2237168404Spjd 2238168404Spjd return (spa_vdev_exit(spa, NULL, txg, 0)); 2239168404Spjd} 2240168404Spjd 2241168404Spjd/* 2242168404Spjd * ========================================================================== 2243168404Spjd * SPA Scrubbing 2244168404Spjd * ========================================================================== 2245168404Spjd */ 2246168404Spjd 2247168404Spjdstatic void 2248168404Spjdspa_scrub_io_done(zio_t *zio) 2249168404Spjd{ 2250168404Spjd spa_t *spa = zio->io_spa; 2251168404Spjd 2252168404Spjd zio_data_buf_free(zio->io_data, zio->io_size); 2253168404Spjd 2254168404Spjd mutex_enter(&spa->spa_scrub_lock); 2255168404Spjd if (zio->io_error && 
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2256168404Spjd vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2257168404Spjd spa->spa_scrub_errors++; 2258168404Spjd mutex_enter(&vd->vdev_stat_lock); 2259168404Spjd vd->vdev_stat.vs_scrub_errors++; 2260168404Spjd mutex_exit(&vd->vdev_stat_lock); 2261168404Spjd } 2262168404Spjd 2263168404Spjd if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2264168404Spjd cv_broadcast(&spa->spa_scrub_io_cv); 2265168404Spjd 2266168404Spjd ASSERT(spa->spa_scrub_inflight >= 0); 2267168404Spjd 2268168404Spjd mutex_exit(&spa->spa_scrub_lock); 2269168404Spjd} 2270168404Spjd 2271168404Spjdstatic void 2272168404Spjdspa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2273168404Spjd zbookmark_t *zb) 2274168404Spjd{ 2275168404Spjd size_t size = BP_GET_LSIZE(bp); 2276168404Spjd void *data; 2277168404Spjd 2278168404Spjd mutex_enter(&spa->spa_scrub_lock); 2279168404Spjd /* 2280168404Spjd * Do not give too much work to vdev(s). 2281168404Spjd */ 2282168404Spjd while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2283168404Spjd cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2284168404Spjd } 2285168404Spjd spa->spa_scrub_inflight++; 2286168404Spjd mutex_exit(&spa->spa_scrub_lock); 2287168404Spjd 2288168404Spjd data = zio_data_buf_alloc(size); 2289168404Spjd 2290168404Spjd if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2291168404Spjd flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2292168404Spjd 2293168404Spjd flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2294168404Spjd 2295168404Spjd zio_nowait(zio_read(NULL, spa, bp, data, size, 2296168404Spjd spa_scrub_io_done, NULL, priority, flags, zb)); 2297168404Spjd} 2298168404Spjd 2299168404Spjd/* ARGSUSED */ 2300168404Spjdstatic int 2301168404Spjdspa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2302168404Spjd{ 2303168404Spjd blkptr_t *bp = &bc->bc_blkptr; 2304168404Spjd vdev_t *vd = spa->spa_root_vdev; 2305168404Spjd 
dva_t *dva = bp->blk_dva; 2306168404Spjd int needs_resilver = B_FALSE; 2307168404Spjd int d; 2308168404Spjd 2309168404Spjd if (bc->bc_errno) { 2310168404Spjd /* 2311168404Spjd * We can't scrub this block, but we can continue to scrub 2312168404Spjd * the rest of the pool. Note the error and move along. 2313168404Spjd */ 2314168404Spjd mutex_enter(&spa->spa_scrub_lock); 2315168404Spjd spa->spa_scrub_errors++; 2316168404Spjd mutex_exit(&spa->spa_scrub_lock); 2317168404Spjd 2318168404Spjd mutex_enter(&vd->vdev_stat_lock); 2319168404Spjd vd->vdev_stat.vs_scrub_errors++; 2320168404Spjd mutex_exit(&vd->vdev_stat_lock); 2321168404Spjd 2322168404Spjd return (ERESTART); 2323168404Spjd } 2324168404Spjd 2325168404Spjd ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2326168404Spjd 2327168404Spjd for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2328168404Spjd vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2329168404Spjd 2330168404Spjd ASSERT(vd != NULL); 2331168404Spjd 2332168404Spjd /* 2333168404Spjd * Keep track of how much data we've examined so that 2334168404Spjd * zpool(1M) status can make useful progress reports. 2335168404Spjd */ 2336168404Spjd mutex_enter(&vd->vdev_stat_lock); 2337168404Spjd vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2338168404Spjd mutex_exit(&vd->vdev_stat_lock); 2339168404Spjd 2340168404Spjd if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2341168404Spjd if (DVA_GET_GANG(&dva[d])) { 2342168404Spjd /* 2343168404Spjd * Gang members may be spread across multiple 2344168404Spjd * vdevs, so the best we can do is look at the 2345168404Spjd * pool-wide DTL. 2346168404Spjd * XXX -- it would be better to change our 2347168404Spjd * allocation policy to ensure that this can't 2348168404Spjd * happen. 
2349168404Spjd */ 2350168404Spjd vd = spa->spa_root_vdev; 2351168404Spjd } 2352168404Spjd if (vdev_dtl_contains(&vd->vdev_dtl_map, 2353168404Spjd bp->blk_birth, 1)) 2354168404Spjd needs_resilver = B_TRUE; 2355168404Spjd } 2356168404Spjd } 2357168404Spjd 2358168404Spjd if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2359168404Spjd spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2360168404Spjd ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2361168404Spjd else if (needs_resilver) 2362168404Spjd spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2363168404Spjd ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2364168404Spjd 2365168404Spjd return (0); 2366168404Spjd} 2367168404Spjd 2368168404Spjdstatic void 2369168404Spjdspa_scrub_thread(void *arg) 2370168404Spjd{ 2371168404Spjd spa_t *spa = arg; 2372168404Spjd callb_cpr_t cprinfo; 2373168404Spjd traverse_handle_t *th = spa->spa_scrub_th; 2374168404Spjd vdev_t *rvd = spa->spa_root_vdev; 2375168404Spjd pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2376168404Spjd int error = 0; 2377168404Spjd boolean_t complete; 2378168404Spjd 2379168404Spjd CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2380168404Spjd 2381168404Spjd /* 2382168404Spjd * If we're restarting due to a snapshot create/delete, 2383168404Spjd * wait for that to complete. 2384168404Spjd */ 2385168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 2386168404Spjd 2387168404Spjd dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2388168404Spjd scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2389168404Spjd spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2390168404Spjd 2391168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 2392168404Spjd vdev_reopen(rvd); /* purge all vdev caches */ 2393168404Spjd vdev_config_dirty(rvd); /* rewrite all disk labels */ 2394168404Spjd vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2395168404Spjd spa_config_exit(spa, FTAG); 2396168404Spjd 2397168404Spjd mutex_enter(&spa->spa_scrub_lock); 2398168404Spjd spa->spa_scrub_errors = 0; 2399168404Spjd spa->spa_scrub_active = 1; 2400168404Spjd ASSERT(spa->spa_scrub_inflight == 0); 2401168404Spjd 2402168404Spjd while (!spa->spa_scrub_stop) { 2403168404Spjd CALLB_CPR_SAFE_BEGIN(&cprinfo); 2404168404Spjd while (spa->spa_scrub_suspended) { 2405168404Spjd spa->spa_scrub_active = 0; 2406168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2407168404Spjd cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2408168404Spjd spa->spa_scrub_active = 1; 2409168404Spjd } 2410168404Spjd CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2411168404Spjd 2412168404Spjd if (spa->spa_scrub_restart_txg != 0) 2413168404Spjd break; 2414168404Spjd 2415168404Spjd mutex_exit(&spa->spa_scrub_lock); 2416168404Spjd error = traverse_more(th); 2417168404Spjd mutex_enter(&spa->spa_scrub_lock); 2418168404Spjd if (error != EAGAIN) 2419168404Spjd break; 2420168404Spjd } 2421168404Spjd 2422168404Spjd while (spa->spa_scrub_inflight) 2423168404Spjd cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2424168404Spjd 2425168404Spjd spa->spa_scrub_active = 0; 2426168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2427168404Spjd 2428168404Spjd mutex_exit(&spa->spa_scrub_lock); 2429168404Spjd 2430168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 2431168404Spjd 2432168404Spjd mutex_enter(&spa->spa_scrub_lock); 2433168404Spjd 2434168404Spjd /* 2435168404Spjd * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2436168404Spjd * AND the spa config lock to synchronize with any config changes 2437168404Spjd 
* that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2438168404Spjd */ 2439168404Spjd if (spa->spa_scrub_restart_txg != 0) 2440168404Spjd error = ERESTART; 2441168404Spjd 2442168404Spjd if (spa->spa_scrub_stop) 2443168404Spjd error = EINTR; 2444168404Spjd 2445168404Spjd /* 2446168404Spjd * Even if there were uncorrectable errors, we consider the scrub 2447168404Spjd * completed. The downside is that if there is a transient error during 2448168404Spjd * a resilver, we won't resilver the data properly to the target. But 2449168404Spjd * if the damage is permanent (more likely) we will resilver forever, 2450168404Spjd * which isn't really acceptable. Since there is enough information for 2451168404Spjd * the user to know what has failed and why, this seems like a more 2452168404Spjd * tractable approach. 2453168404Spjd */ 2454168404Spjd complete = (error == 0); 2455168404Spjd 2456168404Spjd dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2457168404Spjd scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2458168404Spjd spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2459168404Spjd error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2460168404Spjd 2461168404Spjd mutex_exit(&spa->spa_scrub_lock); 2462168404Spjd 2463168404Spjd /* 2464168404Spjd * If the scrub/resilver completed, update all DTLs to reflect this. 2465168404Spjd * Whether it succeeded or not, vacate all temporary scrub DTLs. 2466168404Spjd */ 2467168404Spjd vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2468168404Spjd complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2469168404Spjd vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2470168404Spjd spa_errlog_rotate(spa); 2471168404Spjd 2472168404Spjd spa_config_exit(spa, FTAG); 2473168404Spjd 2474168404Spjd mutex_enter(&spa->spa_scrub_lock); 2475168404Spjd 2476168404Spjd /* 2477168404Spjd * We may have finished replacing a device. 2478168404Spjd * Let the async thread assess this and handle the detach. 
2479168404Spjd */ 2480168404Spjd spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2481168404Spjd 2482168404Spjd /* 2483168404Spjd * If we were told to restart, our final act is to start a new scrub. 2484168404Spjd */ 2485168404Spjd if (error == ERESTART) 2486168404Spjd spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 2487168404Spjd SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2488168404Spjd 2489168404Spjd spa->spa_scrub_type = POOL_SCRUB_NONE; 2490168404Spjd spa->spa_scrub_active = 0; 2491168404Spjd spa->spa_scrub_thread = NULL; 2492168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2493168404Spjd CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2494168404Spjd thread_exit(); 2495168404Spjd} 2496168404Spjd 2497168404Spjdvoid 2498168404Spjdspa_scrub_suspend(spa_t *spa) 2499168404Spjd{ 2500168404Spjd mutex_enter(&spa->spa_scrub_lock); 2501168404Spjd spa->spa_scrub_suspended++; 2502168404Spjd while (spa->spa_scrub_active) { 2503168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2504168404Spjd cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2505168404Spjd } 2506168404Spjd while (spa->spa_scrub_inflight) 2507168404Spjd cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2508168404Spjd mutex_exit(&spa->spa_scrub_lock); 2509168404Spjd} 2510168404Spjd 2511168404Spjdvoid 2512168404Spjdspa_scrub_resume(spa_t *spa) 2513168404Spjd{ 2514168404Spjd mutex_enter(&spa->spa_scrub_lock); 2515168404Spjd ASSERT(spa->spa_scrub_suspended != 0); 2516168404Spjd if (--spa->spa_scrub_suspended == 0) 2517168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2518168404Spjd mutex_exit(&spa->spa_scrub_lock); 2519168404Spjd} 2520168404Spjd 2521168404Spjdvoid 2522168404Spjdspa_scrub_restart(spa_t *spa, uint64_t txg) 2523168404Spjd{ 2524168404Spjd /* 2525168404Spjd * Something happened (e.g. snapshot create/delete) that means 2526168404Spjd * we must restart any in-progress scrubs. The itinerary will 2527168404Spjd * fix this properly. 
2528168404Spjd */ 2529168404Spjd mutex_enter(&spa->spa_scrub_lock); 2530168404Spjd spa->spa_scrub_restart_txg = txg; 2531168404Spjd mutex_exit(&spa->spa_scrub_lock); 2532168404Spjd} 2533168404Spjd 2534168404Spjdint 2535168404Spjdspa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2536168404Spjd{ 2537168404Spjd space_seg_t *ss; 2538168404Spjd uint64_t mintxg, maxtxg; 2539168404Spjd vdev_t *rvd = spa->spa_root_vdev; 2540168404Spjd 2541168404Spjd if ((uint_t)type >= POOL_SCRUB_TYPES) 2542168404Spjd return (ENOTSUP); 2543168404Spjd 2544168404Spjd mutex_enter(&spa->spa_scrub_lock); 2545168404Spjd 2546168404Spjd /* 2547168404Spjd * If there's a scrub or resilver already in progress, stop it. 2548168404Spjd */ 2549168404Spjd while (spa->spa_scrub_thread != NULL) { 2550168404Spjd /* 2551168404Spjd * Don't stop a resilver unless forced. 2552168404Spjd */ 2553168404Spjd if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2554168404Spjd mutex_exit(&spa->spa_scrub_lock); 2555168404Spjd return (EBUSY); 2556168404Spjd } 2557168404Spjd spa->spa_scrub_stop = 1; 2558168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2559168404Spjd cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2560168404Spjd } 2561168404Spjd 2562168404Spjd /* 2563168404Spjd * Terminate the previous traverse. 
2564168404Spjd */ 2565168404Spjd if (spa->spa_scrub_th != NULL) { 2566168404Spjd traverse_fini(spa->spa_scrub_th); 2567168404Spjd spa->spa_scrub_th = NULL; 2568168404Spjd } 2569168404Spjd 2570168404Spjd if (rvd == NULL) { 2571168404Spjd ASSERT(spa->spa_scrub_stop == 0); 2572168404Spjd ASSERT(spa->spa_scrub_type == type); 2573168404Spjd ASSERT(spa->spa_scrub_restart_txg == 0); 2574168404Spjd mutex_exit(&spa->spa_scrub_lock); 2575168404Spjd return (0); 2576168404Spjd } 2577168404Spjd 2578168404Spjd mintxg = TXG_INITIAL - 1; 2579168404Spjd maxtxg = spa_last_synced_txg(spa) + 1; 2580168404Spjd 2581168404Spjd mutex_enter(&rvd->vdev_dtl_lock); 2582168404Spjd 2583168404Spjd if (rvd->vdev_dtl_map.sm_space == 0) { 2584168404Spjd /* 2585168404Spjd * The pool-wide DTL is empty. 2586168404Spjd * If this is a resilver, there's nothing to do except 2587168404Spjd * check whether any in-progress replacements have completed. 2588168404Spjd */ 2589168404Spjd if (type == POOL_SCRUB_RESILVER) { 2590168404Spjd type = POOL_SCRUB_NONE; 2591168404Spjd spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2592168404Spjd } 2593168404Spjd } else { 2594168404Spjd /* 2595168404Spjd * The pool-wide DTL is non-empty. 2596168404Spjd * If this is a normal scrub, upgrade to a resilver instead. 2597168404Spjd */ 2598168404Spjd if (type == POOL_SCRUB_EVERYTHING) 2599168404Spjd type = POOL_SCRUB_RESILVER; 2600168404Spjd } 2601168404Spjd 2602168404Spjd if (type == POOL_SCRUB_RESILVER) { 2603168404Spjd /* 2604168404Spjd * Determine the resilvering boundaries. 2605168404Spjd * 2606168404Spjd * Note: (mintxg, maxtxg) is an open interval, 2607168404Spjd * i.e. mintxg and maxtxg themselves are not included. 2608168404Spjd * 2609168404Spjd * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2610168404Spjd * so we don't claim to resilver a txg that's still changing. 
2611168404Spjd */ 2612168404Spjd ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2613168404Spjd mintxg = ss->ss_start - 1; 2614168404Spjd ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2615168404Spjd maxtxg = MIN(ss->ss_end, maxtxg); 2616168404Spjd } 2617168404Spjd 2618168404Spjd mutex_exit(&rvd->vdev_dtl_lock); 2619168404Spjd 2620168404Spjd spa->spa_scrub_stop = 0; 2621168404Spjd spa->spa_scrub_type = type; 2622168404Spjd spa->spa_scrub_restart_txg = 0; 2623168404Spjd 2624168404Spjd if (type != POOL_SCRUB_NONE) { 2625168404Spjd spa->spa_scrub_mintxg = mintxg; 2626168404Spjd spa->spa_scrub_maxtxg = maxtxg; 2627168404Spjd spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2628168404Spjd ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2629168404Spjd ZIO_FLAG_CANFAIL); 2630168404Spjd traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2631168404Spjd spa->spa_scrub_thread = thread_create(NULL, 0, 2632168404Spjd spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2633168404Spjd } 2634168404Spjd 2635168404Spjd mutex_exit(&spa->spa_scrub_lock); 2636168404Spjd 2637168404Spjd return (0); 2638168404Spjd} 2639168404Spjd 2640168404Spjd/* 2641168404Spjd * ========================================================================== 2642168404Spjd * SPA async task processing 2643168404Spjd * ========================================================================== 2644168404Spjd */ 2645168404Spjd 2646168404Spjdstatic void 2647168404Spjdspa_async_reopen(spa_t *spa) 2648168404Spjd{ 2649168404Spjd vdev_t *rvd = spa->spa_root_vdev; 2650168404Spjd vdev_t *tvd; 2651168404Spjd int c; 2652168404Spjd 2653168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 2654168404Spjd 2655168404Spjd for (c = 0; c < rvd->vdev_children; c++) { 2656168404Spjd tvd = rvd->vdev_child[c]; 2657168404Spjd if (tvd->vdev_reopen_wanted) { 2658168404Spjd tvd->vdev_reopen_wanted = 0; 2659168404Spjd vdev_reopen(tvd); 2660168404Spjd } 2661168404Spjd } 2662168404Spjd 2663168404Spjd spa_config_exit(spa, FTAG); 
2664168404Spjd} 2665168404Spjd 2666168404Spjdstatic void 2667168404Spjdspa_async_thread(void *arg) 2668168404Spjd{ 2669168404Spjd spa_t *spa = arg; 2670168404Spjd int tasks; 2671168404Spjd 2672168404Spjd ASSERT(spa->spa_sync_on); 2673168404Spjd 2674168404Spjd mutex_enter(&spa->spa_async_lock); 2675168404Spjd tasks = spa->spa_async_tasks; 2676168404Spjd spa->spa_async_tasks = 0; 2677168404Spjd mutex_exit(&spa->spa_async_lock); 2678168404Spjd 2679168404Spjd /* 2680168404Spjd * See if the config needs to be updated. 2681168404Spjd */ 2682168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2683168404Spjd mutex_enter(&spa_namespace_lock); 2684168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2685168404Spjd mutex_exit(&spa_namespace_lock); 2686168404Spjd } 2687168404Spjd 2688168404Spjd /* 2689168404Spjd * See if any devices need to be reopened. 2690168404Spjd */ 2691168404Spjd if (tasks & SPA_ASYNC_REOPEN) 2692168404Spjd spa_async_reopen(spa); 2693168404Spjd 2694168404Spjd /* 2695168404Spjd * If any devices are done replacing, detach them. 2696168404Spjd */ 2697168404Spjd if (tasks & SPA_ASYNC_REPLACE_DONE) 2698168404Spjd spa_vdev_replace_done(spa); 2699168404Spjd 2700168404Spjd /* 2701168404Spjd * Kick off a scrub. 2702168404Spjd */ 2703168404Spjd if (tasks & SPA_ASYNC_SCRUB) 2704168404Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2705168404Spjd 2706168404Spjd /* 2707168404Spjd * Kick off a resilver. 2708168404Spjd */ 2709168404Spjd if (tasks & SPA_ASYNC_RESILVER) 2710168404Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2711168404Spjd 2712168404Spjd /* 2713168404Spjd * Let the world know that we're done. 
2714168404Spjd */ 2715168404Spjd mutex_enter(&spa->spa_async_lock); 2716168404Spjd spa->spa_async_thread = NULL; 2717168404Spjd cv_broadcast(&spa->spa_async_cv); 2718168404Spjd mutex_exit(&spa->spa_async_lock); 2719168404Spjd thread_exit(); 2720168404Spjd} 2721168404Spjd 2722168404Spjdvoid 2723168404Spjdspa_async_suspend(spa_t *spa) 2724168404Spjd{ 2725168404Spjd mutex_enter(&spa->spa_async_lock); 2726168404Spjd spa->spa_async_suspended++; 2727168404Spjd while (spa->spa_async_thread != NULL) 2728168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2729168404Spjd mutex_exit(&spa->spa_async_lock); 2730168404Spjd} 2731168404Spjd 2732168404Spjdvoid 2733168404Spjdspa_async_resume(spa_t *spa) 2734168404Spjd{ 2735168404Spjd mutex_enter(&spa->spa_async_lock); 2736168404Spjd ASSERT(spa->spa_async_suspended != 0); 2737168404Spjd spa->spa_async_suspended--; 2738168404Spjd mutex_exit(&spa->spa_async_lock); 2739168404Spjd} 2740168404Spjd 2741168404Spjdstatic void 2742168404Spjdspa_async_dispatch(spa_t *spa) 2743168404Spjd{ 2744168404Spjd mutex_enter(&spa->spa_async_lock); 2745168404Spjd if (spa->spa_async_tasks && !spa->spa_async_suspended && 2746168404Spjd spa->spa_async_thread == NULL && 2747168404Spjd rootdir != NULL && !vn_is_readonly(rootdir)) 2748168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 2749168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2750168404Spjd mutex_exit(&spa->spa_async_lock); 2751168404Spjd} 2752168404Spjd 2753168404Spjdvoid 2754168404Spjdspa_async_request(spa_t *spa, int task) 2755168404Spjd{ 2756168404Spjd mutex_enter(&spa->spa_async_lock); 2757168404Spjd spa->spa_async_tasks |= task; 2758168404Spjd mutex_exit(&spa->spa_async_lock); 2759168404Spjd} 2760168404Spjd 2761168404Spjd/* 2762168404Spjd * ========================================================================== 2763168404Spjd * SPA syncing routines 2764168404Spjd * ========================================================================== 2765168404Spjd */ 
2766168404Spjd 2767168404Spjdstatic void 2768168404Spjdspa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2769168404Spjd{ 2770168404Spjd bplist_t *bpl = &spa->spa_sync_bplist; 2771168404Spjd dmu_tx_t *tx; 2772168404Spjd blkptr_t blk; 2773168404Spjd uint64_t itor = 0; 2774168404Spjd zio_t *zio; 2775168404Spjd int error; 2776168404Spjd uint8_t c = 1; 2777168404Spjd 2778168404Spjd zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2779168404Spjd 2780168404Spjd while (bplist_iterate(bpl, &itor, &blk) == 0) 2781168404Spjd zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2782168404Spjd 2783168404Spjd error = zio_wait(zio); 2784168404Spjd ASSERT3U(error, ==, 0); 2785168404Spjd 2786168404Spjd tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2787168404Spjd bplist_vacate(bpl, tx); 2788168404Spjd 2789168404Spjd /* 2790168404Spjd * Pre-dirty the first block so we sync to convergence faster. 2791168404Spjd * (Usually only the first block is needed.) 2792168404Spjd */ 2793168404Spjd dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2794168404Spjd dmu_tx_commit(tx); 2795168404Spjd} 2796168404Spjd 2797168404Spjdstatic void 2798168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2799168404Spjd{ 2800168404Spjd char *packed = NULL; 2801168404Spjd size_t nvsize = 0; 2802168404Spjd dmu_buf_t *db; 2803168404Spjd 2804168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2805168404Spjd 2806168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 2807168404Spjd 2808168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2809168404Spjd KM_SLEEP) == 0); 2810168404Spjd 2811168404Spjd dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2812168404Spjd 2813168404Spjd kmem_free(packed, nvsize); 2814168404Spjd 2815168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2816168404Spjd dmu_buf_will_dirty(db, tx); 2817168404Spjd *(uint64_t *)db->db_data = nvsize; 2818168404Spjd 
dmu_buf_rele(db, FTAG); 2819168404Spjd} 2820168404Spjd 2821168404Spjdstatic void 2822168404Spjdspa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2823168404Spjd{ 2824168404Spjd nvlist_t *nvroot; 2825168404Spjd nvlist_t **spares; 2826168404Spjd int i; 2827168404Spjd 2828168404Spjd if (!spa->spa_sync_spares) 2829168404Spjd return; 2830168404Spjd 2831168404Spjd /* 2832168404Spjd * Update the MOS nvlist describing the list of available spares. 2833168404Spjd * spa_validate_spares() will have already made sure this nvlist is 2834168404Spjd * valid and the vdevs are labelled appropriately. 2835168404Spjd */ 2836168404Spjd if (spa->spa_spares_object == 0) { 2837168404Spjd spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2838168404Spjd DMU_OT_PACKED_NVLIST, 1 << 14, 2839168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2840168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 2841168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2842168404Spjd sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2843168404Spjd } 2844168404Spjd 2845168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2846168404Spjd if (spa->spa_nspares == 0) { 2847168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2848168404Spjd NULL, 0) == 0); 2849168404Spjd } else { 2850168404Spjd spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2851168404Spjd KM_SLEEP); 2852168404Spjd for (i = 0; i < spa->spa_nspares; i++) 2853168404Spjd spares[i] = vdev_config_generate(spa, 2854168404Spjd spa->spa_spares[i], B_FALSE, B_TRUE); 2855168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2856168404Spjd spares, spa->spa_nspares) == 0); 2857168404Spjd for (i = 0; i < spa->spa_nspares; i++) 2858168404Spjd nvlist_free(spares[i]); 2859168404Spjd kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2860168404Spjd } 2861168404Spjd 2862168404Spjd spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2863168404Spjd nvlist_free(nvroot); 
2864168404Spjd 2865168404Spjd spa->spa_sync_spares = B_FALSE; 2866168404Spjd} 2867168404Spjd 2868168404Spjdstatic void 2869168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2870168404Spjd{ 2871168404Spjd nvlist_t *config; 2872168404Spjd 2873168404Spjd if (list_is_empty(&spa->spa_dirty_list)) 2874168404Spjd return; 2875168404Spjd 2876168404Spjd config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2877168404Spjd 2878168404Spjd if (spa->spa_config_syncing) 2879168404Spjd nvlist_free(spa->spa_config_syncing); 2880168404Spjd spa->spa_config_syncing = config; 2881168404Spjd 2882168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2883168404Spjd} 2884168404Spjd 2885168404Spjdstatic void 2886168404Spjdspa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 2887168404Spjd{ 2888168404Spjd spa_t *spa = arg1; 2889168404Spjd nvlist_t *nvp = arg2; 2890168404Spjd nvpair_t *nvpair; 2891168404Spjd objset_t *mos = spa->spa_meta_objset; 2892168404Spjd uint64_t zapobj; 2893168404Spjd 2894168404Spjd mutex_enter(&spa->spa_props_lock); 2895168404Spjd if (spa->spa_pool_props_object == 0) { 2896168404Spjd zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 2897168404Spjd VERIFY(zapobj > 0); 2898168404Spjd 2899168404Spjd spa->spa_pool_props_object = zapobj; 2900168404Spjd 2901168404Spjd VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 2902168404Spjd DMU_POOL_PROPS, 8, 1, 2903168404Spjd &spa->spa_pool_props_object, tx) == 0); 2904168404Spjd } 2905168404Spjd mutex_exit(&spa->spa_props_lock); 2906168404Spjd 2907168404Spjd nvpair = NULL; 2908168404Spjd while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 2909168404Spjd switch (zpool_name_to_prop(nvpair_name(nvpair))) { 2910168404Spjd case ZFS_PROP_BOOTFS: 2911168404Spjd VERIFY(nvlist_lookup_uint64(nvp, 2912168404Spjd nvpair_name(nvpair), &spa->spa_bootfs) == 0); 2913168404Spjd VERIFY(zap_update(mos, 2914168404Spjd spa->spa_pool_props_object, 2915168404Spjd zpool_prop_to_name(ZFS_PROP_BOOTFS), 
8, 1, 2916168404Spjd &spa->spa_bootfs, tx) == 0); 2917168404Spjd break; 2918168404Spjd } 2919168404Spjd } 2920168404Spjd} 2921168404Spjd 2922168404Spjd/* 2923168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 2924168404Spjd * part of the process, so we iterate until it converges. 2925168404Spjd */ 2926168404Spjdvoid 2927168404Spjdspa_sync(spa_t *spa, uint64_t txg) 2928168404Spjd{ 2929168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 2930168404Spjd objset_t *mos = spa->spa_meta_objset; 2931168404Spjd bplist_t *bpl = &spa->spa_sync_bplist; 2932168404Spjd vdev_t *rvd = spa->spa_root_vdev; 2933168404Spjd vdev_t *vd; 2934168404Spjd dmu_tx_t *tx; 2935168404Spjd int dirty_vdevs; 2936168404Spjd 2937168404Spjd /* 2938168404Spjd * Lock out configuration changes. 2939168404Spjd */ 2940168404Spjd spa_config_enter(spa, RW_READER, FTAG); 2941168404Spjd 2942168404Spjd spa->spa_syncing_txg = txg; 2943168404Spjd spa->spa_sync_pass = 0; 2944168404Spjd 2945168404Spjd VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2946168404Spjd 2947168404Spjd tx = dmu_tx_create_assigned(dp, txg); 2948168404Spjd 2949168404Spjd /* 2950168404Spjd * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2951168404Spjd * set spa_deflate if we have no raid-z vdevs. 
2952168404Spjd */ 2953168404Spjd if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2954168404Spjd spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2955168404Spjd int i; 2956168404Spjd 2957168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 2958168404Spjd vd = rvd->vdev_child[i]; 2959168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2960168404Spjd break; 2961168404Spjd } 2962168404Spjd if (i == rvd->vdev_children) { 2963168404Spjd spa->spa_deflate = TRUE; 2964168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 2965168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2966168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2967168404Spjd } 2968168404Spjd } 2969168404Spjd 2970168404Spjd /* 2971168404Spjd * If anything has changed in this txg, push the deferred frees 2972168404Spjd * from the previous txg. If not, leave them alone so that we 2973168404Spjd * don't generate work on an otherwise idle system. 2974168404Spjd */ 2975168404Spjd if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2976168404Spjd !txg_list_empty(&dp->dp_dirty_dirs, txg) || 2977168404Spjd !txg_list_empty(&dp->dp_sync_tasks, txg)) 2978168404Spjd spa_sync_deferred_frees(spa, txg); 2979168404Spjd 2980168404Spjd /* 2981168404Spjd * Iterate to convergence. 
2982168404Spjd */ 2983168404Spjd do { 2984168404Spjd spa->spa_sync_pass++; 2985168404Spjd 2986168404Spjd spa_sync_config_object(spa, tx); 2987168404Spjd spa_sync_spares(spa, tx); 2988168404Spjd spa_errlog_sync(spa, txg); 2989168404Spjd dsl_pool_sync(dp, txg); 2990168404Spjd 2991168404Spjd dirty_vdevs = 0; 2992168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2993168404Spjd vdev_sync(vd, txg); 2994168404Spjd dirty_vdevs++; 2995168404Spjd } 2996168404Spjd 2997168404Spjd bplist_sync(bpl, tx); 2998168404Spjd } while (dirty_vdevs); 2999168404Spjd 3000168404Spjd bplist_close(bpl); 3001168404Spjd 3002168404Spjd dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 3003168404Spjd 3004168404Spjd /* 3005168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 3006168404Spjd * to commit the transaction group. 3007168404Spjd * 3008168404Spjd * If there are any dirty vdevs, sync the uberblock to all vdevs. 3009168404Spjd * Otherwise, pick a random top-level vdev that's known to be 3010168404Spjd * visible in the config cache (see spa_vdev_add() for details). 3011168404Spjd * If the write fails, try the next vdev until we're tried them all. 3012168404Spjd */ 3013168404Spjd if (!list_is_empty(&spa->spa_dirty_list)) { 3014168404Spjd VERIFY(vdev_config_sync(rvd, txg) == 0); 3015168404Spjd } else { 3016168404Spjd int children = rvd->vdev_children; 3017168404Spjd int c0 = spa_get_random(children); 3018168404Spjd int c; 3019168404Spjd 3020168404Spjd for (c = 0; c < children; c++) { 3021168404Spjd vd = rvd->vdev_child[(c0 + c) % children]; 3022168404Spjd if (vd->vdev_ms_array == 0) 3023168404Spjd continue; 3024168404Spjd if (vdev_config_sync(vd, txg) == 0) 3025168404Spjd break; 3026168404Spjd } 3027168404Spjd if (c == children) 3028168404Spjd VERIFY(vdev_config_sync(rvd, txg) == 0); 3029168404Spjd } 3030168404Spjd 3031168404Spjd dmu_tx_commit(tx); 3032168404Spjd 3033168404Spjd /* 3034168404Spjd * Clear the dirty config list. 
3035168404Spjd */ 3036168404Spjd while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 3037168404Spjd vdev_config_clean(vd); 3038168404Spjd 3039168404Spjd /* 3040168404Spjd * Now that the new config has synced transactionally, 3041168404Spjd * let it become visible to the config cache. 3042168404Spjd */ 3043168404Spjd if (spa->spa_config_syncing != NULL) { 3044168404Spjd spa_config_set(spa, spa->spa_config_syncing); 3045168404Spjd spa->spa_config_txg = txg; 3046168404Spjd spa->spa_config_syncing = NULL; 3047168404Spjd } 3048168404Spjd 3049168404Spjd /* 3050168404Spjd * Make a stable copy of the fully synced uberblock. 3051168404Spjd * We use this as the root for pool traversals. 3052168404Spjd */ 3053168404Spjd spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3054168404Spjd 3055168404Spjd spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3056168404Spjd 3057168404Spjd rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3058168404Spjd spa->spa_traverse_wanted = 0; 3059168404Spjd spa->spa_ubsync = spa->spa_uberblock; 3060168404Spjd rw_exit(&spa->spa_traverse_lock); 3061168404Spjd 3062168404Spjd spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3063168404Spjd 3064168404Spjd /* 3065168404Spjd * Clean up the ZIL records for the synced txg. 3066168404Spjd */ 3067168404Spjd dsl_pool_zil_clean(dp); 3068168404Spjd 3069168404Spjd /* 3070168404Spjd * Update usable space statistics. 3071168404Spjd */ 3072168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3073168404Spjd vdev_sync_done(vd, txg); 3074168404Spjd 3075168404Spjd /* 3076168404Spjd * It had better be the case that we didn't dirty anything 3077168404Spjd * since vdev_config_sync(). 
3078168404Spjd */ 3079168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3080168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3081168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3082168404Spjd ASSERT(bpl->bpl_queue == NULL); 3083168404Spjd 3084168404Spjd spa_config_exit(spa, FTAG); 3085168404Spjd 3086168404Spjd /* 3087168404Spjd * If any async tasks have been requested, kick them off. 3088168404Spjd */ 3089168404Spjd spa_async_dispatch(spa); 3090168404Spjd} 3091168404Spjd 3092168404Spjd/* 3093168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 3094168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 3095168404Spjd * sync. 3096168404Spjd */ 3097168404Spjdvoid 3098168404Spjdspa_sync_allpools(void) 3099168404Spjd{ 3100168404Spjd spa_t *spa = NULL; 3101168404Spjd mutex_enter(&spa_namespace_lock); 3102168404Spjd while ((spa = spa_next(spa)) != NULL) { 3103168404Spjd if (spa_state(spa) != POOL_STATE_ACTIVE) 3104168404Spjd continue; 3105168404Spjd spa_open_ref(spa, FTAG); 3106168404Spjd mutex_exit(&spa_namespace_lock); 3107168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 3108168404Spjd mutex_enter(&spa_namespace_lock); 3109168404Spjd spa_close(spa, FTAG); 3110168404Spjd } 3111168404Spjd mutex_exit(&spa_namespace_lock); 3112168404Spjd} 3113168404Spjd 3114168404Spjd/* 3115168404Spjd * ========================================================================== 3116168404Spjd * Miscellaneous routines 3117168404Spjd * ========================================================================== 3118168404Spjd */ 3119168404Spjd 3120168404Spjd/* 3121168404Spjd * Remove all pools in the system. 3122168404Spjd */ 3123168404Spjdvoid 3124168404Spjdspa_evict_all(void) 3125168404Spjd{ 3126168404Spjd spa_t *spa; 3127168404Spjd 3128168404Spjd /* 3129168404Spjd * Remove all cached state. 
All pools should be closed now, 3130168404Spjd * so every spa in the AVL tree should be unreferenced. 3131168404Spjd */ 3132168404Spjd mutex_enter(&spa_namespace_lock); 3133168404Spjd while ((spa = spa_next(NULL)) != NULL) { 3134168404Spjd /* 3135168404Spjd * Stop async tasks. The async thread may need to detach 3136168404Spjd * a device that's been replaced, which requires grabbing 3137168404Spjd * spa_namespace_lock, so we must drop it here. 3138168404Spjd */ 3139168404Spjd spa_open_ref(spa, FTAG); 3140168404Spjd mutex_exit(&spa_namespace_lock); 3141168404Spjd spa_async_suspend(spa); 3142168404Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3143168404Spjd mutex_enter(&spa_namespace_lock); 3144168404Spjd spa_close(spa, FTAG); 3145168404Spjd 3146168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3147168404Spjd spa_unload(spa); 3148168404Spjd spa_deactivate(spa); 3149168404Spjd } 3150168404Spjd spa_remove(spa); 3151168404Spjd } 3152168404Spjd mutex_exit(&spa_namespace_lock); 3153168404Spjd} 3154168404Spjd 3155168404Spjdvdev_t * 3156168404Spjdspa_lookup_by_guid(spa_t *spa, uint64_t guid) 3157168404Spjd{ 3158168404Spjd return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3159168404Spjd} 3160168404Spjd 3161168404Spjdvoid 3162168404Spjdspa_upgrade(spa_t *spa) 3163168404Spjd{ 3164168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 3165168404Spjd 3166168404Spjd /* 3167168404Spjd * This should only be called for a non-faulted pool, and since a 3168168404Spjd * future version would result in an unopenable pool, this shouldn't be 3169168404Spjd * possible. 
3170168404Spjd */ 3171168404Spjd ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3172168404Spjd 3173168404Spjd spa->spa_uberblock.ub_version = ZFS_VERSION; 3174168404Spjd vdev_config_dirty(spa->spa_root_vdev); 3175168404Spjd 3176168404Spjd spa_config_exit(spa, FTAG); 3177168404Spjd 3178168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 3179168404Spjd} 3180168404Spjd 3181168404Spjdboolean_t 3182168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 3183168404Spjd{ 3184168404Spjd int i; 3185168404Spjd uint64_t spareguid; 3186168404Spjd 3187168404Spjd for (i = 0; i < spa->spa_nspares; i++) 3188168404Spjd if (spa->spa_spares[i]->vdev_guid == guid) 3189168404Spjd return (B_TRUE); 3190168404Spjd 3191168404Spjd for (i = 0; i < spa->spa_pending_nspares; i++) { 3192168404Spjd if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3193168404Spjd ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3194168404Spjd spareguid == guid) 3195168404Spjd return (B_TRUE); 3196168404Spjd } 3197168404Spjd 3198168404Spjd return (B_FALSE); 3199168404Spjd} 3200168404Spjd 3201168404Spjdint 3202168404Spjdspa_set_props(spa_t *spa, nvlist_t *nvp) 3203168404Spjd{ 3204168404Spjd return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 3205168404Spjd spa, nvp, 3)); 3206168404Spjd} 3207168404Spjd 3208168404Spjdint 3209168404Spjdspa_get_props(spa_t *spa, nvlist_t **nvp) 3210168404Spjd{ 3211168404Spjd zap_cursor_t zc; 3212168404Spjd zap_attribute_t za; 3213168404Spjd objset_t *mos = spa->spa_meta_objset; 3214168404Spjd zfs_source_t src; 3215168404Spjd zfs_prop_t prop; 3216168404Spjd nvlist_t *propval; 3217168404Spjd uint64_t value; 3218168404Spjd int err; 3219168404Spjd 3220168404Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3221168404Spjd 3222168404Spjd mutex_enter(&spa->spa_props_lock); 3223168404Spjd /* If no props object, then just return empty nvlist */ 3224168404Spjd if (spa->spa_pool_props_object == 0) { 3225168404Spjd mutex_exit(&spa->spa_props_lock); 3226168404Spjd return (0); 
3227168404Spjd } 3228168404Spjd 3229168404Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 3230168404Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 3231168404Spjd zap_cursor_advance(&zc)) { 3232168404Spjd 3233168404Spjd if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 3234168404Spjd continue; 3235168404Spjd 3236168404Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3237168404Spjd switch (za.za_integer_length) { 3238168404Spjd case 8: 3239168404Spjd if (zfs_prop_default_numeric(prop) == 3240168404Spjd za.za_first_integer) 3241168404Spjd src = ZFS_SRC_DEFAULT; 3242168404Spjd else 3243168404Spjd src = ZFS_SRC_LOCAL; 3244168404Spjd value = za.za_first_integer; 3245168404Spjd 3246168404Spjd if (prop == ZFS_PROP_BOOTFS) { 3247168404Spjd dsl_pool_t *dp; 3248168404Spjd dsl_dataset_t *ds = NULL; 3249168404Spjd char strval[MAXPATHLEN]; 3250168404Spjd 3251168404Spjd dp = spa_get_dsl(spa); 3252168404Spjd rw_enter(&dp->dp_config_rwlock, RW_READER); 3253168404Spjd if ((err = dsl_dataset_open_obj(dp, 3254168404Spjd za.za_first_integer, NULL, DS_MODE_NONE, 3255168404Spjd FTAG, &ds)) != 0) { 3256168404Spjd rw_exit(&dp->dp_config_rwlock); 3257168404Spjd break; 3258168404Spjd } 3259168404Spjd dsl_dataset_name(ds, strval); 3260168404Spjd dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 3261168404Spjd rw_exit(&dp->dp_config_rwlock); 3262168404Spjd 3263168404Spjd VERIFY(nvlist_add_uint64(propval, 3264168404Spjd ZFS_PROP_SOURCE, src) == 0); 3265168404Spjd VERIFY(nvlist_add_string(propval, 3266168404Spjd ZFS_PROP_VALUE, strval) == 0); 3267168404Spjd } else { 3268168404Spjd VERIFY(nvlist_add_uint64(propval, 3269168404Spjd ZFS_PROP_SOURCE, src) == 0); 3270168404Spjd VERIFY(nvlist_add_uint64(propval, 3271168404Spjd ZFS_PROP_VALUE, value) == 0); 3272168404Spjd } 3273168404Spjd VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 3274168404Spjd propval) == 0); 3275168404Spjd break; 3276168404Spjd } 3277168404Spjd nvlist_free(propval); 3278168404Spjd } 
3279168404Spjd zap_cursor_fini(&zc); 3280168404Spjd mutex_exit(&spa->spa_props_lock); 3281168404Spjd if (err && err != ENOENT) { 3282168404Spjd nvlist_free(*nvp); 3283168404Spjd return (err); 3284168404Spjd } 3285168404Spjd 3286168404Spjd return (0); 3287168404Spjd} 3288168404Spjd 3289168404Spjd/* 3290168404Spjd * If the bootfs property value is dsobj, clear it. 3291168404Spjd */ 3292168404Spjdvoid 3293168404Spjdspa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 3294168404Spjd{ 3295168404Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 3296168404Spjd VERIFY(zap_remove(spa->spa_meta_objset, 3297168404Spjd spa->spa_pool_props_object, 3298168404Spjd zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); 3299168404Spjd spa->spa_bootfs = 0; 3300168404Spjd } 3301168404Spjd} 3302