spa.c revision 168926
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23168404Spjd * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24168404Spjd * Use is subject to license terms. 25168404Spjd */ 26168404Spjd 27168404Spjd#pragma ident "%Z%%M% %I% %E% SMI" 28168404Spjd 29168404Spjd/* 30168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 31168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 32168404Spjd * pool. 33168404Spjd */ 34168404Spjd 35168404Spjd#include <sys/zfs_context.h> 36168404Spjd#include <sys/fm/fs/zfs.h> 37168404Spjd#include <sys/spa_impl.h> 38168404Spjd#include <sys/zio.h> 39168404Spjd#include <sys/zio_checksum.h> 40168404Spjd#include <sys/zio_compress.h> 41168404Spjd#include <sys/dmu.h> 42168404Spjd#include <sys/dmu_tx.h> 43168404Spjd#include <sys/zap.h> 44168404Spjd#include <sys/zil.h> 45168404Spjd#include <sys/vdev_impl.h> 46168404Spjd#include <sys/metaslab.h> 47168404Spjd#include <sys/uberblock_impl.h> 48168404Spjd#include <sys/txg.h> 49168404Spjd#include <sys/avl.h> 50168404Spjd#include <sys/dmu_traverse.h> 51168404Spjd#include <sys/dmu_objset.h> 52168404Spjd#include <sys/unique.h> 53168404Spjd#include <sys/dsl_pool.h> 54168404Spjd#include <sys/dsl_dataset.h> 55168404Spjd#include <sys/dsl_dir.h> 56168404Spjd#include <sys/dsl_prop.h> 57168404Spjd#include <sys/dsl_synctask.h> 58168404Spjd#include <sys/fs/zfs.h> 59168404Spjd#include <sys/callb.h> 60168404Spjd 61168712Spjdint zio_taskq_threads = 0; 62168712SpjdSYSCTL_DECL(_vfs_zfs); 63168712SpjdSYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); 64168712SpjdTUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads); 65168712SpjdSYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW, 66168712Spjd &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type"); 67168404Spjd 68168712Spjd 69168404Spjd/* 70168404Spjd * ========================================================================== 71168404Spjd * SPA state manipulation (open/create/destroy/import/export) 72168404Spjd * ========================================================================== 73168404Spjd */ 74168404Spjd 75168404Spjdstatic int 76168404Spjdspa_error_entry_compare(const void *a, const void *b) 77168404Spjd{ 78168404Spjd spa_error_entry_t *sa = (spa_error_entry_t *)a; 79168404Spjd spa_error_entry_t *sb = (spa_error_entry_t *)b; 80168404Spjd int ret; 81168404Spjd 82168404Spjd ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 83168404Spjd sizeof (zbookmark_t)); 84168404Spjd 85168404Spjd if (ret < 0) 86168404Spjd return (-1); 87168404Spjd else if (ret > 0) 88168404Spjd return (1); 89168404Spjd else 90168404Spjd return (0); 91168404Spjd} 92168404Spjd 93168404Spjd/* 94168404Spjd * Utility function which retrieves copies of the current logs and 95168404Spjd * re-initializes them in the process. 96168404Spjd */ 97168404Spjdvoid 98168404Spjdspa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 99168404Spjd{ 100168404Spjd ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 101168404Spjd 102168404Spjd bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 103168404Spjd bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 104168404Spjd 105168404Spjd avl_create(&spa->spa_errlist_scrub, 106168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 107168404Spjd offsetof(spa_error_entry_t, se_avl)); 108168404Spjd avl_create(&spa->spa_errlist_last, 109168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 110168404Spjd offsetof(spa_error_entry_t, se_avl)); 111168404Spjd} 112168404Spjd 113168404Spjd/* 114168404Spjd * Activate an uninitialized pool. 115168404Spjd */ 116168404Spjdstatic void 117168404Spjdspa_activate(spa_t *spa) 118168404Spjd{ 119168404Spjd int t; 120168712Spjd int nthreads = zio_taskq_threads; 121168712Spjd char name[32]; 122168404Spjd 123168404Spjd ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 124168404Spjd 125168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 126168404Spjd 127168404Spjd spa->spa_normal_class = metaslab_class_create(); 128168404Spjd 129168712Spjd if (nthreads == 0) 130168715Spjd nthreads = max_ncpus; 131168404Spjd for (t = 0; t < ZIO_TYPES; t++) { 132168712Spjd snprintf(name, sizeof(name), "spa_zio_issue %d", t); 133168712Spjd spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads, 134168712Spjd maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 135168712Spjd snprintf(name, sizeof(name), "spa_zio_intr %d", t); 136168712Spjd spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads, 137168712Spjd maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 138168404Spjd } 139168404Spjd 140168404Spjd rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 141168404Spjd 142168404Spjd mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL); 143168404Spjd mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 144168404Spjd mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 145168404Spjd mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 146168404Spjd cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL); 147168404Spjd mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 148168404Spjd mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 149168404Spjd mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 150168404Spjd 151168404Spjd list_create(&spa->spa_dirty_list, sizeof (vdev_t), 152168404Spjd offsetof(vdev_t, vdev_dirty_node)); 153168404Spjd 154168404Spjd txg_list_create(&spa->spa_vdev_txg_list, 155168404Spjd offsetof(struct vdev, vdev_txg_node)); 156168404Spjd 157168404Spjd avl_create(&spa->spa_errlist_scrub, 158168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 159168404Spjd offsetof(spa_error_entry_t, se_avl)); 160168404Spjd avl_create(&spa->spa_errlist_last, 161168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 162168404Spjd offsetof(spa_error_entry_t, se_avl)); 163168404Spjd} 164168404Spjd 165168404Spjd/* 166168404Spjd * Opposite of spa_activate(). 167168404Spjd */ 168168404Spjdstatic void 169168404Spjdspa_deactivate(spa_t *spa) 170168404Spjd{ 171168404Spjd int t; 172168404Spjd 173168404Spjd ASSERT(spa->spa_sync_on == B_FALSE); 174168404Spjd ASSERT(spa->spa_dsl_pool == NULL); 175168404Spjd ASSERT(spa->spa_root_vdev == NULL); 176168404Spjd 177168404Spjd ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 178168404Spjd 179168404Spjd txg_list_destroy(&spa->spa_vdev_txg_list); 180168404Spjd 181168404Spjd list_destroy(&spa->spa_dirty_list); 182168404Spjd 183168404Spjd for (t = 0; t < ZIO_TYPES; t++) { 184168404Spjd taskq_destroy(spa->spa_zio_issue_taskq[t]); 185168404Spjd taskq_destroy(spa->spa_zio_intr_taskq[t]); 186168404Spjd spa->spa_zio_issue_taskq[t] = NULL; 187168404Spjd spa->spa_zio_intr_taskq[t] = NULL; 188168404Spjd } 189168404Spjd 190168404Spjd metaslab_class_destroy(spa->spa_normal_class); 191168404Spjd spa->spa_normal_class = NULL; 192168404Spjd 193168404Spjd /* 194168404Spjd * If this was part of an import or the open otherwise failed, we may 195168404Spjd * still have errors left in the queues. Empty them just in case. 196168404Spjd */ 197168404Spjd spa_errlog_drain(spa); 198168404Spjd 199168404Spjd avl_destroy(&spa->spa_errlist_scrub); 200168404Spjd avl_destroy(&spa->spa_errlist_last); 201168404Spjd 202168404Spjd rw_destroy(&spa->spa_traverse_lock); 203168404Spjd mutex_destroy(&spa->spa_uberblock_lock); 204168404Spjd mutex_destroy(&spa->spa_errlog_lock); 205168404Spjd mutex_destroy(&spa->spa_errlist_lock); 206168404Spjd mutex_destroy(&spa->spa_config_lock.scl_lock); 207168404Spjd cv_destroy(&spa->spa_config_lock.scl_cv); 208168404Spjd mutex_destroy(&spa->spa_sync_bplist.bpl_lock); 209168404Spjd mutex_destroy(&spa->spa_history_lock); 210168404Spjd mutex_destroy(&spa->spa_props_lock); 211168404Spjd 212168404Spjd spa->spa_state = POOL_STATE_UNINITIALIZED; 213168404Spjd} 214168404Spjd 215168404Spjd/* 216168404Spjd * Verify a pool configuration, and construct the vdev tree appropriately. This 217168404Spjd * will create all the necessary vdevs in the appropriate layout, with each vdev 218168404Spjd * in the CLOSED state. This will prep the pool before open/creation/import. 219168404Spjd * All vdev validation is done by the vdev_alloc() routine. 220168404Spjd */ 221168404Spjdstatic int 222168404Spjdspa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 223168404Spjd uint_t id, int atype) 224168404Spjd{ 225168404Spjd nvlist_t **child; 226168404Spjd uint_t c, children; 227168404Spjd int error; 228168404Spjd 229168404Spjd if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 230168404Spjd return (error); 231168404Spjd 232168404Spjd if ((*vdp)->vdev_ops->vdev_op_leaf) 233168404Spjd return (0); 234168404Spjd 235168404Spjd if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 236168404Spjd &child, &children) != 0) { 237168404Spjd vdev_free(*vdp); 238168404Spjd *vdp = NULL; 239168404Spjd return (EINVAL); 240168404Spjd } 241168404Spjd 242168404Spjd for (c = 0; c < children; c++) { 243168404Spjd vdev_t *vd; 244168404Spjd if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 245168404Spjd atype)) != 0) { 246168404Spjd vdev_free(*vdp); 247168404Spjd *vdp = NULL; 248168404Spjd return (error); 249168404Spjd } 250168404Spjd } 251168404Spjd 252168404Spjd ASSERT(*vdp != NULL); 253168404Spjd 254168404Spjd return (0); 255168404Spjd} 256168404Spjd 257168404Spjd/* 258168404Spjd * Opposite of spa_load(). 259168404Spjd */ 260168404Spjdstatic void 261168404Spjdspa_unload(spa_t *spa) 262168404Spjd{ 263168404Spjd int i; 264168404Spjd 265168404Spjd /* 266168404Spjd * Stop async tasks. 267168404Spjd */ 268168404Spjd spa_async_suspend(spa); 269168404Spjd 270168404Spjd /* 271168404Spjd * Stop syncing. 272168404Spjd */ 273168404Spjd if (spa->spa_sync_on) { 274168404Spjd txg_sync_stop(spa->spa_dsl_pool); 275168404Spjd spa->spa_sync_on = B_FALSE; 276168404Spjd } 277168404Spjd 278168404Spjd /* 279168404Spjd * Wait for any outstanding prefetch I/O to complete. 280168404Spjd */ 281168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 282168404Spjd spa_config_exit(spa, FTAG); 283168404Spjd 284168404Spjd /* 285168404Spjd * Close the dsl pool. 286168404Spjd */ 287168404Spjd if (spa->spa_dsl_pool) { 288168404Spjd dsl_pool_close(spa->spa_dsl_pool); 289168404Spjd spa->spa_dsl_pool = NULL; 290168404Spjd } 291168404Spjd 292168404Spjd /* 293168404Spjd * Close all vdevs. 294168404Spjd */ 295168404Spjd if (spa->spa_root_vdev) 296168404Spjd vdev_free(spa->spa_root_vdev); 297168404Spjd ASSERT(spa->spa_root_vdev == NULL); 298168404Spjd 299168404Spjd for (i = 0; i < spa->spa_nspares; i++) 300168404Spjd vdev_free(spa->spa_spares[i]); 301168404Spjd if (spa->spa_spares) { 302168404Spjd kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 303168404Spjd spa->spa_spares = NULL; 304168404Spjd } 305168404Spjd if (spa->spa_sparelist) { 306168404Spjd nvlist_free(spa->spa_sparelist); 307168404Spjd spa->spa_sparelist = NULL; 308168404Spjd } 309168404Spjd 310168404Spjd spa->spa_async_suspended = 0; 311168404Spjd} 312168404Spjd 313168404Spjd/* 314168404Spjd * Load (or re-load) the current list of vdevs describing the active spares for 315168404Spjd * this pool. When this is called, we have some form of basic information in 316168404Spjd * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 317168404Spjd * re-generate a more complete list including status information. 318168404Spjd */ 319168404Spjdstatic void 320168404Spjdspa_load_spares(spa_t *spa) 321168404Spjd{ 322168404Spjd nvlist_t **spares; 323168404Spjd uint_t nspares; 324168404Spjd int i; 325168404Spjd vdev_t *vd, *tvd; 326168404Spjd 327168404Spjd /* 328168404Spjd * First, close and free any existing spare vdevs. 329168404Spjd */ 330168404Spjd for (i = 0; i < spa->spa_nspares; i++) { 331168404Spjd vd = spa->spa_spares[i]; 332168404Spjd 333168404Spjd /* Undo the call to spa_activate() below */ 334168404Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 335168404Spjd tvd->vdev_isspare) 336168404Spjd spa_spare_remove(tvd); 337168404Spjd vdev_close(vd); 338168404Spjd vdev_free(vd); 339168404Spjd } 340168404Spjd 341168404Spjd if (spa->spa_spares) 342168404Spjd kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 343168404Spjd 344168404Spjd if (spa->spa_sparelist == NULL) 345168404Spjd nspares = 0; 346168404Spjd else 347168404Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 348168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 349168404Spjd 350168404Spjd spa->spa_nspares = (int)nspares; 351168404Spjd spa->spa_spares = NULL; 352168404Spjd 353168404Spjd if (nspares == 0) 354168404Spjd return; 355168404Spjd 356168404Spjd /* 357168404Spjd * Construct the array of vdevs, opening them to get status in the 358168404Spjd * process. For each spare, there is potentially two different vdev_t 359168404Spjd * structures associated with it: one in the list of spares (used only 360168404Spjd * for basic validation purposes) and one in the active vdev 361168404Spjd * configuration (if it's spared in). During this phase we open and 362168404Spjd * validate each vdev on the spare list. If the vdev also exists in the 363168404Spjd * active configuration, then we also mark this vdev as an active spare. 364168404Spjd */ 365168404Spjd spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 366168404Spjd for (i = 0; i < spa->spa_nspares; i++) { 367168404Spjd VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 368168404Spjd VDEV_ALLOC_SPARE) == 0); 369168404Spjd ASSERT(vd != NULL); 370168404Spjd 371168404Spjd spa->spa_spares[i] = vd; 372168404Spjd 373168404Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 374168404Spjd if (!tvd->vdev_isspare) 375168404Spjd spa_spare_add(tvd); 376168404Spjd 377168404Spjd /* 378168404Spjd * We only mark the spare active if we were successfully 379168404Spjd * able to load the vdev. Otherwise, importing a pool 380168404Spjd * with a bad active spare would result in strange 381168404Spjd * behavior, because multiple pool would think the spare 382168404Spjd * is actively in use. 383168404Spjd * 384168404Spjd * There is a vulnerability here to an equally bizarre 385168404Spjd * circumstance, where a dead active spare is later 386168404Spjd * brought back to life (onlined or otherwise). Given 387168404Spjd * the rarity of this scenario, and the extra complexity 388168404Spjd * it adds, we ignore the possibility. 389168404Spjd */ 390168404Spjd if (!vdev_is_dead(tvd)) 391168404Spjd spa_spare_activate(tvd); 392168404Spjd } 393168404Spjd 394168404Spjd if (vdev_open(vd) != 0) 395168404Spjd continue; 396168404Spjd 397168404Spjd vd->vdev_top = vd; 398168404Spjd (void) vdev_validate_spare(vd); 399168404Spjd } 400168404Spjd 401168404Spjd /* 402168404Spjd * Recompute the stashed list of spares, with status information 403168404Spjd * this time. 404168404Spjd */ 405168404Spjd VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 406168404Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 407168404Spjd 408168404Spjd spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 409168404Spjd for (i = 0; i < spa->spa_nspares; i++) 410168404Spjd spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 411168404Spjd B_TRUE, B_TRUE); 412168404Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 413168404Spjd spares, spa->spa_nspares) == 0); 414168404Spjd for (i = 0; i < spa->spa_nspares; i++) 415168404Spjd nvlist_free(spares[i]); 416168404Spjd kmem_free(spares, spa->spa_nspares * sizeof (void *)); 417168404Spjd} 418168404Spjd 419168404Spjdstatic int 420168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 421168404Spjd{ 422168404Spjd dmu_buf_t *db; 423168404Spjd char *packed = NULL; 424168404Spjd size_t nvsize = 0; 425168404Spjd int error; 426168404Spjd *value = NULL; 427168404Spjd 428168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 429168404Spjd nvsize = *(uint64_t *)db->db_data; 430168404Spjd dmu_buf_rele(db, FTAG); 431168404Spjd 432168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 433168404Spjd error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 434168404Spjd if (error == 0) 435168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 436168404Spjd kmem_free(packed, nvsize); 437168404Spjd 438168404Spjd return (error); 439168404Spjd} 440168404Spjd 441168404Spjd/* 442168404Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 443168404Spjd * source of configuration information. 444168404Spjd */ 445168404Spjdstatic int 446168404Spjdspa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 447168404Spjd{ 448168404Spjd int error = 0; 449168404Spjd nvlist_t *nvroot = NULL; 450168404Spjd vdev_t *rvd; 451168404Spjd uberblock_t *ub = &spa->spa_uberblock; 452168404Spjd uint64_t config_cache_txg = spa->spa_config_txg; 453168404Spjd uint64_t pool_guid; 454168404Spjd uint64_t version; 455168404Spjd zio_t *zio; 456168404Spjd 457168404Spjd spa->spa_load_state = state; 458168404Spjd 459168404Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 460168404Spjd nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 461168404Spjd error = EINVAL; 462168404Spjd goto out; 463168404Spjd } 464168404Spjd 465168404Spjd /* 466168404Spjd * Versioning wasn't explicitly added to the label until later, so if 467168404Spjd * it's not present treat it as the initial version. 468168404Spjd */ 469168404Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 470168404Spjd version = ZFS_VERSION_INITIAL; 471168404Spjd 472168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 473168404Spjd &spa->spa_config_txg); 474168404Spjd 475168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 476168404Spjd spa_guid_exists(pool_guid, 0)) { 477168404Spjd error = EEXIST; 478168404Spjd goto out; 479168404Spjd } 480168404Spjd 481168404Spjd spa->spa_load_guid = pool_guid; 482168404Spjd 483168404Spjd /* 484168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 485168404Spjd * value that will be returned by spa_version() since parsing the 486168404Spjd * configuration requires knowing the version number. 487168404Spjd */ 488168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 489168404Spjd spa->spa_ubsync.ub_version = version; 490168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 491168404Spjd spa_config_exit(spa, FTAG); 492168404Spjd 493168404Spjd if (error != 0) 494168404Spjd goto out; 495168404Spjd 496168404Spjd ASSERT(spa->spa_root_vdev == rvd); 497168404Spjd ASSERT(spa_guid(spa) == pool_guid); 498168404Spjd 499168404Spjd /* 500168404Spjd * Try to open all vdevs, loading each label in the process. 501168404Spjd */ 502168926Spjd error = vdev_open(rvd); 503168926Spjd if (error != 0) 504168404Spjd goto out; 505168404Spjd 506168404Spjd /* 507168404Spjd * Validate the labels for all leaf vdevs. We need to grab the config 508168404Spjd * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 509168404Spjd * flag. 510168404Spjd */ 511168404Spjd spa_config_enter(spa, RW_READER, FTAG); 512168404Spjd error = vdev_validate(rvd); 513168404Spjd spa_config_exit(spa, FTAG); 514168404Spjd 515168926Spjd if (error != 0) 516168404Spjd goto out; 517168404Spjd 518168404Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 519168404Spjd error = ENXIO; 520168404Spjd goto out; 521168404Spjd } 522168404Spjd 523168404Spjd /* 524168404Spjd * Find the best uberblock. 525168404Spjd */ 526168404Spjd bzero(ub, sizeof (uberblock_t)); 527168404Spjd 528168404Spjd zio = zio_root(spa, NULL, NULL, 529168404Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 530168404Spjd vdev_uberblock_load(zio, rvd, ub); 531168404Spjd error = zio_wait(zio); 532168404Spjd 533168404Spjd /* 534168404Spjd * If we weren't able to find a single valid uberblock, return failure. 535168404Spjd */ 536168404Spjd if (ub->ub_txg == 0) { 537168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 538168404Spjd VDEV_AUX_CORRUPT_DATA); 539168404Spjd error = ENXIO; 540168404Spjd goto out; 541168404Spjd } 542168404Spjd 543168404Spjd /* 544168404Spjd * If the pool is newer than the code, we can't open it. 545168404Spjd */ 546168404Spjd if (ub->ub_version > ZFS_VERSION) { 547168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 548168404Spjd VDEV_AUX_VERSION_NEWER); 549168404Spjd error = ENOTSUP; 550168404Spjd goto out; 551168404Spjd } 552168404Spjd 553168404Spjd /* 554168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 555168404Spjd * incomplete configuration. 556168404Spjd */ 557168404Spjd if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 558168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 559168404Spjd VDEV_AUX_BAD_GUID_SUM); 560168404Spjd error = ENXIO; 561168404Spjd goto out; 562168404Spjd } 563168404Spjd 564168404Spjd /* 565168404Spjd * Initialize internal SPA structures. 566168404Spjd */ 567168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 568168404Spjd spa->spa_ubsync = spa->spa_uberblock; 569168404Spjd spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 570168404Spjd error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 571168404Spjd if (error) { 572168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 573168404Spjd VDEV_AUX_CORRUPT_DATA); 574168404Spjd goto out; 575168404Spjd } 576168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 577168404Spjd 578168404Spjd if (zap_lookup(spa->spa_meta_objset, 579168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 580168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 581168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 582168404Spjd VDEV_AUX_CORRUPT_DATA); 583168404Spjd error = EIO; 584168404Spjd goto out; 585168404Spjd } 586168404Spjd 587168404Spjd if (!mosconfig) { 588168404Spjd nvlist_t *newconfig; 589168498Spjd uint64_t hostid; 590168404Spjd 591168404Spjd if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 592168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 593168404Spjd VDEV_AUX_CORRUPT_DATA); 594168404Spjd error = EIO; 595168404Spjd goto out; 596168404Spjd } 597168404Spjd 598168821Spjd /* 599168821Spjd * hostid is set after the root file system is mounted, so 600168821Spjd * ignore the check until it's done. 601168821Spjd */ 602168498Spjd if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 603168821Spjd &hostid) == 0 && root_mounted()) { 604168498Spjd char *hostname; 605168498Spjd unsigned long myhostid = 0; 606168498Spjd 607168498Spjd VERIFY(nvlist_lookup_string(newconfig, 608168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 609168498Spjd 610168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 611168498Spjd if ((unsigned long)hostid != myhostid) { 612168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 613168498Spjd "loaded as it was last accessed by " 614168498Spjd "another system (host: %s hostid: 0x%lx). " 615168498Spjd "See: http://www.sun.com/msg/ZFS-8000-EY", 616168498Spjd spa->spa_name, hostname, 617168498Spjd (unsigned long)hostid); 618168498Spjd error = EBADF; 619168498Spjd goto out; 620168498Spjd } 621168498Spjd } 622168498Spjd 623168404Spjd spa_config_set(spa, newconfig); 624168404Spjd spa_unload(spa); 625168404Spjd spa_deactivate(spa); 626168404Spjd spa_activate(spa); 627168404Spjd 628168404Spjd return (spa_load(spa, newconfig, state, B_TRUE)); 629168404Spjd } 630168404Spjd 631168404Spjd if (zap_lookup(spa->spa_meta_objset, 632168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 633168404Spjd sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 634168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 635168404Spjd VDEV_AUX_CORRUPT_DATA); 636168404Spjd error = EIO; 637168404Spjd goto out; 638168404Spjd } 639168404Spjd 640168404Spjd /* 641168404Spjd * Load the bit that tells us to use the new accounting function 642168404Spjd * (raid-z deflation). If we have an older pool, this will not 643168404Spjd * be present. 644168404Spjd */ 645168404Spjd error = zap_lookup(spa->spa_meta_objset, 646168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 647168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate); 648168404Spjd if (error != 0 && error != ENOENT) { 649168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 650168404Spjd VDEV_AUX_CORRUPT_DATA); 651168404Spjd error = EIO; 652168404Spjd goto out; 653168404Spjd } 654168404Spjd 655168404Spjd /* 656168404Spjd * Load the persistent error log. If we have an older pool, this will 657168404Spjd * not be present. 658168404Spjd */ 659168404Spjd error = zap_lookup(spa->spa_meta_objset, 660168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 661168404Spjd sizeof (uint64_t), 1, &spa->spa_errlog_last); 662168404Spjd if (error != 0 && error != ENOENT) { 663168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 664168404Spjd VDEV_AUX_CORRUPT_DATA); 665168404Spjd error = EIO; 666168404Spjd goto out; 667168404Spjd } 668168404Spjd 669168404Spjd error = zap_lookup(spa->spa_meta_objset, 670168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 671168404Spjd sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 672168404Spjd if (error != 0 && error != ENOENT) { 673168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 674168404Spjd VDEV_AUX_CORRUPT_DATA); 675168404Spjd error = EIO; 676168404Spjd goto out; 677168404Spjd } 678168404Spjd 679168404Spjd /* 680168404Spjd * Load the history object. If we have an older pool, this 681168404Spjd * will not be present. 682168404Spjd */ 683168404Spjd error = zap_lookup(spa->spa_meta_objset, 684168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 685168404Spjd sizeof (uint64_t), 1, &spa->spa_history); 686168404Spjd if (error != 0 && error != ENOENT) { 687168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 688168404Spjd VDEV_AUX_CORRUPT_DATA); 689168404Spjd error = EIO; 690168404Spjd goto out; 691168404Spjd } 692168404Spjd 693168404Spjd /* 694168404Spjd * Load any hot spares for this pool. 695168404Spjd */ 696168404Spjd error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 697168404Spjd DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 698168404Spjd if (error != 0 && error != ENOENT) { 699168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 700168404Spjd VDEV_AUX_CORRUPT_DATA); 701168404Spjd error = EIO; 702168404Spjd goto out; 703168404Spjd } 704168404Spjd if (error == 0) { 705168404Spjd ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 706168404Spjd if (load_nvlist(spa, spa->spa_spares_object, 707168404Spjd &spa->spa_sparelist) != 0) { 708168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 709168404Spjd VDEV_AUX_CORRUPT_DATA); 710168404Spjd error = EIO; 711168404Spjd goto out; 712168404Spjd } 713168404Spjd 714168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 715168404Spjd spa_load_spares(spa); 716168404Spjd spa_config_exit(spa, FTAG); 717168404Spjd } 718168404Spjd 719168404Spjd error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 720168404Spjd DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 721168404Spjd 722168404Spjd if (error && error != ENOENT) { 723168404Spjd vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 724168404Spjd VDEV_AUX_CORRUPT_DATA); 725168404Spjd error = EIO; 726168404Spjd goto out; 727168404Spjd } 728168404Spjd 729168404Spjd if (error == 0) { 730168404Spjd (void) zap_lookup(spa->spa_meta_objset, 731168404Spjd spa->spa_pool_props_object, 732168404Spjd zpool_prop_to_name(ZFS_PROP_BOOTFS), 733168404Spjd sizeof (uint64_t), 1, &spa->spa_bootfs); 734168404Spjd } 735168404Spjd 736168404Spjd /* 737168404Spjd * Load the vdev state for all toplevel vdevs. 738168404Spjd */ 739168404Spjd vdev_load(rvd); 740168404Spjd 741168404Spjd /* 742168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 743168404Spjd */ 744168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 745168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 746168404Spjd spa_config_exit(spa, FTAG); 747168404Spjd 748168404Spjd /* 749168404Spjd * Check the state of the root vdev. If it can't be opened, it 750168404Spjd * indicates one or more toplevel vdevs are faulted. 751168404Spjd */ 752168404Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 753168404Spjd error = ENXIO; 754168404Spjd goto out; 755168404Spjd } 756168404Spjd 757168404Spjd if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 758168404Spjd dmu_tx_t *tx; 759168404Spjd int need_update = B_FALSE; 760168404Spjd int c; 761168404Spjd 762168404Spjd /* 763168404Spjd * Claim log blocks that haven't been committed yet. 764168404Spjd * This must all happen in a single txg. 765168404Spjd */ 766168404Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), 767168404Spjd spa_first_txg(spa)); 768168404Spjd (void) dmu_objset_find(spa->spa_name, 769168404Spjd zil_claim, tx, DS_FIND_CHILDREN); 770168404Spjd dmu_tx_commit(tx); 771168404Spjd 772168404Spjd spa->spa_sync_on = B_TRUE; 773168404Spjd txg_sync_start(spa->spa_dsl_pool); 774168404Spjd 775168404Spjd /* 776168404Spjd * Wait for all claims to sync. 777168404Spjd */ 778168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 779168404Spjd 780168404Spjd /* 781168404Spjd * If the config cache is stale, or we have uninitialized 782168404Spjd * metaslabs (see spa_vdev_add()), then update the config. 783168404Spjd */ 784168404Spjd if (config_cache_txg != spa->spa_config_txg || 785168404Spjd state == SPA_LOAD_IMPORT) 786168404Spjd need_update = B_TRUE; 787168404Spjd 788168404Spjd for (c = 0; c < rvd->vdev_children; c++) 789168404Spjd if (rvd->vdev_child[c]->vdev_ms_array == 0) 790168404Spjd need_update = B_TRUE; 791168404Spjd 792168404Spjd /* 793168404Spjd * Update the config cache asychronously in case we're the 794168404Spjd * root pool, in which case the config cache isn't writable yet. 795168404Spjd */ 796168404Spjd if (need_update) 797168404Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 798168404Spjd } 799168404Spjd 800168404Spjd error = 0; 801168404Spjdout: 802168404Spjd if (error && error != EBADF) 803168404Spjd zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 804168404Spjd spa->spa_load_state = SPA_LOAD_NONE; 805168404Spjd spa->spa_ena = 0; 806168404Spjd 807168404Spjd return (error); 808168404Spjd} 809168404Spjd 810168404Spjd/* 811168404Spjd * Pool Open/Import 812168404Spjd * 813168404Spjd * The import case is identical to an open except that the configuration is sent 814168404Spjd * down from userland, instead of grabbed from the configuration cache. For the 815168404Spjd * case of an open, the pool configuration will exist in the 816168404Spjd * POOL_STATE_UNITIALIZED state. 817168404Spjd * 818168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 819168404Spjd * the same time open the pool, without having to keep around the spa_t in some 820168404Spjd * ambiguous state. 821168404Spjd */ 822168404Spjdstatic int 823168404Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 824168404Spjd{ 825168404Spjd spa_t *spa; 826168404Spjd int error; 827168404Spjd int loaded = B_FALSE; 828168404Spjd int locked = B_FALSE; 829168404Spjd 830168404Spjd *spapp = NULL; 831168404Spjd 832168404Spjd /* 833168404Spjd * As disgusting as this is, we need to support recursive calls to this 834168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 835168404Spjd * up calling spa_open() again. The real fix is to figure out how to 836168404Spjd * avoid dsl_dir_open() calling this in the first place. 837168404Spjd */ 838168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 839168404Spjd mutex_enter(&spa_namespace_lock); 840168404Spjd locked = B_TRUE; 841168404Spjd } 842168404Spjd 843168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 844168404Spjd if (locked) 845168404Spjd mutex_exit(&spa_namespace_lock); 846168404Spjd return (ENOENT); 847168404Spjd } 848168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 849168404Spjd 850168404Spjd spa_activate(spa); 851168404Spjd 852168404Spjd error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 853168404Spjd 854168404Spjd if (error == EBADF) { 855168404Spjd /* 856168404Spjd * If vdev_validate() returns failure (indicated by 857168404Spjd * EBADF), it indicates that one of the vdevs indicates 858168404Spjd * that the pool has been exported or destroyed. If 859168404Spjd * this is the case, the config cache is out of sync and 860168404Spjd * we should remove the pool from the namespace. 861168404Spjd */ 862168404Spjd zfs_post_ok(spa, NULL); 863168404Spjd spa_unload(spa); 864168404Spjd spa_deactivate(spa); 865168404Spjd spa_remove(spa); 866168404Spjd spa_config_sync(); 867168404Spjd if (locked) 868168404Spjd mutex_exit(&spa_namespace_lock); 869168404Spjd return (ENOENT); 870168404Spjd } 871168404Spjd 872168404Spjd if (error) { 873168404Spjd /* 874168404Spjd * We can't open the pool, but we still have useful 875168404Spjd * information: the state of each vdev after the 876168404Spjd * attempted vdev_open(). Return this to the user. 877168404Spjd */ 878168404Spjd if (config != NULL && spa->spa_root_vdev != NULL) { 879168404Spjd spa_config_enter(spa, RW_READER, FTAG); 880168404Spjd *config = spa_config_generate(spa, NULL, -1ULL, 881168404Spjd B_TRUE); 882168404Spjd spa_config_exit(spa, FTAG); 883168404Spjd } 884168404Spjd spa_unload(spa); 885168404Spjd spa_deactivate(spa); 886168404Spjd spa->spa_last_open_failed = B_TRUE; 887168404Spjd if (locked) 888168404Spjd mutex_exit(&spa_namespace_lock); 889168404Spjd *spapp = NULL; 890168404Spjd return (error); 891168404Spjd } else { 892168404Spjd zfs_post_ok(spa, NULL); 893168404Spjd spa->spa_last_open_failed = B_FALSE; 894168404Spjd } 895168404Spjd 896168404Spjd loaded = B_TRUE; 897168404Spjd } 898168404Spjd 899168404Spjd spa_open_ref(spa, tag); 900168404Spjd if (locked) 901168404Spjd mutex_exit(&spa_namespace_lock); 902168404Spjd 903168404Spjd *spapp = spa; 904168404Spjd 905168404Spjd if (config != NULL) { 906168404Spjd spa_config_enter(spa, RW_READER, FTAG); 907168404Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 908168404Spjd spa_config_exit(spa, FTAG); 909168404Spjd } 910168404Spjd 911168404Spjd /* 912168404Spjd * If we just loaded the pool, resilver anything that's out of date. 913168404Spjd */ 914168404Spjd if (loaded && (spa_mode & FWRITE)) 915168404Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 916168404Spjd 917168404Spjd return (0); 918168404Spjd} 919168404Spjd 920168404Spjdint 921168404Spjdspa_open(const char *name, spa_t **spapp, void *tag) 922168404Spjd{ 923168404Spjd return (spa_open_common(name, spapp, tag, NULL)); 924168404Spjd} 925168404Spjd 926168404Spjd/* 927168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 928168404Spjd * preventing it from being exported or destroyed. 929168404Spjd */ 930168404Spjdspa_t * 931168404Spjdspa_inject_addref(char *name) 932168404Spjd{ 933168404Spjd spa_t *spa; 934168404Spjd 935168404Spjd mutex_enter(&spa_namespace_lock); 936168404Spjd if ((spa = spa_lookup(name)) == NULL) { 937168404Spjd mutex_exit(&spa_namespace_lock); 938168404Spjd return (NULL); 939168404Spjd } 940168404Spjd spa->spa_inject_ref++; 941168404Spjd mutex_exit(&spa_namespace_lock); 942168404Spjd 943168404Spjd return (spa); 944168404Spjd} 945168404Spjd 946168404Spjdvoid 947168404Spjdspa_inject_delref(spa_t *spa) 948168404Spjd{ 949168404Spjd mutex_enter(&spa_namespace_lock); 950168404Spjd spa->spa_inject_ref--; 951168404Spjd mutex_exit(&spa_namespace_lock); 952168404Spjd} 953168404Spjd 954168404Spjdstatic void 955168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 956168404Spjd{ 957168404Spjd nvlist_t **spares; 958168404Spjd uint_t i, nspares; 959168404Spjd nvlist_t *nvroot; 960168404Spjd uint64_t guid; 961168404Spjd vdev_stat_t *vs; 962168404Spjd uint_t vsc; 963168404Spjd uint64_t pool; 964168404Spjd 965168404Spjd if (spa->spa_nspares == 0) 966168404Spjd return; 967168404Spjd 968168404Spjd VERIFY(nvlist_lookup_nvlist(config, 969168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 970168404Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 971168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 972168404Spjd if (nspares != 0) { 973168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 974168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 975168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 976168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 977168404Spjd 978168404Spjd /* 979168404Spjd * Go through and find any spares which have since been 980168404Spjd * repurposed as an active spare. If this is the case, update 981168404Spjd * their status appropriately. 982168404Spjd */ 983168404Spjd for (i = 0; i < nspares; i++) { 984168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 985168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 986168404Spjd if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 987168404Spjd VERIFY(nvlist_lookup_uint64_array( 988168404Spjd spares[i], ZPOOL_CONFIG_STATS, 989168404Spjd (uint64_t **)&vs, &vsc) == 0); 990168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 991168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 992168404Spjd } 993168404Spjd } 994168404Spjd } 995168404Spjd} 996168404Spjd 997168404Spjdint 998168404Spjdspa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 999168404Spjd{ 1000168404Spjd int error; 1001168404Spjd spa_t *spa; 1002168404Spjd 1003168404Spjd *config = NULL; 1004168404Spjd error = spa_open_common(name, &spa, FTAG, config); 1005168404Spjd 1006168404Spjd if (spa && *config != NULL) { 1007168404Spjd VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1008168404Spjd spa_get_errlog_size(spa)) == 0); 1009168404Spjd 1010168404Spjd spa_add_spares(spa, *config); 1011168404Spjd } 1012168404Spjd 1013168404Spjd /* 1014168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 1015168404Spjd * and call spa_lookup() directly. 1016168404Spjd */ 1017168404Spjd if (altroot) { 1018168404Spjd if (spa == NULL) { 1019168404Spjd mutex_enter(&spa_namespace_lock); 1020168404Spjd spa = spa_lookup(name); 1021168404Spjd if (spa) 1022168404Spjd spa_altroot(spa, altroot, buflen); 1023168404Spjd else 1024168404Spjd altroot[0] = '\0'; 1025168404Spjd spa = NULL; 1026168404Spjd mutex_exit(&spa_namespace_lock); 1027168404Spjd } else { 1028168404Spjd spa_altroot(spa, altroot, buflen); 1029168404Spjd } 1030168404Spjd } 1031168404Spjd 1032168404Spjd if (spa != NULL) 1033168404Spjd spa_close(spa, FTAG); 1034168404Spjd 1035168404Spjd return (error); 1036168404Spjd} 1037168404Spjd 1038168404Spjd/* 1039168404Spjd * Validate that the 'spares' array is well formed. We must have an array of 1040168404Spjd * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1041168404Spjd * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1042168404Spjd * as they are well-formed. 1043168404Spjd */ 1044168404Spjdstatic int 1045168404Spjdspa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1046168404Spjd{ 1047168404Spjd nvlist_t **spares; 1048168404Spjd uint_t i, nspares; 1049168404Spjd vdev_t *vd; 1050168404Spjd int error; 1051168404Spjd 1052168404Spjd /* 1053168404Spjd * It's acceptable to have no spares specified. 1054168404Spjd */ 1055168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1056168404Spjd &spares, &nspares) != 0) 1057168404Spjd return (0); 1058168404Spjd 1059168404Spjd if (nspares == 0) 1060168404Spjd return (EINVAL); 1061168404Spjd 1062168404Spjd /* 1063168404Spjd * Make sure the pool is formatted with a version that supports hot 1064168404Spjd * spares. 1065168404Spjd */ 1066168404Spjd if (spa_version(spa) < ZFS_VERSION_SPARES) 1067168404Spjd return (ENOTSUP); 1068168404Spjd 1069168404Spjd /* 1070168404Spjd * Set the pending spare list so we correctly handle device in-use 1071168404Spjd * checking. 1072168404Spjd */ 1073168404Spjd spa->spa_pending_spares = spares; 1074168404Spjd spa->spa_pending_nspares = nspares; 1075168404Spjd 1076168404Spjd for (i = 0; i < nspares; i++) { 1077168404Spjd if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1078168404Spjd mode)) != 0) 1079168404Spjd goto out; 1080168404Spjd 1081168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 1082168404Spjd vdev_free(vd); 1083168404Spjd error = EINVAL; 1084168404Spjd goto out; 1085168404Spjd } 1086168404Spjd 1087168404Spjd vd->vdev_top = vd; 1088168404Spjd 1089168404Spjd if ((error = vdev_open(vd)) == 0 && 1090168404Spjd (error = vdev_label_init(vd, crtxg, 1091168404Spjd VDEV_LABEL_SPARE)) == 0) { 1092168404Spjd VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1093168404Spjd vd->vdev_guid) == 0); 1094168404Spjd } 1095168404Spjd 1096168404Spjd vdev_free(vd); 1097168404Spjd 1098168404Spjd if (error && mode != VDEV_ALLOC_SPARE) 1099168404Spjd goto out; 1100168404Spjd else 1101168404Spjd error = 0; 1102168404Spjd } 1103168404Spjd 1104168404Spjdout: 1105168404Spjd spa->spa_pending_spares = NULL; 1106168404Spjd spa->spa_pending_nspares = 0; 1107168404Spjd return (error); 1108168404Spjd} 1109168404Spjd 1110168404Spjd/* 1111168404Spjd * Pool Creation 1112168404Spjd */ 1113168404Spjdint 1114168404Spjdspa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1115168404Spjd{ 1116168404Spjd spa_t *spa; 1117168404Spjd vdev_t *rvd; 1118168404Spjd dsl_pool_t *dp; 1119168404Spjd dmu_tx_t *tx; 1120168404Spjd int c, error = 0; 1121168404Spjd uint64_t txg = TXG_INITIAL; 1122168404Spjd nvlist_t **spares; 1123168404Spjd uint_t nspares; 1124168404Spjd 1125168404Spjd /* 1126168404Spjd * If this pool already exists, return failure. 1127168404Spjd */ 1128168404Spjd mutex_enter(&spa_namespace_lock); 1129168404Spjd if (spa_lookup(pool) != NULL) { 1130168404Spjd mutex_exit(&spa_namespace_lock); 1131168404Spjd return (EEXIST); 1132168404Spjd } 1133168404Spjd 1134168404Spjd /* 1135168404Spjd * Allocate a new spa_t structure. 1136168404Spjd */ 1137168404Spjd spa = spa_add(pool, altroot); 1138168404Spjd spa_activate(spa); 1139168404Spjd 1140168404Spjd spa->spa_uberblock.ub_txg = txg - 1; 1141168404Spjd spa->spa_uberblock.ub_version = ZFS_VERSION; 1142168404Spjd spa->spa_ubsync = spa->spa_uberblock; 1143168404Spjd 1144168404Spjd /* 1145168404Spjd * Create the root vdev. 1146168404Spjd */ 1147168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 1148168404Spjd 1149168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1150168404Spjd 1151168404Spjd ASSERT(error != 0 || rvd != NULL); 1152168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1153168404Spjd 1154168404Spjd if (error == 0 && rvd->vdev_children == 0) 1155168404Spjd error = EINVAL; 1156168404Spjd 1157168404Spjd if (error == 0 && 1158168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1159168404Spjd (error = spa_validate_spares(spa, nvroot, txg, 1160168404Spjd VDEV_ALLOC_ADD)) == 0) { 1161168404Spjd for (c = 0; c < rvd->vdev_children; c++) 1162168404Spjd vdev_init(rvd->vdev_child[c], txg); 1163168404Spjd vdev_config_dirty(rvd); 1164168404Spjd } 1165168404Spjd 1166168404Spjd spa_config_exit(spa, FTAG); 1167168404Spjd 1168168404Spjd if (error != 0) { 1169168404Spjd spa_unload(spa); 1170168404Spjd spa_deactivate(spa); 1171168404Spjd spa_remove(spa); 1172168404Spjd mutex_exit(&spa_namespace_lock); 1173168404Spjd return (error); 1174168404Spjd } 1175168404Spjd 1176168404Spjd /* 1177168404Spjd * Get the list of spares, if specified. 1178168404Spjd */ 1179168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1180168404Spjd &spares, &nspares) == 0) { 1181168404Spjd VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1182168404Spjd KM_SLEEP) == 0); 1183168404Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1184168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1185168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 1186168404Spjd spa_load_spares(spa); 1187168404Spjd spa_config_exit(spa, FTAG); 1188168404Spjd spa->spa_sync_spares = B_TRUE; 1189168404Spjd } 1190168404Spjd 1191168404Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1192168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 1193168404Spjd 1194168404Spjd tx = dmu_tx_create_assigned(dp, txg); 1195168404Spjd 1196168404Spjd /* 1197168404Spjd * Create the pool config object. 1198168404Spjd */ 1199168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1200168404Spjd DMU_OT_PACKED_NVLIST, 1 << 14, 1201168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1202168404Spjd 1203168404Spjd if (zap_add(spa->spa_meta_objset, 1204168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1205168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1206168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 1207168404Spjd } 1208168404Spjd 1209168404Spjd /* Newly created pools are always deflated. */ 1210168404Spjd spa->spa_deflate = TRUE; 1211168404Spjd if (zap_add(spa->spa_meta_objset, 1212168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1213168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1214168404Spjd cmn_err(CE_PANIC, "failed to add deflate"); 1215168404Spjd } 1216168404Spjd 1217168404Spjd /* 1218168404Spjd * Create the deferred-free bplist object. Turn off compression 1219168404Spjd * because sync-to-convergence takes longer if the blocksize 1220168404Spjd * keeps changing. 1221168404Spjd */ 1222168404Spjd spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1223168404Spjd 1 << 14, tx); 1224168404Spjd dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1225168404Spjd ZIO_COMPRESS_OFF, tx); 1226168404Spjd 1227168404Spjd if (zap_add(spa->spa_meta_objset, 1228168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1229168404Spjd sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1230168404Spjd cmn_err(CE_PANIC, "failed to add bplist"); 1231168404Spjd } 1232168404Spjd 1233168404Spjd /* 1234168404Spjd * Create the pool's history object. 1235168404Spjd */ 1236168404Spjd spa_history_create_obj(spa, tx); 1237168404Spjd 1238168404Spjd dmu_tx_commit(tx); 1239168404Spjd 1240168404Spjd spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); 1241168404Spjd spa->spa_sync_on = B_TRUE; 1242168404Spjd txg_sync_start(spa->spa_dsl_pool); 1243168404Spjd 1244168404Spjd /* 1245168404Spjd * We explicitly wait for the first transaction to complete so that our 1246168404Spjd * bean counters are appropriately updated. 1247168404Spjd */ 1248168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 1249168404Spjd 1250168404Spjd spa_config_sync(); 1251168404Spjd 1252168404Spjd mutex_exit(&spa_namespace_lock); 1253168404Spjd 1254168404Spjd return (0); 1255168404Spjd} 1256168404Spjd 1257168404Spjd/* 1258168404Spjd * Import the given pool into the system. We set up the necessary spa_t and 1259168404Spjd * then call spa_load() to do the dirty work. 1260168404Spjd */ 1261168404Spjdint 1262168404Spjdspa_import(const char *pool, nvlist_t *config, const char *altroot) 1263168404Spjd{ 1264168404Spjd spa_t *spa; 1265168404Spjd int error; 1266168404Spjd nvlist_t *nvroot; 1267168404Spjd nvlist_t **spares; 1268168404Spjd uint_t nspares; 1269168404Spjd 1270168404Spjd if (!(spa_mode & FWRITE)) 1271168404Spjd return (EROFS); 1272168404Spjd 1273168404Spjd /* 1274168404Spjd * If a pool with this name exists, return failure. 1275168404Spjd */ 1276168404Spjd mutex_enter(&spa_namespace_lock); 1277168404Spjd if (spa_lookup(pool) != NULL) { 1278168404Spjd mutex_exit(&spa_namespace_lock); 1279168404Spjd return (EEXIST); 1280168404Spjd } 1281168404Spjd 1282168404Spjd /* 1283168404Spjd * Create and initialize the spa structure. 1284168404Spjd */ 1285168404Spjd spa = spa_add(pool, altroot); 1286168404Spjd spa_activate(spa); 1287168404Spjd 1288168404Spjd /* 1289168404Spjd * Pass off the heavy lifting to spa_load(). 1290168404Spjd * Pass TRUE for mosconfig because the user-supplied config 1291168404Spjd * is actually the one to trust when doing an import. 1292168404Spjd */ 1293168404Spjd error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1294168404Spjd 1295168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 1296168404Spjd /* 1297168404Spjd * Toss any existing sparelist, as it doesn't have any validity anymore, 1298168404Spjd * and conflicts with spa_has_spare(). 1299168404Spjd */ 1300168404Spjd if (spa->spa_sparelist) { 1301168404Spjd nvlist_free(spa->spa_sparelist); 1302168404Spjd spa->spa_sparelist = NULL; 1303168404Spjd spa_load_spares(spa); 1304168404Spjd } 1305168404Spjd 1306168404Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1307168404Spjd &nvroot) == 0); 1308168404Spjd if (error == 0) 1309168404Spjd error = spa_validate_spares(spa, nvroot, -1ULL, 1310168404Spjd VDEV_ALLOC_SPARE); 1311168404Spjd spa_config_exit(spa, FTAG); 1312168404Spjd 1313168404Spjd if (error != 0) { 1314168404Spjd spa_unload(spa); 1315168404Spjd spa_deactivate(spa); 1316168404Spjd spa_remove(spa); 1317168404Spjd mutex_exit(&spa_namespace_lock); 1318168404Spjd return (error); 1319168404Spjd } 1320168404Spjd 1321168404Spjd /* 1322168404Spjd * Override any spares as specified by the user, as these may have 1323168404Spjd * correct device names/devids, etc. 1324168404Spjd */ 1325168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1326168404Spjd &spares, &nspares) == 0) { 1327168404Spjd if (spa->spa_sparelist) 1328168404Spjd VERIFY(nvlist_remove(spa->spa_sparelist, 1329168404Spjd ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1330168404Spjd else 1331168404Spjd VERIFY(nvlist_alloc(&spa->spa_sparelist, 1332168404Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 1333168404Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1334168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1335168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 1336168404Spjd spa_load_spares(spa); 1337168404Spjd spa_config_exit(spa, FTAG); 1338168404Spjd spa->spa_sync_spares = B_TRUE; 1339168404Spjd } 1340168404Spjd 1341168404Spjd /* 1342168404Spjd * Update the config cache to include the newly-imported pool. 1343168404Spjd */ 1344168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1345168404Spjd 1346168404Spjd mutex_exit(&spa_namespace_lock); 1347168404Spjd 1348168404Spjd /* 1349168404Spjd * Resilver anything that's out of date. 1350168404Spjd */ 1351168404Spjd if (spa_mode & FWRITE) 1352168404Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1353168404Spjd 1354168404Spjd return (0); 1355168404Spjd} 1356168404Spjd 1357168404Spjd/* 1358168404Spjd * This (illegal) pool name is used when temporarily importing a spa_t in order 1359168404Spjd * to get the vdev stats associated with the imported devices. 1360168404Spjd */ 1361168404Spjd#define TRYIMPORT_NAME "$import" 1362168404Spjd 1363168404Spjdnvlist_t * 1364168404Spjdspa_tryimport(nvlist_t *tryconfig) 1365168404Spjd{ 1366168404Spjd nvlist_t *config = NULL; 1367168404Spjd char *poolname; 1368168404Spjd spa_t *spa; 1369168404Spjd uint64_t state; 1370168404Spjd 1371168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1372168404Spjd return (NULL); 1373168404Spjd 1374168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1375168404Spjd return (NULL); 1376168404Spjd 1377168404Spjd /* 1378168404Spjd * Create and initialize the spa structure. 1379168404Spjd */ 1380168404Spjd mutex_enter(&spa_namespace_lock); 1381168404Spjd spa = spa_add(TRYIMPORT_NAME, NULL); 1382168404Spjd spa_activate(spa); 1383168404Spjd 1384168404Spjd /* 1385168404Spjd * Pass off the heavy lifting to spa_load(). 1386168404Spjd * Pass TRUE for mosconfig because the user-supplied config 1387168404Spjd * is actually the one to trust when doing an import. 1388168404Spjd */ 1389168404Spjd (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1390168404Spjd 1391168404Spjd /* 1392168404Spjd * If 'tryconfig' was at least parsable, return the current config. 1393168404Spjd */ 1394168404Spjd if (spa->spa_root_vdev != NULL) { 1395168404Spjd spa_config_enter(spa, RW_READER, FTAG); 1396168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1397168404Spjd spa_config_exit(spa, FTAG); 1398168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1399168404Spjd poolname) == 0); 1400168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1401168404Spjd state) == 0); 1402168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1403168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 1404168404Spjd 1405168404Spjd /* 1406168404Spjd * Add the list of hot spares. 1407168404Spjd */ 1408168404Spjd spa_add_spares(spa, config); 1409168404Spjd } 1410168404Spjd 1411168404Spjd spa_unload(spa); 1412168404Spjd spa_deactivate(spa); 1413168404Spjd spa_remove(spa); 1414168404Spjd mutex_exit(&spa_namespace_lock); 1415168404Spjd 1416168404Spjd return (config); 1417168404Spjd} 1418168404Spjd 1419168404Spjd/* 1420168404Spjd * Pool export/destroy 1421168404Spjd * 1422168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 1423168404Spjd * is no more pending I/O and any references to the pool are gone. Then, we 1424168404Spjd * update the pool state and sync all the labels to disk, removing the 1425168404Spjd * configuration from the cache afterwards. 1426168404Spjd */ 1427168404Spjdstatic int 1428168404Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1429168404Spjd{ 1430168404Spjd spa_t *spa; 1431168404Spjd 1432168404Spjd if (oldconfig) 1433168404Spjd *oldconfig = NULL; 1434168404Spjd 1435168404Spjd if (!(spa_mode & FWRITE)) 1436168404Spjd return (EROFS); 1437168404Spjd 1438168404Spjd mutex_enter(&spa_namespace_lock); 1439168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 1440168404Spjd mutex_exit(&spa_namespace_lock); 1441168404Spjd return (ENOENT); 1442168404Spjd } 1443168404Spjd 1444168404Spjd /* 1445168404Spjd * Put a hold on the pool, drop the namespace lock, stop async tasks, 1446168404Spjd * reacquire the namespace lock, and see if we can export. 1447168404Spjd */ 1448168404Spjd spa_open_ref(spa, FTAG); 1449168404Spjd mutex_exit(&spa_namespace_lock); 1450168404Spjd spa_async_suspend(spa); 1451168404Spjd mutex_enter(&spa_namespace_lock); 1452168404Spjd spa_close(spa, FTAG); 1453168404Spjd 1454168404Spjd /* 1455168404Spjd * The pool will be in core if it's openable, 1456168404Spjd * in which case we can modify its state. 1457168404Spjd */ 1458168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1459168404Spjd /* 1460168404Spjd * Objsets may be open only because they're dirty, so we 1461168404Spjd * have to force it to sync before checking spa_refcnt. 1462168404Spjd */ 1463168404Spjd spa_scrub_suspend(spa); 1464168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1465168404Spjd 1466168404Spjd /* 1467168404Spjd * A pool cannot be exported or destroyed if there are active 1468168404Spjd * references. If we are resetting a pool, allow references by 1469168404Spjd * fault injection handlers. 1470168404Spjd */ 1471168404Spjd if (!spa_refcount_zero(spa) || 1472168404Spjd (spa->spa_inject_ref != 0 && 1473168404Spjd new_state != POOL_STATE_UNINITIALIZED)) { 1474168404Spjd spa_scrub_resume(spa); 1475168404Spjd spa_async_resume(spa); 1476168404Spjd mutex_exit(&spa_namespace_lock); 1477168404Spjd return (EBUSY); 1478168404Spjd } 1479168404Spjd 1480168404Spjd spa_scrub_resume(spa); 1481168404Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1482168404Spjd 1483168404Spjd /* 1484168404Spjd * We want this to be reflected on every label, 1485168404Spjd * so mark them all dirty. spa_unload() will do the 1486168404Spjd * final sync that pushes these changes out. 1487168404Spjd */ 1488168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 1489168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 1490168404Spjd spa->spa_state = new_state; 1491168404Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1492168404Spjd vdev_config_dirty(spa->spa_root_vdev); 1493168404Spjd spa_config_exit(spa, FTAG); 1494168404Spjd } 1495168404Spjd } 1496168404Spjd 1497168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1498168404Spjd spa_unload(spa); 1499168404Spjd spa_deactivate(spa); 1500168404Spjd } 1501168404Spjd 1502168404Spjd if (oldconfig && spa->spa_config) 1503168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1504168404Spjd 1505168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 1506168404Spjd spa_remove(spa); 1507168404Spjd spa_config_sync(); 1508168404Spjd } 1509168404Spjd mutex_exit(&spa_namespace_lock); 1510168404Spjd 1511168404Spjd return (0); 1512168404Spjd} 1513168404Spjd 1514168404Spjd/* 1515168404Spjd * Destroy a storage pool. 1516168404Spjd */ 1517168404Spjdint 1518168404Spjdspa_destroy(char *pool) 1519168404Spjd{ 1520168404Spjd return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1521168404Spjd} 1522168404Spjd 1523168404Spjd/* 1524168404Spjd * Export a storage pool. 1525168404Spjd */ 1526168404Spjdint 1527168404Spjdspa_export(char *pool, nvlist_t **oldconfig) 1528168404Spjd{ 1529168404Spjd return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1530168404Spjd} 1531168404Spjd 1532168404Spjd/* 1533168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 1534168404Spjd * from the namespace in any way. 1535168404Spjd */ 1536168404Spjdint 1537168404Spjdspa_reset(char *pool) 1538168404Spjd{ 1539168404Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1540168404Spjd} 1541168404Spjd 1542168404Spjd 1543168404Spjd/* 1544168404Spjd * ========================================================================== 1545168404Spjd * Device manipulation 1546168404Spjd * ========================================================================== 1547168404Spjd */ 1548168404Spjd 1549168404Spjd/* 1550168404Spjd * Add capacity to a storage pool. 1551168404Spjd */ 1552168404Spjdint 1553168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1554168404Spjd{ 1555168404Spjd uint64_t txg; 1556168404Spjd int c, error; 1557168404Spjd vdev_t *rvd = spa->spa_root_vdev; 1558168404Spjd vdev_t *vd, *tvd; 1559168404Spjd nvlist_t **spares; 1560168404Spjd uint_t i, nspares; 1561168404Spjd 1562168404Spjd txg = spa_vdev_enter(spa); 1563168404Spjd 1564168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1565168404Spjd VDEV_ALLOC_ADD)) != 0) 1566168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 1567168404Spjd 1568168404Spjd spa->spa_pending_vdev = vd; 1569168404Spjd 1570168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1571168404Spjd &spares, &nspares) != 0) 1572168404Spjd nspares = 0; 1573168404Spjd 1574168404Spjd if (vd->vdev_children == 0 && nspares == 0) { 1575168404Spjd spa->spa_pending_vdev = NULL; 1576168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1577168404Spjd } 1578168404Spjd 1579168404Spjd if (vd->vdev_children != 0) { 1580168404Spjd if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1581168404Spjd spa->spa_pending_vdev = NULL; 1582168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 1583168404Spjd } 1584168404Spjd } 1585168404Spjd 1586168404Spjd /* 1587168404Spjd * We must validate the spares after checking the children. Otherwise, 1588168404Spjd * vdev_inuse() will blindly overwrite the spare. 1589168404Spjd */ 1590168404Spjd if ((error = spa_validate_spares(spa, nvroot, txg, 1591168404Spjd VDEV_ALLOC_ADD)) != 0) { 1592168404Spjd spa->spa_pending_vdev = NULL; 1593168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 1594168404Spjd } 1595168404Spjd 1596168404Spjd spa->spa_pending_vdev = NULL; 1597168404Spjd 1598168404Spjd /* 1599168404Spjd * Transfer each new top-level vdev from vd to rvd. 1600168404Spjd */ 1601168404Spjd for (c = 0; c < vd->vdev_children; c++) { 1602168404Spjd tvd = vd->vdev_child[c]; 1603168404Spjd vdev_remove_child(vd, tvd); 1604168404Spjd tvd->vdev_id = rvd->vdev_children; 1605168404Spjd vdev_add_child(rvd, tvd); 1606168404Spjd vdev_config_dirty(tvd); 1607168404Spjd } 1608168404Spjd 1609168404Spjd if (nspares != 0) { 1610168404Spjd if (spa->spa_sparelist != NULL) { 1611168404Spjd nvlist_t **oldspares; 1612168404Spjd uint_t oldnspares; 1613168404Spjd nvlist_t **newspares; 1614168404Spjd 1615168404Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1616168404Spjd ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1617168404Spjd 1618168404Spjd newspares = kmem_alloc(sizeof (void *) * 1619168404Spjd (nspares + oldnspares), KM_SLEEP); 1620168404Spjd for (i = 0; i < oldnspares; i++) 1621168404Spjd VERIFY(nvlist_dup(oldspares[i], 1622168404Spjd &newspares[i], KM_SLEEP) == 0); 1623168404Spjd for (i = 0; i < nspares; i++) 1624168404Spjd VERIFY(nvlist_dup(spares[i], 1625168404Spjd &newspares[i + oldnspares], 1626168404Spjd KM_SLEEP) == 0); 1627168404Spjd 1628168404Spjd VERIFY(nvlist_remove(spa->spa_sparelist, 1629168404Spjd ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1630168404Spjd 1631168404Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1632168404Spjd ZPOOL_CONFIG_SPARES, newspares, 1633168404Spjd nspares + oldnspares) == 0); 1634168404Spjd for (i = 0; i < oldnspares + nspares; i++) 1635168404Spjd nvlist_free(newspares[i]); 1636168404Spjd kmem_free(newspares, (oldnspares + nspares) * 1637168404Spjd sizeof (void *)); 1638168404Spjd } else { 1639168404Spjd VERIFY(nvlist_alloc(&spa->spa_sparelist, 1640168404Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 1641168404Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1642168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1643168404Spjd } 1644168404Spjd 1645168404Spjd spa_load_spares(spa); 1646168404Spjd spa->spa_sync_spares = B_TRUE; 1647168404Spjd } 1648168404Spjd 1649168404Spjd /* 1650168404Spjd * We have to be careful when adding new vdevs to an existing pool. 1651168404Spjd * If other threads start allocating from these vdevs before we 1652168404Spjd * sync the config cache, and we lose power, then upon reboot we may 1653168404Spjd * fail to open the pool because there are DVAs that the config cache 1654168404Spjd * can't translate. Therefore, we first add the vdevs without 1655168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1656168404Spjd * and then let spa_config_update() initialize the new metaslabs. 1657168404Spjd * 1658168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 1659168404Spjd * if we lose power at any point in this sequence, the remaining 1660168404Spjd * steps will be completed the next time we load the pool. 1661168404Spjd */ 1662168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 1663168404Spjd 1664168404Spjd mutex_enter(&spa_namespace_lock); 1665168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1666168404Spjd mutex_exit(&spa_namespace_lock); 1667168404Spjd 1668168404Spjd return (0); 1669168404Spjd} 1670168404Spjd 1671168404Spjd/* 1672168404Spjd * Attach a device to a mirror. The arguments are the path to any device 1673168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 1674168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 1675168404Spjd * 1676168404Spjd * If 'replacing' is specified, the new device is intended to replace the 1677168404Spjd * existing device; in this case the two devices are made into their own 1678168404Spjd * mirror using the 'replacing' vdev, which is functionally idendical to 1679168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 1680168404Spjd * extra rules: you can't attach to it after it's been created, and upon 1681168404Spjd * completion of resilvering, the first disk (the one being replaced) 1682168404Spjd * is automatically detached. 1683168404Spjd */ 1684168404Spjdint 1685168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1686168404Spjd{ 1687168404Spjd uint64_t txg, open_txg; 1688168404Spjd int error; 1689168404Spjd vdev_t *rvd = spa->spa_root_vdev; 1690168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1691168404Spjd vdev_ops_t *pvops; 1692168404Spjd 1693168404Spjd txg = spa_vdev_enter(spa); 1694168404Spjd 1695168404Spjd oldvd = vdev_lookup_by_guid(rvd, guid); 1696168404Spjd 1697168404Spjd if (oldvd == NULL) 1698168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1699168404Spjd 1700168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 1701168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1702168404Spjd 1703168404Spjd pvd = oldvd->vdev_parent; 1704168404Spjd 1705168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1706168404Spjd VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1707168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1708168404Spjd 1709168404Spjd newvd = newrootvd->vdev_child[0]; 1710168404Spjd 1711168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 1712168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1713168404Spjd 1714168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1715168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 1716168404Spjd 1717168404Spjd if (!replacing) { 1718168404Spjd /* 1719168404Spjd * For attach, the only allowable parent is a mirror or the root 1720168404Spjd * vdev. 1721168404Spjd */ 1722168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 1723168404Spjd pvd->vdev_ops != &vdev_root_ops) 1724168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1725168404Spjd 1726168404Spjd pvops = &vdev_mirror_ops; 1727168404Spjd } else { 1728168404Spjd /* 1729168404Spjd * Active hot spares can only be replaced by inactive hot 1730168404Spjd * spares. 1731168404Spjd */ 1732168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 1733168404Spjd pvd->vdev_child[1] == oldvd && 1734168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 1735168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1736168404Spjd 1737168404Spjd /* 1738168404Spjd * If the source is a hot spare, and the parent isn't already a 1739168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 1740168404Spjd * want to create a replacing vdev. The user is not allowed to 1741168404Spjd * attach to a spared vdev child unless the 'isspare' state is 1742168404Spjd * the same (spare replaces spare, non-spare replaces 1743168404Spjd * non-spare). 1744168404Spjd */ 1745168404Spjd if (pvd->vdev_ops == &vdev_replacing_ops) 1746168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1747168404Spjd else if (pvd->vdev_ops == &vdev_spare_ops && 1748168404Spjd newvd->vdev_isspare != oldvd->vdev_isspare) 1749168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1750168404Spjd else if (pvd->vdev_ops != &vdev_spare_ops && 1751168404Spjd newvd->vdev_isspare) 1752168404Spjd pvops = &vdev_spare_ops; 1753168404Spjd else 1754168404Spjd pvops = &vdev_replacing_ops; 1755168404Spjd } 1756168404Spjd 1757168404Spjd /* 1758168404Spjd * Compare the new device size with the replaceable/attachable 1759168404Spjd * device size. 1760168404Spjd */ 1761168404Spjd if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1762168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1763168404Spjd 1764168404Spjd /* 1765168404Spjd * The new device cannot have a higher alignment requirement 1766168404Spjd * than the top-level vdev. 1767168404Spjd */ 1768168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1769168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1770168404Spjd 1771168404Spjd /* 1772168404Spjd * If this is an in-place replacement, update oldvd's path and devid 1773168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 1774168404Spjd */ 1775168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1776168404Spjd spa_strfree(oldvd->vdev_path); 1777168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1778168404Spjd KM_SLEEP); 1779168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 1780168404Spjd newvd->vdev_path, "old"); 1781168404Spjd if (oldvd->vdev_devid != NULL) { 1782168404Spjd spa_strfree(oldvd->vdev_devid); 1783168404Spjd oldvd->vdev_devid = NULL; 1784168404Spjd } 1785168404Spjd } 1786168404Spjd 1787168404Spjd /* 1788168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 1789168404Spjd * mirror/replacing/spare vdev above oldvd. 1790168404Spjd */ 1791168404Spjd if (pvd->vdev_ops != pvops) 1792168404Spjd pvd = vdev_add_parent(oldvd, pvops); 1793168404Spjd 1794168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 1795168404Spjd ASSERT(pvd->vdev_ops == pvops); 1796168404Spjd ASSERT(oldvd->vdev_parent == pvd); 1797168404Spjd 1798168404Spjd /* 1799168404Spjd * Extract the new device from its root and add it to pvd. 1800168404Spjd */ 1801168404Spjd vdev_remove_child(newrootvd, newvd); 1802168404Spjd newvd->vdev_id = pvd->vdev_children; 1803168404Spjd vdev_add_child(pvd, newvd); 1804168404Spjd 1805168404Spjd /* 1806168404Spjd * If newvd is smaller than oldvd, but larger than its rsize, 1807168404Spjd * the addition of newvd may have decreased our parent's asize. 1808168404Spjd */ 1809168404Spjd pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1810168404Spjd 1811168404Spjd tvd = newvd->vdev_top; 1812168404Spjd ASSERT(pvd->vdev_top == tvd); 1813168404Spjd ASSERT(tvd->vdev_parent == rvd); 1814168404Spjd 1815168404Spjd vdev_config_dirty(tvd); 1816168404Spjd 1817168404Spjd /* 1818168404Spjd * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1819168404Spjd * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1820168404Spjd */ 1821168404Spjd open_txg = txg + TXG_CONCURRENT_STATES - 1; 1822168404Spjd 1823168404Spjd mutex_enter(&newvd->vdev_dtl_lock); 1824168404Spjd space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1825168404Spjd open_txg - TXG_INITIAL + 1); 1826168404Spjd mutex_exit(&newvd->vdev_dtl_lock); 1827168404Spjd 1828168404Spjd if (newvd->vdev_isspare) 1829168404Spjd spa_spare_activate(newvd); 1830168404Spjd 1831168404Spjd /* 1832168404Spjd * Mark newvd's DTL dirty in this txg. 1833168404Spjd */ 1834168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 1835168404Spjd 1836168404Spjd (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1837168404Spjd 1838168404Spjd /* 1839168404Spjd * Kick off a resilver to update newvd. 1840168404Spjd */ 1841168404Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1842168404Spjd 1843168404Spjd return (0); 1844168404Spjd} 1845168404Spjd 1846168404Spjd/* 1847168404Spjd * Detach a device from a mirror or replacing vdev. 1848168404Spjd * If 'replace_done' is specified, only detach if the parent 1849168404Spjd * is a replacing vdev. 1850168404Spjd */ 1851168404Spjdint 1852168404Spjdspa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1853168404Spjd{ 1854168404Spjd uint64_t txg; 1855168404Spjd int c, t, error; 1856168404Spjd vdev_t *rvd = spa->spa_root_vdev; 1857168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 1858168404Spjd boolean_t unspare = B_FALSE; 1859168404Spjd uint64_t unspare_guid; 1860168404Spjd 1861168404Spjd txg = spa_vdev_enter(spa); 1862168404Spjd 1863168404Spjd vd = vdev_lookup_by_guid(rvd, guid); 1864168404Spjd 1865168404Spjd if (vd == NULL) 1866168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1867168404Spjd 1868168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 1869168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1870168404Spjd 1871168404Spjd pvd = vd->vdev_parent; 1872168404Spjd 1873168404Spjd /* 1874168404Spjd * If replace_done is specified, only remove this device if it's 1875168404Spjd * the first child of a replacing vdev. For the 'spare' vdev, either 1876168404Spjd * disk can be removed. 1877168404Spjd */ 1878168404Spjd if (replace_done) { 1879168404Spjd if (pvd->vdev_ops == &vdev_replacing_ops) { 1880168404Spjd if (vd->vdev_id != 0) 1881168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1882168404Spjd } else if (pvd->vdev_ops != &vdev_spare_ops) { 1883168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1884168404Spjd } 1885168404Spjd } 1886168404Spjd 1887168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1888168404Spjd spa_version(spa) >= ZFS_VERSION_SPARES); 1889168404Spjd 1890168404Spjd /* 1891168404Spjd * Only mirror, replacing, and spare vdevs support detach. 1892168404Spjd */ 1893168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 1894168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 1895168404Spjd pvd->vdev_ops != &vdev_spare_ops) 1896168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1897168404Spjd 1898168404Spjd /* 1899168404Spjd * If there's only one replica, you can't detach it. 1900168404Spjd */ 1901168404Spjd if (pvd->vdev_children <= 1) 1902168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1903168404Spjd 1904168404Spjd /* 1905168404Spjd * If all siblings have non-empty DTLs, this device may have the only 1906168404Spjd * valid copy of the data, which means we cannot safely detach it. 1907168404Spjd * 1908168404Spjd * XXX -- as in the vdev_offline() case, we really want a more 1909168404Spjd * precise DTL check. 1910168404Spjd */ 1911168404Spjd for (c = 0; c < pvd->vdev_children; c++) { 1912168404Spjd uint64_t dirty; 1913168404Spjd 1914168404Spjd cvd = pvd->vdev_child[c]; 1915168404Spjd if (cvd == vd) 1916168404Spjd continue; 1917168404Spjd if (vdev_is_dead(cvd)) 1918168404Spjd continue; 1919168404Spjd mutex_enter(&cvd->vdev_dtl_lock); 1920168404Spjd dirty = cvd->vdev_dtl_map.sm_space | 1921168404Spjd cvd->vdev_dtl_scrub.sm_space; 1922168404Spjd mutex_exit(&cvd->vdev_dtl_lock); 1923168404Spjd if (!dirty) 1924168404Spjd break; 1925168404Spjd } 1926168404Spjd 1927168404Spjd /* 1928168404Spjd * If we are a replacing or spare vdev, then we can always detach the 1929168404Spjd * latter child, as that is how one cancels the operation. 1930168404Spjd */ 1931168404Spjd if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1932168404Spjd c == pvd->vdev_children) 1933168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1934168404Spjd 1935168404Spjd /* 1936168404Spjd * If we are detaching the original disk from a spare, then it implies 1937168404Spjd * that the spare should become a real disk, and be removed from the 1938168404Spjd * active spare list for the pool. 1939168404Spjd */ 1940168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 1941168404Spjd vd->vdev_id == 0) 1942168404Spjd unspare = B_TRUE; 1943168404Spjd 1944168404Spjd /* 1945168404Spjd * Erase the disk labels so the disk can be used for other things. 1946168404Spjd * This must be done after all other error cases are handled, 1947168404Spjd * but before we disembowel vd (so we can still do I/O to it). 1948168404Spjd * But if we can't do it, don't treat the error as fatal -- 1949168404Spjd * it may be that the unwritability of the disk is the reason 1950168404Spjd * it's being detached! 1951168404Spjd */ 1952168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1953168404Spjd 1954168404Spjd /* 1955168404Spjd * Remove vd from its parent and compact the parent's children. 1956168404Spjd */ 1957168404Spjd vdev_remove_child(pvd, vd); 1958168404Spjd vdev_compact_children(pvd); 1959168404Spjd 1960168404Spjd /* 1961168404Spjd * Remember one of the remaining children so we can get tvd below. 1962168404Spjd */ 1963168404Spjd cvd = pvd->vdev_child[0]; 1964168404Spjd 1965168404Spjd /* 1966168404Spjd * If we need to remove the remaining child from the list of hot spares, 1967168404Spjd * do it now, marking the vdev as no longer a spare in the process. We 1968168404Spjd * must do this before vdev_remove_parent(), because that can change the 1969168404Spjd * GUID if it creates a new toplevel GUID. 1970168404Spjd */ 1971168404Spjd if (unspare) { 1972168404Spjd ASSERT(cvd->vdev_isspare); 1973168404Spjd spa_spare_remove(cvd); 1974168404Spjd unspare_guid = cvd->vdev_guid; 1975168404Spjd } 1976168404Spjd 1977168404Spjd /* 1978168404Spjd * If the parent mirror/replacing vdev only has one child, 1979168404Spjd * the parent is no longer needed. Remove it from the tree. 1980168404Spjd */ 1981168404Spjd if (pvd->vdev_children == 1) 1982168404Spjd vdev_remove_parent(cvd); 1983168404Spjd 1984168404Spjd /* 1985168404Spjd * We don't set tvd until now because the parent we just removed 1986168404Spjd * may have been the previous top-level vdev. 1987168404Spjd */ 1988168404Spjd tvd = cvd->vdev_top; 1989168404Spjd ASSERT(tvd->vdev_parent == rvd); 1990168404Spjd 1991168404Spjd /* 1992168404Spjd * Reevaluate the parent vdev state. 1993168404Spjd */ 1994168404Spjd vdev_propagate_state(cvd->vdev_parent); 1995168404Spjd 1996168404Spjd /* 1997168404Spjd * If the device we just detached was smaller than the others, it may be 1998168404Spjd * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 1999168404Spjd * can't fail because the existing metaslabs are already in core, so 2000168404Spjd * there's nothing to read from disk. 2001168404Spjd */ 2002168404Spjd VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2003168404Spjd 2004168404Spjd vdev_config_dirty(tvd); 2005168404Spjd 2006168404Spjd /* 2007168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 2008168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 2009168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 2010168404Spjd * prevent vd from being accessed after it's freed. 2011168404Spjd */ 2012168404Spjd for (t = 0; t < TXG_SIZE; t++) 2013168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 2014168404Spjd vd->vdev_detached = B_TRUE; 2015168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 2016168404Spjd 2017168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 2018168404Spjd 2019168404Spjd /* 2020168404Spjd * If this was the removal of the original device in a hot spare vdev, 2021168404Spjd * then we want to go through and remove the device from the hot spare 2022168404Spjd * list of every other pool. 2023168404Spjd */ 2024168404Spjd if (unspare) { 2025168404Spjd spa = NULL; 2026168404Spjd mutex_enter(&spa_namespace_lock); 2027168404Spjd while ((spa = spa_next(spa)) != NULL) { 2028168404Spjd if (spa->spa_state != POOL_STATE_ACTIVE) 2029168404Spjd continue; 2030168404Spjd 2031168404Spjd (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2032168404Spjd } 2033168404Spjd mutex_exit(&spa_namespace_lock); 2034168404Spjd } 2035168404Spjd 2036168404Spjd return (error); 2037168404Spjd} 2038168404Spjd 2039168404Spjd/* 2040168404Spjd * Remove a device from the pool. Currently, this supports removing only hot 2041168404Spjd * spares. 2042168404Spjd */ 2043168404Spjdint 2044168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2045168404Spjd{ 2046168404Spjd vdev_t *vd; 2047168404Spjd nvlist_t **spares, *nv, **newspares; 2048168404Spjd uint_t i, j, nspares; 2049168404Spjd int ret = 0; 2050168404Spjd 2051168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 2052168404Spjd 2053168404Spjd vd = spa_lookup_by_guid(spa, guid); 2054168404Spjd 2055168404Spjd nv = NULL; 2056168404Spjd if (spa->spa_spares != NULL && 2057168404Spjd nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2058168404Spjd &spares, &nspares) == 0) { 2059168404Spjd for (i = 0; i < nspares; i++) { 2060168404Spjd uint64_t theguid; 2061168404Spjd 2062168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 2063168404Spjd ZPOOL_CONFIG_GUID, &theguid) == 0); 2064168404Spjd if (theguid == guid) { 2065168404Spjd nv = spares[i]; 2066168404Spjd break; 2067168404Spjd } 2068168404Spjd } 2069168404Spjd } 2070168404Spjd 2071168404Spjd /* 2072168404Spjd * We only support removing a hot spare, and only if it's not currently 2073168404Spjd * in use in this pool. 2074168404Spjd */ 2075168404Spjd if (nv == NULL && vd == NULL) { 2076168404Spjd ret = ENOENT; 2077168404Spjd goto out; 2078168404Spjd } 2079168404Spjd 2080168404Spjd if (nv == NULL && vd != NULL) { 2081168404Spjd ret = ENOTSUP; 2082168404Spjd goto out; 2083168404Spjd } 2084168404Spjd 2085168404Spjd if (!unspare && nv != NULL && vd != NULL) { 2086168404Spjd ret = EBUSY; 2087168404Spjd goto out; 2088168404Spjd } 2089168404Spjd 2090168404Spjd if (nspares == 1) { 2091168404Spjd newspares = NULL; 2092168404Spjd } else { 2093168404Spjd newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2094168404Spjd KM_SLEEP); 2095168404Spjd for (i = 0, j = 0; i < nspares; i++) { 2096168404Spjd if (spares[i] != nv) 2097168404Spjd VERIFY(nvlist_dup(spares[i], 2098168404Spjd &newspares[j++], KM_SLEEP) == 0); 2099168404Spjd } 2100168404Spjd } 2101168404Spjd 2102168404Spjd VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2103168404Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 2104168404Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2105168404Spjd newspares, nspares - 1) == 0); 2106168404Spjd for (i = 0; i < nspares - 1; i++) 2107168404Spjd nvlist_free(newspares[i]); 2108168404Spjd kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2109168404Spjd spa_load_spares(spa); 2110168404Spjd spa->spa_sync_spares = B_TRUE; 2111168404Spjd 2112168404Spjdout: 2113168404Spjd spa_config_exit(spa, FTAG); 2114168404Spjd 2115168404Spjd return (ret); 2116168404Spjd} 2117168404Spjd 2118168404Spjd/* 2119168404Spjd * Find any device that's done replacing, so we can detach it. 2120168404Spjd */ 2121168404Spjdstatic vdev_t * 2122168404Spjdspa_vdev_replace_done_hunt(vdev_t *vd) 2123168404Spjd{ 2124168404Spjd vdev_t *newvd, *oldvd; 2125168404Spjd int c; 2126168404Spjd 2127168404Spjd for (c = 0; c < vd->vdev_children; c++) { 2128168404Spjd oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 2129168404Spjd if (oldvd != NULL) 2130168404Spjd return (oldvd); 2131168404Spjd } 2132168404Spjd 2133168404Spjd if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2134168404Spjd oldvd = vd->vdev_child[0]; 2135168404Spjd newvd = vd->vdev_child[1]; 2136168404Spjd 2137168404Spjd mutex_enter(&newvd->vdev_dtl_lock); 2138168404Spjd if (newvd->vdev_dtl_map.sm_space == 0 && 2139168404Spjd newvd->vdev_dtl_scrub.sm_space == 0) { 2140168404Spjd mutex_exit(&newvd->vdev_dtl_lock); 2141168404Spjd return (oldvd); 2142168404Spjd } 2143168404Spjd mutex_exit(&newvd->vdev_dtl_lock); 2144168404Spjd } 2145168404Spjd 2146168404Spjd return (NULL); 2147168404Spjd} 2148168404Spjd 2149168404Spjdstatic void 2150168404Spjdspa_vdev_replace_done(spa_t *spa) 2151168404Spjd{ 2152168404Spjd vdev_t *vd; 2153168404Spjd vdev_t *pvd; 2154168404Spjd uint64_t guid; 2155168404Spjd uint64_t pguid = 0; 2156168404Spjd 2157168404Spjd spa_config_enter(spa, RW_READER, FTAG); 2158168404Spjd 2159168404Spjd while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2160168404Spjd guid = vd->vdev_guid; 2161168404Spjd /* 2162168404Spjd * If we have just finished replacing a hot spared device, then 2163168404Spjd * we need to detach the parent's first child (the original hot 2164168404Spjd * spare) as well. 2165168404Spjd */ 2166168404Spjd pvd = vd->vdev_parent; 2167168404Spjd if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2168168404Spjd pvd->vdev_id == 0) { 2169168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2170168404Spjd ASSERT(pvd->vdev_parent->vdev_children == 2); 2171168404Spjd pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2172168404Spjd } 2173168404Spjd spa_config_exit(spa, FTAG); 2174168404Spjd if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2175168404Spjd return; 2176168404Spjd if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2177168404Spjd return; 2178168404Spjd spa_config_enter(spa, RW_READER, FTAG); 2179168404Spjd } 2180168404Spjd 2181168404Spjd spa_config_exit(spa, FTAG); 2182168404Spjd} 2183168404Spjd 2184168404Spjd/* 2185168404Spjd * Update the stored path for this vdev. Dirty the vdev configuration, relying 2186168404Spjd * on spa_vdev_enter/exit() to synchronize the labels and cache. 2187168404Spjd */ 2188168404Spjdint 2189168404Spjdspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2190168404Spjd{ 2191168404Spjd vdev_t *rvd, *vd; 2192168404Spjd uint64_t txg; 2193168404Spjd 2194168404Spjd rvd = spa->spa_root_vdev; 2195168404Spjd 2196168404Spjd txg = spa_vdev_enter(spa); 2197168404Spjd 2198168404Spjd if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2199168404Spjd /* 2200168404Spjd * Determine if this is a reference to a hot spare. In that 2201168404Spjd * case, update the path as stored in the spare list. 2202168404Spjd */ 2203168404Spjd nvlist_t **spares; 2204168404Spjd uint_t i, nspares; 2205168404Spjd if (spa->spa_sparelist != NULL) { 2206168404Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2207168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2208168404Spjd for (i = 0; i < nspares; i++) { 2209168404Spjd uint64_t theguid; 2210168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 2211168404Spjd ZPOOL_CONFIG_GUID, &theguid) == 0); 2212168404Spjd if (theguid == guid) 2213168404Spjd break; 2214168404Spjd } 2215168404Spjd 2216168404Spjd if (i == nspares) 2217168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2218168404Spjd 2219168404Spjd VERIFY(nvlist_add_string(spares[i], 2220168404Spjd ZPOOL_CONFIG_PATH, newpath) == 0); 2221168404Spjd spa_load_spares(spa); 2222168404Spjd spa->spa_sync_spares = B_TRUE; 2223168404Spjd return (spa_vdev_exit(spa, NULL, txg, 0)); 2224168404Spjd } else { 2225168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2226168404Spjd } 2227168404Spjd } 2228168404Spjd 2229168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 2230168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2231168404Spjd 2232168404Spjd spa_strfree(vd->vdev_path); 2233168404Spjd vd->vdev_path = spa_strdup(newpath); 2234168404Spjd 2235168404Spjd vdev_config_dirty(vd->vdev_top); 2236168404Spjd 2237168404Spjd return (spa_vdev_exit(spa, NULL, txg, 0)); 2238168404Spjd} 2239168404Spjd 2240168404Spjd/* 2241168404Spjd * ========================================================================== 2242168404Spjd * SPA Scrubbing 2243168404Spjd * ========================================================================== 2244168404Spjd */ 2245168404Spjd 2246168404Spjdstatic void 2247168404Spjdspa_scrub_io_done(zio_t *zio) 2248168404Spjd{ 2249168404Spjd spa_t *spa = zio->io_spa; 2250168404Spjd 2251168404Spjd zio_data_buf_free(zio->io_data, zio->io_size); 2252168404Spjd 2253168404Spjd mutex_enter(&spa->spa_scrub_lock); 2254168404Spjd if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2255168404Spjd vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2256168404Spjd spa->spa_scrub_errors++; 2257168404Spjd mutex_enter(&vd->vdev_stat_lock); 2258168404Spjd vd->vdev_stat.vs_scrub_errors++; 2259168404Spjd mutex_exit(&vd->vdev_stat_lock); 2260168404Spjd } 2261168404Spjd 2262168404Spjd if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2263168404Spjd cv_broadcast(&spa->spa_scrub_io_cv); 2264168404Spjd 2265168404Spjd ASSERT(spa->spa_scrub_inflight >= 0); 2266168404Spjd 2267168404Spjd mutex_exit(&spa->spa_scrub_lock); 2268168404Spjd} 2269168404Spjd 2270168404Spjdstatic void 2271168404Spjdspa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2272168404Spjd zbookmark_t *zb) 2273168404Spjd{ 2274168404Spjd size_t size = BP_GET_LSIZE(bp); 2275168404Spjd void *data; 2276168404Spjd 2277168404Spjd mutex_enter(&spa->spa_scrub_lock); 2278168404Spjd /* 2279168404Spjd * Do not give too much work to vdev(s). 2280168404Spjd */ 2281168404Spjd while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2282168404Spjd cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2283168404Spjd } 2284168404Spjd spa->spa_scrub_inflight++; 2285168404Spjd mutex_exit(&spa->spa_scrub_lock); 2286168404Spjd 2287168404Spjd data = zio_data_buf_alloc(size); 2288168404Spjd 2289168404Spjd if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2290168404Spjd flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2291168404Spjd 2292168404Spjd flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2293168404Spjd 2294168404Spjd zio_nowait(zio_read(NULL, spa, bp, data, size, 2295168404Spjd spa_scrub_io_done, NULL, priority, flags, zb)); 2296168404Spjd} 2297168404Spjd 2298168404Spjd/* ARGSUSED */ 2299168404Spjdstatic int 2300168404Spjdspa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2301168404Spjd{ 2302168404Spjd blkptr_t *bp = &bc->bc_blkptr; 2303168404Spjd vdev_t *vd = spa->spa_root_vdev; 2304168404Spjd dva_t *dva = bp->blk_dva; 2305168404Spjd int needs_resilver = B_FALSE; 2306168404Spjd int d; 2307168404Spjd 2308168404Spjd if (bc->bc_errno) { 2309168404Spjd /* 2310168404Spjd * We can't scrub this block, but we can continue to scrub 2311168404Spjd * the rest of the pool. Note the error and move along. 2312168404Spjd */ 2313168404Spjd mutex_enter(&spa->spa_scrub_lock); 2314168404Spjd spa->spa_scrub_errors++; 2315168404Spjd mutex_exit(&spa->spa_scrub_lock); 2316168404Spjd 2317168404Spjd mutex_enter(&vd->vdev_stat_lock); 2318168404Spjd vd->vdev_stat.vs_scrub_errors++; 2319168404Spjd mutex_exit(&vd->vdev_stat_lock); 2320168404Spjd 2321168404Spjd return (ERESTART); 2322168404Spjd } 2323168404Spjd 2324168404Spjd ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2325168404Spjd 2326168404Spjd for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2327168404Spjd vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2328168404Spjd 2329168404Spjd ASSERT(vd != NULL); 2330168404Spjd 2331168404Spjd /* 2332168404Spjd * Keep track of how much data we've examined so that 2333168404Spjd * zpool(1M) status can make useful progress reports. 2334168404Spjd */ 2335168404Spjd mutex_enter(&vd->vdev_stat_lock); 2336168404Spjd vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2337168404Spjd mutex_exit(&vd->vdev_stat_lock); 2338168404Spjd 2339168404Spjd if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2340168404Spjd if (DVA_GET_GANG(&dva[d])) { 2341168404Spjd /* 2342168404Spjd * Gang members may be spread across multiple 2343168404Spjd * vdevs, so the best we can do is look at the 2344168404Spjd * pool-wide DTL. 2345168404Spjd * XXX -- it would be better to change our 2346168404Spjd * allocation policy to ensure that this can't 2347168404Spjd * happen. 2348168404Spjd */ 2349168404Spjd vd = spa->spa_root_vdev; 2350168404Spjd } 2351168404Spjd if (vdev_dtl_contains(&vd->vdev_dtl_map, 2352168404Spjd bp->blk_birth, 1)) 2353168404Spjd needs_resilver = B_TRUE; 2354168404Spjd } 2355168404Spjd } 2356168404Spjd 2357168404Spjd if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2358168404Spjd spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2359168404Spjd ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2360168404Spjd else if (needs_resilver) 2361168404Spjd spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2362168404Spjd ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2363168404Spjd 2364168404Spjd return (0); 2365168404Spjd} 2366168404Spjd 2367168404Spjdstatic void 2368168404Spjdspa_scrub_thread(void *arg) 2369168404Spjd{ 2370168404Spjd spa_t *spa = arg; 2371168404Spjd callb_cpr_t cprinfo; 2372168404Spjd traverse_handle_t *th = spa->spa_scrub_th; 2373168404Spjd vdev_t *rvd = spa->spa_root_vdev; 2374168404Spjd pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2375168404Spjd int error = 0; 2376168404Spjd boolean_t complete; 2377168404Spjd 2378168404Spjd CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2379168404Spjd 2380168404Spjd /* 2381168404Spjd * If we're restarting due to a snapshot create/delete, 2382168404Spjd * wait for that to complete. 2383168404Spjd */ 2384168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 2385168404Spjd 2386168404Spjd dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2387168404Spjd scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2388168404Spjd spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2389168404Spjd 2390168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 2391168404Spjd vdev_reopen(rvd); /* purge all vdev caches */ 2392168404Spjd vdev_config_dirty(rvd); /* rewrite all disk labels */ 2393168404Spjd vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2394168404Spjd spa_config_exit(spa, FTAG); 2395168404Spjd 2396168404Spjd mutex_enter(&spa->spa_scrub_lock); 2397168404Spjd spa->spa_scrub_errors = 0; 2398168404Spjd spa->spa_scrub_active = 1; 2399168404Spjd ASSERT(spa->spa_scrub_inflight == 0); 2400168404Spjd 2401168404Spjd while (!spa->spa_scrub_stop) { 2402168404Spjd CALLB_CPR_SAFE_BEGIN(&cprinfo); 2403168404Spjd while (spa->spa_scrub_suspended) { 2404168404Spjd spa->spa_scrub_active = 0; 2405168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2406168404Spjd cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2407168404Spjd spa->spa_scrub_active = 1; 2408168404Spjd } 2409168404Spjd CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2410168404Spjd 2411168404Spjd if (spa->spa_scrub_restart_txg != 0) 2412168404Spjd break; 2413168404Spjd 2414168404Spjd mutex_exit(&spa->spa_scrub_lock); 2415168404Spjd error = traverse_more(th); 2416168404Spjd mutex_enter(&spa->spa_scrub_lock); 2417168404Spjd if (error != EAGAIN) 2418168404Spjd break; 2419168404Spjd } 2420168404Spjd 2421168404Spjd while (spa->spa_scrub_inflight) 2422168404Spjd cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2423168404Spjd 2424168404Spjd spa->spa_scrub_active = 0; 2425168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2426168404Spjd 2427168404Spjd mutex_exit(&spa->spa_scrub_lock); 2428168404Spjd 2429168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 2430168404Spjd 2431168404Spjd mutex_enter(&spa->spa_scrub_lock); 2432168404Spjd 2433168404Spjd /* 2434168404Spjd * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2435168404Spjd * AND the spa config lock to synchronize with any config changes 2436168404Spjd * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2437168404Spjd */ 2438168404Spjd if (spa->spa_scrub_restart_txg != 0) 2439168404Spjd error = ERESTART; 2440168404Spjd 2441168404Spjd if (spa->spa_scrub_stop) 2442168404Spjd error = EINTR; 2443168404Spjd 2444168404Spjd /* 2445168404Spjd * Even if there were uncorrectable errors, we consider the scrub 2446168404Spjd * completed. The downside is that if there is a transient error during 2447168404Spjd * a resilver, we won't resilver the data properly to the target. But 2448168404Spjd * if the damage is permanent (more likely) we will resilver forever, 2449168404Spjd * which isn't really acceptable. Since there is enough information for 2450168404Spjd * the user to know what has failed and why, this seems like a more 2451168404Spjd * tractable approach. 2452168404Spjd */ 2453168404Spjd complete = (error == 0); 2454168404Spjd 2455168404Spjd dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2456168404Spjd scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2457168404Spjd spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2458168404Spjd error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2459168404Spjd 2460168404Spjd mutex_exit(&spa->spa_scrub_lock); 2461168404Spjd 2462168404Spjd /* 2463168404Spjd * If the scrub/resilver completed, update all DTLs to reflect this. 2464168404Spjd * Whether it succeeded or not, vacate all temporary scrub DTLs. 2465168404Spjd */ 2466168404Spjd vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2467168404Spjd complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2468168404Spjd vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2469168404Spjd spa_errlog_rotate(spa); 2470168404Spjd 2471168404Spjd spa_config_exit(spa, FTAG); 2472168404Spjd 2473168404Spjd mutex_enter(&spa->spa_scrub_lock); 2474168404Spjd 2475168404Spjd /* 2476168404Spjd * We may have finished replacing a device. 2477168404Spjd * Let the async thread assess this and handle the detach. 2478168404Spjd */ 2479168404Spjd spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2480168404Spjd 2481168404Spjd /* 2482168404Spjd * If we were told to restart, our final act is to start a new scrub. 2483168404Spjd */ 2484168404Spjd if (error == ERESTART) 2485168404Spjd spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 2486168404Spjd SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2487168404Spjd 2488168404Spjd spa->spa_scrub_type = POOL_SCRUB_NONE; 2489168404Spjd spa->spa_scrub_active = 0; 2490168404Spjd spa->spa_scrub_thread = NULL; 2491168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2492168404Spjd CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2493168404Spjd thread_exit(); 2494168404Spjd} 2495168404Spjd 2496168404Spjdvoid 2497168404Spjdspa_scrub_suspend(spa_t *spa) 2498168404Spjd{ 2499168404Spjd mutex_enter(&spa->spa_scrub_lock); 2500168404Spjd spa->spa_scrub_suspended++; 2501168404Spjd while (spa->spa_scrub_active) { 2502168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2503168404Spjd cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2504168404Spjd } 2505168404Spjd while (spa->spa_scrub_inflight) 2506168404Spjd cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2507168404Spjd mutex_exit(&spa->spa_scrub_lock); 2508168404Spjd} 2509168404Spjd 2510168404Spjdvoid 2511168404Spjdspa_scrub_resume(spa_t *spa) 2512168404Spjd{ 2513168404Spjd mutex_enter(&spa->spa_scrub_lock); 2514168404Spjd ASSERT(spa->spa_scrub_suspended != 0); 2515168404Spjd if (--spa->spa_scrub_suspended == 0) 2516168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2517168404Spjd mutex_exit(&spa->spa_scrub_lock); 2518168404Spjd} 2519168404Spjd 2520168404Spjdvoid 2521168404Spjdspa_scrub_restart(spa_t *spa, uint64_t txg) 2522168404Spjd{ 2523168404Spjd /* 2524168404Spjd * Something happened (e.g. snapshot create/delete) that means 2525168404Spjd * we must restart any in-progress scrubs. The itinerary will 2526168404Spjd * fix this properly. 2527168404Spjd */ 2528168404Spjd mutex_enter(&spa->spa_scrub_lock); 2529168404Spjd spa->spa_scrub_restart_txg = txg; 2530168404Spjd mutex_exit(&spa->spa_scrub_lock); 2531168404Spjd} 2532168404Spjd 2533168404Spjdint 2534168404Spjdspa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2535168404Spjd{ 2536168404Spjd space_seg_t *ss; 2537168404Spjd uint64_t mintxg, maxtxg; 2538168404Spjd vdev_t *rvd = spa->spa_root_vdev; 2539168404Spjd 2540168404Spjd if ((uint_t)type >= POOL_SCRUB_TYPES) 2541168404Spjd return (ENOTSUP); 2542168404Spjd 2543168404Spjd mutex_enter(&spa->spa_scrub_lock); 2544168404Spjd 2545168404Spjd /* 2546168404Spjd * If there's a scrub or resilver already in progress, stop it. 2547168404Spjd */ 2548168404Spjd while (spa->spa_scrub_thread != NULL) { 2549168404Spjd /* 2550168404Spjd * Don't stop a resilver unless forced. 2551168404Spjd */ 2552168404Spjd if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2553168404Spjd mutex_exit(&spa->spa_scrub_lock); 2554168404Spjd return (EBUSY); 2555168404Spjd } 2556168404Spjd spa->spa_scrub_stop = 1; 2557168404Spjd cv_broadcast(&spa->spa_scrub_cv); 2558168404Spjd cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2559168404Spjd } 2560168404Spjd 2561168404Spjd /* 2562168404Spjd * Terminate the previous traverse. 2563168404Spjd */ 2564168404Spjd if (spa->spa_scrub_th != NULL) { 2565168404Spjd traverse_fini(spa->spa_scrub_th); 2566168404Spjd spa->spa_scrub_th = NULL; 2567168404Spjd } 2568168404Spjd 2569168404Spjd if (rvd == NULL) { 2570168404Spjd ASSERT(spa->spa_scrub_stop == 0); 2571168404Spjd ASSERT(spa->spa_scrub_type == type); 2572168404Spjd ASSERT(spa->spa_scrub_restart_txg == 0); 2573168404Spjd mutex_exit(&spa->spa_scrub_lock); 2574168404Spjd return (0); 2575168404Spjd } 2576168404Spjd 2577168404Spjd mintxg = TXG_INITIAL - 1; 2578168404Spjd maxtxg = spa_last_synced_txg(spa) + 1; 2579168404Spjd 2580168404Spjd mutex_enter(&rvd->vdev_dtl_lock); 2581168404Spjd 2582168404Spjd if (rvd->vdev_dtl_map.sm_space == 0) { 2583168404Spjd /* 2584168404Spjd * The pool-wide DTL is empty. 2585168404Spjd * If this is a resilver, there's nothing to do except 2586168404Spjd * check whether any in-progress replacements have completed. 2587168404Spjd */ 2588168404Spjd if (type == POOL_SCRUB_RESILVER) { 2589168404Spjd type = POOL_SCRUB_NONE; 2590168404Spjd spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2591168404Spjd } 2592168404Spjd } else { 2593168404Spjd /* 2594168404Spjd * The pool-wide DTL is non-empty. 2595168404Spjd * If this is a normal scrub, upgrade to a resilver instead. 2596168404Spjd */ 2597168404Spjd if (type == POOL_SCRUB_EVERYTHING) 2598168404Spjd type = POOL_SCRUB_RESILVER; 2599168404Spjd } 2600168404Spjd 2601168404Spjd if (type == POOL_SCRUB_RESILVER) { 2602168404Spjd /* 2603168404Spjd * Determine the resilvering boundaries. 2604168404Spjd * 2605168404Spjd * Note: (mintxg, maxtxg) is an open interval, 2606168404Spjd * i.e. mintxg and maxtxg themselves are not included. 2607168404Spjd * 2608168404Spjd * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2609168404Spjd * so we don't claim to resilver a txg that's still changing. 2610168404Spjd */ 2611168404Spjd ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2612168404Spjd mintxg = ss->ss_start - 1; 2613168404Spjd ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2614168404Spjd maxtxg = MIN(ss->ss_end, maxtxg); 2615168404Spjd } 2616168404Spjd 2617168404Spjd mutex_exit(&rvd->vdev_dtl_lock); 2618168404Spjd 2619168404Spjd spa->spa_scrub_stop = 0; 2620168404Spjd spa->spa_scrub_type = type; 2621168404Spjd spa->spa_scrub_restart_txg = 0; 2622168404Spjd 2623168404Spjd if (type != POOL_SCRUB_NONE) { 2624168404Spjd spa->spa_scrub_mintxg = mintxg; 2625168404Spjd spa->spa_scrub_maxtxg = maxtxg; 2626168404Spjd spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2627168404Spjd ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2628168404Spjd ZIO_FLAG_CANFAIL); 2629168404Spjd traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2630168404Spjd spa->spa_scrub_thread = thread_create(NULL, 0, 2631168404Spjd spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2632168404Spjd } 2633168404Spjd 2634168404Spjd mutex_exit(&spa->spa_scrub_lock); 2635168404Spjd 2636168404Spjd return (0); 2637168404Spjd} 2638168404Spjd 2639168404Spjd/* 2640168404Spjd * ========================================================================== 2641168404Spjd * SPA async task processing 2642168404Spjd * ========================================================================== 2643168404Spjd */ 2644168404Spjd 2645168404Spjdstatic void 2646168404Spjdspa_async_reopen(spa_t *spa) 2647168404Spjd{ 2648168404Spjd vdev_t *rvd = spa->spa_root_vdev; 2649168404Spjd vdev_t *tvd; 2650168404Spjd int c; 2651168404Spjd 2652168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 2653168404Spjd 2654168404Spjd for (c = 0; c < rvd->vdev_children; c++) { 2655168404Spjd tvd = rvd->vdev_child[c]; 2656168404Spjd if (tvd->vdev_reopen_wanted) { 2657168404Spjd tvd->vdev_reopen_wanted = 0; 2658168404Spjd vdev_reopen(tvd); 2659168404Spjd } 2660168404Spjd } 2661168404Spjd 2662168404Spjd spa_config_exit(spa, FTAG); 2663168404Spjd} 2664168404Spjd 2665168404Spjdstatic void 2666168404Spjdspa_async_thread(void *arg) 2667168404Spjd{ 2668168404Spjd spa_t *spa = arg; 2669168404Spjd int tasks; 2670168404Spjd 2671168404Spjd ASSERT(spa->spa_sync_on); 2672168404Spjd 2673168404Spjd mutex_enter(&spa->spa_async_lock); 2674168404Spjd tasks = spa->spa_async_tasks; 2675168404Spjd spa->spa_async_tasks = 0; 2676168404Spjd mutex_exit(&spa->spa_async_lock); 2677168404Spjd 2678168404Spjd /* 2679168404Spjd * See if the config needs to be updated. 2680168404Spjd */ 2681168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2682168404Spjd mutex_enter(&spa_namespace_lock); 2683168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2684168404Spjd mutex_exit(&spa_namespace_lock); 2685168404Spjd } 2686168404Spjd 2687168404Spjd /* 2688168404Spjd * See if any devices need to be reopened. 2689168404Spjd */ 2690168404Spjd if (tasks & SPA_ASYNC_REOPEN) 2691168404Spjd spa_async_reopen(spa); 2692168404Spjd 2693168404Spjd /* 2694168404Spjd * If any devices are done replacing, detach them. 2695168404Spjd */ 2696168404Spjd if (tasks & SPA_ASYNC_REPLACE_DONE) 2697168404Spjd spa_vdev_replace_done(spa); 2698168404Spjd 2699168404Spjd /* 2700168404Spjd * Kick off a scrub. 2701168404Spjd */ 2702168404Spjd if (tasks & SPA_ASYNC_SCRUB) 2703168404Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2704168404Spjd 2705168404Spjd /* 2706168404Spjd * Kick off a resilver. 2707168404Spjd */ 2708168404Spjd if (tasks & SPA_ASYNC_RESILVER) 2709168404Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2710168404Spjd 2711168404Spjd /* 2712168404Spjd * Let the world know that we're done. 2713168404Spjd */ 2714168404Spjd mutex_enter(&spa->spa_async_lock); 2715168404Spjd spa->spa_async_thread = NULL; 2716168404Spjd cv_broadcast(&spa->spa_async_cv); 2717168404Spjd mutex_exit(&spa->spa_async_lock); 2718168404Spjd thread_exit(); 2719168404Spjd} 2720168404Spjd 2721168404Spjdvoid 2722168404Spjdspa_async_suspend(spa_t *spa) 2723168404Spjd{ 2724168404Spjd mutex_enter(&spa->spa_async_lock); 2725168404Spjd spa->spa_async_suspended++; 2726168404Spjd while (spa->spa_async_thread != NULL) 2727168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2728168404Spjd mutex_exit(&spa->spa_async_lock); 2729168404Spjd} 2730168404Spjd 2731168404Spjdvoid 2732168404Spjdspa_async_resume(spa_t *spa) 2733168404Spjd{ 2734168404Spjd mutex_enter(&spa->spa_async_lock); 2735168404Spjd ASSERT(spa->spa_async_suspended != 0); 2736168404Spjd spa->spa_async_suspended--; 2737168404Spjd mutex_exit(&spa->spa_async_lock); 2738168404Spjd} 2739168404Spjd 2740168404Spjdstatic void 2741168404Spjdspa_async_dispatch(spa_t *spa) 2742168404Spjd{ 2743168404Spjd mutex_enter(&spa->spa_async_lock); 2744168404Spjd if (spa->spa_async_tasks && !spa->spa_async_suspended && 2745168404Spjd spa->spa_async_thread == NULL && 2746168404Spjd rootdir != NULL && !vn_is_readonly(rootdir)) 2747168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 2748168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2749168404Spjd mutex_exit(&spa->spa_async_lock); 2750168404Spjd} 2751168404Spjd 2752168404Spjdvoid 2753168404Spjdspa_async_request(spa_t *spa, int task) 2754168404Spjd{ 2755168404Spjd mutex_enter(&spa->spa_async_lock); 2756168404Spjd spa->spa_async_tasks |= task; 2757168404Spjd mutex_exit(&spa->spa_async_lock); 2758168404Spjd} 2759168404Spjd 2760168404Spjd/* 2761168404Spjd * ========================================================================== 2762168404Spjd * SPA syncing routines 2763168404Spjd * ========================================================================== 2764168404Spjd */ 2765168404Spjd 2766168404Spjdstatic void 2767168404Spjdspa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2768168404Spjd{ 2769168404Spjd bplist_t *bpl = &spa->spa_sync_bplist; 2770168404Spjd dmu_tx_t *tx; 2771168404Spjd blkptr_t blk; 2772168404Spjd uint64_t itor = 0; 2773168404Spjd zio_t *zio; 2774168404Spjd int error; 2775168404Spjd uint8_t c = 1; 2776168404Spjd 2777168404Spjd zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2778168404Spjd 2779168404Spjd while (bplist_iterate(bpl, &itor, &blk) == 0) 2780168404Spjd zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2781168404Spjd 2782168404Spjd error = zio_wait(zio); 2783168404Spjd ASSERT3U(error, ==, 0); 2784168404Spjd 2785168404Spjd tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2786168404Spjd bplist_vacate(bpl, tx); 2787168404Spjd 2788168404Spjd /* 2789168404Spjd * Pre-dirty the first block so we sync to convergence faster. 2790168404Spjd * (Usually only the first block is needed.) 2791168404Spjd */ 2792168404Spjd dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2793168404Spjd dmu_tx_commit(tx); 2794168404Spjd} 2795168404Spjd 2796168404Spjdstatic void 2797168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2798168404Spjd{ 2799168404Spjd char *packed = NULL; 2800168404Spjd size_t nvsize = 0; 2801168404Spjd dmu_buf_t *db; 2802168404Spjd 2803168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2804168404Spjd 2805168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 2806168404Spjd 2807168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2808168404Spjd KM_SLEEP) == 0); 2809168404Spjd 2810168404Spjd dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2811168404Spjd 2812168404Spjd kmem_free(packed, nvsize); 2813168404Spjd 2814168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2815168404Spjd dmu_buf_will_dirty(db, tx); 2816168404Spjd *(uint64_t *)db->db_data = nvsize; 2817168404Spjd dmu_buf_rele(db, FTAG); 2818168404Spjd} 2819168404Spjd 2820168404Spjdstatic void 2821168404Spjdspa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2822168404Spjd{ 2823168404Spjd nvlist_t *nvroot; 2824168404Spjd nvlist_t **spares; 2825168404Spjd int i; 2826168404Spjd 2827168404Spjd if (!spa->spa_sync_spares) 2828168404Spjd return; 2829168404Spjd 2830168404Spjd /* 2831168404Spjd * Update the MOS nvlist describing the list of available spares. 2832168404Spjd * spa_validate_spares() will have already made sure this nvlist is 2833168404Spjd * valid and the vdevs are labelled appropriately. 2834168404Spjd */ 2835168404Spjd if (spa->spa_spares_object == 0) { 2836168404Spjd spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2837168404Spjd DMU_OT_PACKED_NVLIST, 1 << 14, 2838168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2839168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 2840168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2841168404Spjd sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2842168404Spjd } 2843168404Spjd 2844168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2845168404Spjd if (spa->spa_nspares == 0) { 2846168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2847168404Spjd NULL, 0) == 0); 2848168404Spjd } else { 2849168404Spjd spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2850168404Spjd KM_SLEEP); 2851168404Spjd for (i = 0; i < spa->spa_nspares; i++) 2852168404Spjd spares[i] = vdev_config_generate(spa, 2853168404Spjd spa->spa_spares[i], B_FALSE, B_TRUE); 2854168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2855168404Spjd spares, spa->spa_nspares) == 0); 2856168404Spjd for (i = 0; i < spa->spa_nspares; i++) 2857168404Spjd nvlist_free(spares[i]); 2858168404Spjd kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2859168404Spjd } 2860168404Spjd 2861168404Spjd spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2862168404Spjd nvlist_free(nvroot); 2863168404Spjd 2864168404Spjd spa->spa_sync_spares = B_FALSE; 2865168404Spjd} 2866168404Spjd 2867168404Spjdstatic void 2868168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2869168404Spjd{ 2870168404Spjd nvlist_t *config; 2871168404Spjd 2872168404Spjd if (list_is_empty(&spa->spa_dirty_list)) 2873168404Spjd return; 2874168404Spjd 2875168404Spjd config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2876168404Spjd 2877168404Spjd if (spa->spa_config_syncing) 2878168404Spjd nvlist_free(spa->spa_config_syncing); 2879168404Spjd spa->spa_config_syncing = config; 2880168404Spjd 2881168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2882168404Spjd} 2883168404Spjd 2884168404Spjdstatic void 2885168404Spjdspa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 2886168404Spjd{ 2887168404Spjd spa_t *spa = arg1; 2888168404Spjd nvlist_t *nvp = arg2; 2889168404Spjd nvpair_t *nvpair; 2890168404Spjd objset_t *mos = spa->spa_meta_objset; 2891168404Spjd uint64_t zapobj; 2892168404Spjd 2893168404Spjd mutex_enter(&spa->spa_props_lock); 2894168404Spjd if (spa->spa_pool_props_object == 0) { 2895168404Spjd zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 2896168404Spjd VERIFY(zapobj > 0); 2897168404Spjd 2898168404Spjd spa->spa_pool_props_object = zapobj; 2899168404Spjd 2900168404Spjd VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 2901168404Spjd DMU_POOL_PROPS, 8, 1, 2902168404Spjd &spa->spa_pool_props_object, tx) == 0); 2903168404Spjd } 2904168404Spjd mutex_exit(&spa->spa_props_lock); 2905168404Spjd 2906168404Spjd nvpair = NULL; 2907168404Spjd while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 2908168404Spjd switch (zpool_name_to_prop(nvpair_name(nvpair))) { 2909168404Spjd case ZFS_PROP_BOOTFS: 2910168404Spjd VERIFY(nvlist_lookup_uint64(nvp, 2911168404Spjd nvpair_name(nvpair), &spa->spa_bootfs) == 0); 2912168404Spjd VERIFY(zap_update(mos, 2913168404Spjd spa->spa_pool_props_object, 2914168404Spjd zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, 2915168404Spjd &spa->spa_bootfs, tx) == 0); 2916168404Spjd break; 2917168404Spjd } 2918168404Spjd } 2919168404Spjd} 2920168404Spjd 2921168404Spjd/* 2922168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 2923168404Spjd * part of the process, so we iterate until it converges. 2924168404Spjd */ 2925168404Spjdvoid 2926168404Spjdspa_sync(spa_t *spa, uint64_t txg) 2927168404Spjd{ 2928168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 2929168404Spjd objset_t *mos = spa->spa_meta_objset; 2930168404Spjd bplist_t *bpl = &spa->spa_sync_bplist; 2931168404Spjd vdev_t *rvd = spa->spa_root_vdev; 2932168404Spjd vdev_t *vd; 2933168404Spjd dmu_tx_t *tx; 2934168404Spjd int dirty_vdevs; 2935168404Spjd 2936168404Spjd /* 2937168404Spjd * Lock out configuration changes. 2938168404Spjd */ 2939168404Spjd spa_config_enter(spa, RW_READER, FTAG); 2940168404Spjd 2941168404Spjd spa->spa_syncing_txg = txg; 2942168404Spjd spa->spa_sync_pass = 0; 2943168404Spjd 2944168404Spjd VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2945168404Spjd 2946168404Spjd tx = dmu_tx_create_assigned(dp, txg); 2947168404Spjd 2948168404Spjd /* 2949168404Spjd * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2950168404Spjd * set spa_deflate if we have no raid-z vdevs. 2951168404Spjd */ 2952168404Spjd if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2953168404Spjd spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2954168404Spjd int i; 2955168404Spjd 2956168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 2957168404Spjd vd = rvd->vdev_child[i]; 2958168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2959168404Spjd break; 2960168404Spjd } 2961168404Spjd if (i == rvd->vdev_children) { 2962168404Spjd spa->spa_deflate = TRUE; 2963168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 2964168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2965168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2966168404Spjd } 2967168404Spjd } 2968168404Spjd 2969168404Spjd /* 2970168404Spjd * If anything has changed in this txg, push the deferred frees 2971168404Spjd * from the previous txg. If not, leave them alone so that we 2972168404Spjd * don't generate work on an otherwise idle system. 2973168404Spjd */ 2974168404Spjd if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2975168404Spjd !txg_list_empty(&dp->dp_dirty_dirs, txg) || 2976168404Spjd !txg_list_empty(&dp->dp_sync_tasks, txg)) 2977168404Spjd spa_sync_deferred_frees(spa, txg); 2978168404Spjd 2979168404Spjd /* 2980168404Spjd * Iterate to convergence. 2981168404Spjd */ 2982168404Spjd do { 2983168404Spjd spa->spa_sync_pass++; 2984168404Spjd 2985168404Spjd spa_sync_config_object(spa, tx); 2986168404Spjd spa_sync_spares(spa, tx); 2987168404Spjd spa_errlog_sync(spa, txg); 2988168404Spjd dsl_pool_sync(dp, txg); 2989168404Spjd 2990168404Spjd dirty_vdevs = 0; 2991168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2992168404Spjd vdev_sync(vd, txg); 2993168404Spjd dirty_vdevs++; 2994168404Spjd } 2995168404Spjd 2996168404Spjd bplist_sync(bpl, tx); 2997168404Spjd } while (dirty_vdevs); 2998168404Spjd 2999168404Spjd bplist_close(bpl); 3000168404Spjd 3001168404Spjd dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 3002168404Spjd 3003168404Spjd /* 3004168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 3005168404Spjd * to commit the transaction group. 3006168404Spjd * 3007168404Spjd * If there are any dirty vdevs, sync the uberblock to all vdevs. 3008168404Spjd * Otherwise, pick a random top-level vdev that's known to be 3009168404Spjd * visible in the config cache (see spa_vdev_add() for details). 3010168404Spjd * If the write fails, try the next vdev until we're tried them all. 3011168404Spjd */ 3012168404Spjd if (!list_is_empty(&spa->spa_dirty_list)) { 3013168404Spjd VERIFY(vdev_config_sync(rvd, txg) == 0); 3014168404Spjd } else { 3015168404Spjd int children = rvd->vdev_children; 3016168404Spjd int c0 = spa_get_random(children); 3017168404Spjd int c; 3018168404Spjd 3019168404Spjd for (c = 0; c < children; c++) { 3020168404Spjd vd = rvd->vdev_child[(c0 + c) % children]; 3021168404Spjd if (vd->vdev_ms_array == 0) 3022168404Spjd continue; 3023168404Spjd if (vdev_config_sync(vd, txg) == 0) 3024168404Spjd break; 3025168404Spjd } 3026168404Spjd if (c == children) 3027168404Spjd VERIFY(vdev_config_sync(rvd, txg) == 0); 3028168404Spjd } 3029168404Spjd 3030168404Spjd dmu_tx_commit(tx); 3031168404Spjd 3032168404Spjd /* 3033168404Spjd * Clear the dirty config list. 3034168404Spjd */ 3035168404Spjd while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 3036168404Spjd vdev_config_clean(vd); 3037168404Spjd 3038168404Spjd /* 3039168404Spjd * Now that the new config has synced transactionally, 3040168404Spjd * let it become visible to the config cache. 3041168404Spjd */ 3042168404Spjd if (spa->spa_config_syncing != NULL) { 3043168404Spjd spa_config_set(spa, spa->spa_config_syncing); 3044168404Spjd spa->spa_config_txg = txg; 3045168404Spjd spa->spa_config_syncing = NULL; 3046168404Spjd } 3047168404Spjd 3048168404Spjd /* 3049168404Spjd * Make a stable copy of the fully synced uberblock. 3050168404Spjd * We use this as the root for pool traversals. 3051168404Spjd */ 3052168404Spjd spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3053168404Spjd 3054168404Spjd spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3055168404Spjd 3056168404Spjd rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3057168404Spjd spa->spa_traverse_wanted = 0; 3058168404Spjd spa->spa_ubsync = spa->spa_uberblock; 3059168404Spjd rw_exit(&spa->spa_traverse_lock); 3060168404Spjd 3061168404Spjd spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3062168404Spjd 3063168404Spjd /* 3064168404Spjd * Clean up the ZIL records for the synced txg. 3065168404Spjd */ 3066168404Spjd dsl_pool_zil_clean(dp); 3067168404Spjd 3068168404Spjd /* 3069168404Spjd * Update usable space statistics. 3070168404Spjd */ 3071168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3072168404Spjd vdev_sync_done(vd, txg); 3073168404Spjd 3074168404Spjd /* 3075168404Spjd * It had better be the case that we didn't dirty anything 3076168404Spjd * since vdev_config_sync(). 3077168404Spjd */ 3078168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3079168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3080168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3081168404Spjd ASSERT(bpl->bpl_queue == NULL); 3082168404Spjd 3083168404Spjd spa_config_exit(spa, FTAG); 3084168404Spjd 3085168404Spjd /* 3086168404Spjd * If any async tasks have been requested, kick them off. 3087168404Spjd */ 3088168404Spjd spa_async_dispatch(spa); 3089168404Spjd} 3090168404Spjd 3091168404Spjd/* 3092168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 3093168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 3094168404Spjd * sync. 3095168404Spjd */ 3096168404Spjdvoid 3097168404Spjdspa_sync_allpools(void) 3098168404Spjd{ 3099168404Spjd spa_t *spa = NULL; 3100168404Spjd mutex_enter(&spa_namespace_lock); 3101168404Spjd while ((spa = spa_next(spa)) != NULL) { 3102168404Spjd if (spa_state(spa) != POOL_STATE_ACTIVE) 3103168404Spjd continue; 3104168404Spjd spa_open_ref(spa, FTAG); 3105168404Spjd mutex_exit(&spa_namespace_lock); 3106168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 3107168404Spjd mutex_enter(&spa_namespace_lock); 3108168404Spjd spa_close(spa, FTAG); 3109168404Spjd } 3110168404Spjd mutex_exit(&spa_namespace_lock); 3111168404Spjd} 3112168404Spjd 3113168404Spjd/* 3114168404Spjd * ========================================================================== 3115168404Spjd * Miscellaneous routines 3116168404Spjd * ========================================================================== 3117168404Spjd */ 3118168404Spjd 3119168404Spjd/* 3120168404Spjd * Remove all pools in the system. 3121168404Spjd */ 3122168404Spjdvoid 3123168404Spjdspa_evict_all(void) 3124168404Spjd{ 3125168404Spjd spa_t *spa; 3126168404Spjd 3127168404Spjd /* 3128168404Spjd * Remove all cached state. All pools should be closed now, 3129168404Spjd * so every spa in the AVL tree should be unreferenced. 3130168404Spjd */ 3131168404Spjd mutex_enter(&spa_namespace_lock); 3132168404Spjd while ((spa = spa_next(NULL)) != NULL) { 3133168404Spjd /* 3134168404Spjd * Stop async tasks. The async thread may need to detach 3135168404Spjd * a device that's been replaced, which requires grabbing 3136168404Spjd * spa_namespace_lock, so we must drop it here. 3137168404Spjd */ 3138168404Spjd spa_open_ref(spa, FTAG); 3139168404Spjd mutex_exit(&spa_namespace_lock); 3140168404Spjd spa_async_suspend(spa); 3141168404Spjd VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3142168404Spjd mutex_enter(&spa_namespace_lock); 3143168404Spjd spa_close(spa, FTAG); 3144168404Spjd 3145168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3146168404Spjd spa_unload(spa); 3147168404Spjd spa_deactivate(spa); 3148168404Spjd } 3149168404Spjd spa_remove(spa); 3150168404Spjd } 3151168404Spjd mutex_exit(&spa_namespace_lock); 3152168404Spjd} 3153168404Spjd 3154168404Spjdvdev_t * 3155168404Spjdspa_lookup_by_guid(spa_t *spa, uint64_t guid) 3156168404Spjd{ 3157168404Spjd return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3158168404Spjd} 3159168404Spjd 3160168404Spjdvoid 3161168404Spjdspa_upgrade(spa_t *spa) 3162168404Spjd{ 3163168404Spjd spa_config_enter(spa, RW_WRITER, FTAG); 3164168404Spjd 3165168404Spjd /* 3166168404Spjd * This should only be called for a non-faulted pool, and since a 3167168404Spjd * future version would result in an unopenable pool, this shouldn't be 3168168404Spjd * possible. 3169168404Spjd */ 3170168404Spjd ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3171168404Spjd 3172168404Spjd spa->spa_uberblock.ub_version = ZFS_VERSION; 3173168404Spjd vdev_config_dirty(spa->spa_root_vdev); 3174168404Spjd 3175168404Spjd spa_config_exit(spa, FTAG); 3176168404Spjd 3177168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 3178168404Spjd} 3179168404Spjd 3180168404Spjdboolean_t 3181168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 3182168404Spjd{ 3183168404Spjd int i; 3184168404Spjd uint64_t spareguid; 3185168404Spjd 3186168404Spjd for (i = 0; i < spa->spa_nspares; i++) 3187168404Spjd if (spa->spa_spares[i]->vdev_guid == guid) 3188168404Spjd return (B_TRUE); 3189168404Spjd 3190168404Spjd for (i = 0; i < spa->spa_pending_nspares; i++) { 3191168404Spjd if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3192168404Spjd ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3193168404Spjd spareguid == guid) 3194168404Spjd return (B_TRUE); 3195168404Spjd } 3196168404Spjd 3197168404Spjd return (B_FALSE); 3198168404Spjd} 3199168404Spjd 3200168404Spjdint 3201168404Spjdspa_set_props(spa_t *spa, nvlist_t *nvp) 3202168404Spjd{ 3203168404Spjd return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 3204168404Spjd spa, nvp, 3)); 3205168404Spjd} 3206168404Spjd 3207168404Spjdint 3208168404Spjdspa_get_props(spa_t *spa, nvlist_t **nvp) 3209168404Spjd{ 3210168404Spjd zap_cursor_t zc; 3211168404Spjd zap_attribute_t za; 3212168404Spjd objset_t *mos = spa->spa_meta_objset; 3213168404Spjd zfs_source_t src; 3214168404Spjd zfs_prop_t prop; 3215168404Spjd nvlist_t *propval; 3216168404Spjd uint64_t value; 3217168404Spjd int err; 3218168404Spjd 3219168404Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3220168404Spjd 3221168404Spjd mutex_enter(&spa->spa_props_lock); 3222168404Spjd /* If no props object, then just return empty nvlist */ 3223168404Spjd if (spa->spa_pool_props_object == 0) { 3224168404Spjd mutex_exit(&spa->spa_props_lock); 3225168404Spjd return (0); 3226168404Spjd } 3227168404Spjd 3228168404Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 3229168404Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 3230168404Spjd zap_cursor_advance(&zc)) { 3231168404Spjd 3232168404Spjd if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 3233168404Spjd continue; 3234168404Spjd 3235168404Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3236168404Spjd switch (za.za_integer_length) { 3237168404Spjd case 8: 3238168404Spjd if (zfs_prop_default_numeric(prop) == 3239168404Spjd za.za_first_integer) 3240168404Spjd src = ZFS_SRC_DEFAULT; 3241168404Spjd else 3242168404Spjd src = ZFS_SRC_LOCAL; 3243168404Spjd value = za.za_first_integer; 3244168404Spjd 3245168404Spjd if (prop == ZFS_PROP_BOOTFS) { 3246168404Spjd dsl_pool_t *dp; 3247168404Spjd dsl_dataset_t *ds = NULL; 3248168404Spjd char strval[MAXPATHLEN]; 3249168404Spjd 3250168404Spjd dp = spa_get_dsl(spa); 3251168404Spjd rw_enter(&dp->dp_config_rwlock, RW_READER); 3252168404Spjd if ((err = dsl_dataset_open_obj(dp, 3253168404Spjd za.za_first_integer, NULL, DS_MODE_NONE, 3254168404Spjd FTAG, &ds)) != 0) { 3255168404Spjd rw_exit(&dp->dp_config_rwlock); 3256168404Spjd break; 3257168404Spjd } 3258168404Spjd dsl_dataset_name(ds, strval); 3259168404Spjd dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 3260168404Spjd rw_exit(&dp->dp_config_rwlock); 3261168404Spjd 3262168404Spjd VERIFY(nvlist_add_uint64(propval, 3263168404Spjd ZFS_PROP_SOURCE, src) == 0); 3264168404Spjd VERIFY(nvlist_add_string(propval, 3265168404Spjd ZFS_PROP_VALUE, strval) == 0); 3266168404Spjd } else { 3267168404Spjd VERIFY(nvlist_add_uint64(propval, 3268168404Spjd ZFS_PROP_SOURCE, src) == 0); 3269168404Spjd VERIFY(nvlist_add_uint64(propval, 3270168404Spjd ZFS_PROP_VALUE, value) == 0); 3271168404Spjd } 3272168404Spjd VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 3273168404Spjd propval) == 0); 3274168404Spjd break; 3275168404Spjd } 3276168404Spjd nvlist_free(propval); 3277168404Spjd } 3278168404Spjd zap_cursor_fini(&zc); 3279168404Spjd mutex_exit(&spa->spa_props_lock); 3280168404Spjd if (err && err != ENOENT) { 3281168404Spjd nvlist_free(*nvp); 3282168404Spjd return (err); 3283168404Spjd } 3284168404Spjd 3285168404Spjd return (0); 3286168404Spjd} 3287168404Spjd 3288168404Spjd/* 3289168404Spjd * If the bootfs property value is dsobj, clear it. 3290168404Spjd */ 3291168404Spjdvoid 3292168404Spjdspa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 3293168404Spjd{ 3294168404Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 3295168404Spjd VERIFY(zap_remove(spa->spa_meta_objset, 3296168404Spjd spa->spa_pool_props_object, 3297168404Spjd zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); 3298168404Spjd spa->spa_bootfs = 0; 3299168404Spjd } 3300168404Spjd} 3301