vdev.c revision 288569
1169695Skan/* 2169695Skan * CDDL HEADER START 3169695Skan * 4169695Skan * The contents of this file are subject to the terms of the 5169695Skan * Common Development and Distribution License (the "License"). 6169695Skan * You may not use this file except in compliance with the License. 7169695Skan * 8169695Skan * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9169695Skan * or http://www.opensolaris.org/os/licensing. 10169695Skan * See the License for the specific language governing permissions 11169695Skan * and limitations under the License. 12169695Skan * 13169695Skan * When distributing Covered Code, include this CDDL HEADER in each 14169695Skan * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15169695Skan * If applicable, add the following below this CDDL HEADER, with the 16169695Skan * fields enclosed by brackets "[]" replaced with your own identifying 17169695Skan * information: Portions Copyright [yyyy] [name of copyright owner] 18169695Skan * 19169695Skan * CDDL HEADER END 20169695Skan */ 21169695Skan 22169695Skan/* 23169695Skan * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24169695Skan * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 25169695Skan * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 26169695Skan * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 
27169695Skan */ 28169695Skan 29169695Skan#include <sys/zfs_context.h> 30169695Skan#include <sys/fm/fs/zfs.h> 31169695Skan#include <sys/spa.h> 32169695Skan#include <sys/spa_impl.h> 33169695Skan#include <sys/dmu.h> 34169695Skan#include <sys/dmu_tx.h> 35169695Skan#include <sys/vdev_impl.h> 36169695Skan#include <sys/uberblock_impl.h> 37169695Skan#include <sys/metaslab.h> 38169695Skan#include <sys/metaslab_impl.h> 39169695Skan#include <sys/space_map.h> 40169695Skan#include <sys/space_reftree.h> 41169695Skan#include <sys/zio.h> 42169695Skan#include <sys/zap.h> 43169695Skan#include <sys/fs/zfs.h> 44169695Skan#include <sys/arc.h> 45169695Skan#include <sys/zil.h> 46169695Skan#include <sys/dsl_scan.h> 47169695Skan#include <sys/trim_map.h> 48169695Skan 49169695SkanSYSCTL_DECL(_vfs_zfs); 50169695SkanSYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); 51169695Skan 52169695Skan/* 53169695Skan * Virtual device management. 54169695Skan */ 55169695Skan 56169695Skan/* 57169695Skan * The limit for ZFS to automatically increase a top-level vdev's ashift 58169695Skan * from logical ashift to physical ashift. 59169695Skan * 60169695Skan * Example: one or more 512B emulation child vdevs 61169695Skan * child->vdev_ashift = 9 (512 bytes) 62169695Skan * child->vdev_physical_ashift = 12 (4096 bytes) 63169695Skan * zfs_max_auto_ashift = 11 (2048 bytes) 64169695Skan * zfs_min_auto_ashift = 9 (512 bytes) 65169695Skan * 66169695Skan * On pool creation or the addition of a new top-level vdev, ZFS will 67169695Skan * increase the ashift of the top-level vdev to 2048 as limited by 68169695Skan * zfs_max_auto_ashift. 
69169695Skan * 70169695Skan * Example: one or more 512B emulation child vdevs 71169695Skan * child->vdev_ashift = 9 (512 bytes) 72169695Skan * child->vdev_physical_ashift = 12 (4096 bytes) 73169695Skan * zfs_max_auto_ashift = 13 (8192 bytes) 74169695Skan * zfs_min_auto_ashift = 9 (512 bytes) 75169695Skan * 76169695Skan * On pool creation or the addition of a new top-level vdev, ZFS will 77169695Skan * increase the ashift of the top-level vdev to 4096 to match the 78169695Skan * max vdev_physical_ashift. 79169695Skan * 80169695Skan * Example: one or more 512B emulation child vdevs 81169695Skan * child->vdev_ashift = 9 (512 bytes) 82169695Skan * child->vdev_physical_ashift = 9 (512 bytes) 83169695Skan * zfs_max_auto_ashift = 13 (8192 bytes) 84169695Skan * zfs_min_auto_ashift = 12 (4096 bytes) 85169695Skan * 86169695Skan * On pool creation or the addition of a new top-level vdev, ZFS will 87169695Skan * increase the ashift of the top-level vdev to 4096 to match the 88169695Skan * zfs_min_auto_ashift. 
89169695Skan */ 90169695Skanstatic uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; 91169695Skanstatic uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; 92169695Skan 93169695Skanstatic int 94169695Skansysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) 95169695Skan{ 96169695Skan uint64_t val; 97169695Skan int err; 98169695Skan 99169695Skan val = zfs_max_auto_ashift; 100169695Skan err = sysctl_handle_64(oidp, &val, 0, req); 101169695Skan if (err != 0 || req->newptr == NULL) 102169695Skan return (err); 103169695Skan 104169695Skan if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift) 105169695Skan return (EINVAL); 106169695Skan 107169695Skan zfs_max_auto_ashift = val; 108169695Skan 109169695Skan return (0); 110169695Skan} 111169695SkanSYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, 112169695Skan CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 113169695Skan sysctl_vfs_zfs_max_auto_ashift, "QU", 114169695Skan "Max ashift used when optimising for logical -> physical sectors size on " 115169695Skan "new top-level vdevs."); 116169695Skan 117169695Skanstatic int 118169695Skansysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) 119169695Skan{ 120169695Skan uint64_t val; 121169695Skan int err; 122169695Skan 123169695Skan val = zfs_min_auto_ashift; 124169695Skan err = sysctl_handle_64(oidp, &val, 0, req); 125169695Skan if (err != 0 || req->newptr == NULL) 126169695Skan return (err); 127169695Skan 128169695Skan if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift) 129169695Skan return (EINVAL); 130169695Skan 131169695Skan zfs_min_auto_ashift = val; 132169695Skan 133169695Skan return (0); 134169695Skan} 135169695SkanSYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, 136169695Skan CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 137169695Skan sysctl_vfs_zfs_min_auto_ashift, "QU", 138169695Skan "Min ashift used when creating new top-level vdevs."); 139169695Skan 140169695Skanstatic vdev_ops_t *vdev_ops_table[] = { 141169695Skan &vdev_root_ops, 142169695Skan 
&vdev_raidz_ops, 143169695Skan &vdev_mirror_ops, 144169695Skan &vdev_replacing_ops, 145169695Skan &vdev_spare_ops, 146169695Skan#ifdef _KERNEL 147169695Skan &vdev_geom_ops, 148169695Skan#else 149169695Skan &vdev_disk_ops, 150169695Skan#endif 151169695Skan &vdev_file_ops, 152169695Skan &vdev_missing_ops, 153169695Skan &vdev_hole_ops, 154169695Skan NULL 155169695Skan}; 156169695Skan 157169695Skan 158169695Skan/* 159169695Skan * When a vdev is added, it will be divided into approximately (but no 160169695Skan * more than) this number of metaslabs. 161169695Skan */ 162169695Skanint metaslabs_per_vdev = 200; 163169695SkanSYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN, 164169695Skan &metaslabs_per_vdev, 0, 165169695Skan "When a vdev is added, how many metaslabs the vdev should be divided into"); 166169695Skan 167169695Skan/* 168169695Skan * Given a vdev type, return the appropriate ops vector. 169169695Skan */ 170169695Skanstatic vdev_ops_t * 171169695Skanvdev_getops(const char *type) 172169695Skan{ 173169695Skan vdev_ops_t *ops, **opspp; 174169695Skan 175169695Skan for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 176169695Skan if (strcmp(ops->vdev_op_type, type) == 0) 177169695Skan break; 178169695Skan 179169695Skan return (ops); 180169695Skan} 181169695Skan 182169695Skan/* 183169695Skan * Default asize function: return the MAX of psize with the asize of 184169695Skan * all children. This is what's used by anything other than RAID-Z. 
185169695Skan */ 186169695Skanuint64_t 187169695Skanvdev_default_asize(vdev_t *vd, uint64_t psize) 188169695Skan{ 189169695Skan uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 190169695Skan uint64_t csize; 191169695Skan 192169695Skan for (int c = 0; c < vd->vdev_children; c++) { 193169695Skan csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 194169695Skan asize = MAX(asize, csize); 195169695Skan } 196169695Skan 197169695Skan return (asize); 198169695Skan} 199169695Skan 200169695Skan/* 201169695Skan * Get the minimum allocatable size. We define the allocatable size as 202169695Skan * the vdev's asize rounded to the nearest metaslab. This allows us to 203169695Skan * replace or attach devices which don't have the same physical size but 204169695Skan * can still satisfy the same number of allocations. 205169695Skan */ 206169695Skanuint64_t 207169695Skanvdev_get_min_asize(vdev_t *vd) 208169695Skan{ 209169695Skan vdev_t *pvd = vd->vdev_parent; 210169695Skan 211169695Skan /* 212169695Skan * If our parent is NULL (inactive spare or cache) or is the root, 213169695Skan * just return our own asize. 214169695Skan */ 215169695Skan if (pvd == NULL) 216169695Skan return (vd->vdev_asize); 217169695Skan 218169695Skan /* 219169695Skan * The top-level vdev just returns the allocatable size rounded 220169695Skan * to the nearest metaslab. 221169695Skan */ 222169695Skan if (vd == vd->vdev_top) 223169695Skan return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); 224169695Skan 225169695Skan /* 226169695Skan * The allocatable space for a raidz vdev is N * sizeof(smallest child), 227169695Skan * so each child must provide at least 1/Nth of its asize. 
228169695Skan */ 229169695Skan if (pvd->vdev_ops == &vdev_raidz_ops) 230169695Skan return (pvd->vdev_min_asize / pvd->vdev_children); 231169695Skan 232169695Skan return (pvd->vdev_min_asize); 233169695Skan} 234169695Skan 235169695Skanvoid 236169695Skanvdev_set_min_asize(vdev_t *vd) 237169695Skan{ 238169695Skan vd->vdev_min_asize = vdev_get_min_asize(vd); 239169695Skan 240169695Skan for (int c = 0; c < vd->vdev_children; c++) 241169695Skan vdev_set_min_asize(vd->vdev_child[c]); 242169695Skan} 243169695Skan 244283010Spfgvdev_t * 245283010Spfgvdev_lookup_top(spa_t *spa, uint64_t vdev) 246283010Spfg{ 247283010Spfg vdev_t *rvd = spa->spa_root_vdev; 248283010Spfg 249283010Spfg ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 250283010Spfg 251283010Spfg if (vdev < rvd->vdev_children) { 252283010Spfg ASSERT(rvd->vdev_child[vdev] != NULL); 253283010Spfg return (rvd->vdev_child[vdev]); 254283010Spfg } 255283010Spfg 256283010Spfg return (NULL); 257283010Spfg} 258283010Spfg 259283010Spfgvdev_t * 260283010Spfgvdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 261283010Spfg{ 262283010Spfg vdev_t *mvd; 263283010Spfg 264283010Spfg if (vd->vdev_guid == guid) 265283010Spfg return (vd); 266283010Spfg 267283010Spfg for (int c = 0; c < vd->vdev_children; c++) 268283010Spfg if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 269283010Spfg NULL) 270283010Spfg return (mvd); 271283010Spfg 272283010Spfg return (NULL); 273283010Spfg} 274283010Spfg 275283010Spfgstatic int 276283010Spfgvdev_count_leaves_impl(vdev_t *vd) 277283010Spfg{ 278283010Spfg int n = 0; 279283010Spfg 280283010Spfg if (vd->vdev_ops->vdev_op_leaf) 281283010Spfg return (1); 282283010Spfg 283283010Spfg for (int c = 0; c < vd->vdev_children; c++) 284283010Spfg n += vdev_count_leaves_impl(vd->vdev_child[c]); 285283010Spfg 286283010Spfg return (n); 287283010Spfg} 288283010Spfg 289283010Spfgint 290283010Spfgvdev_count_leaves(spa_t *spa) 291283010Spfg{ 292283010Spfg return (vdev_count_leaves_impl(spa->spa_root_vdev)); 
293283010Spfg} 294283010Spfg 295283010Spfgvoid 296283010Spfgvdev_add_child(vdev_t *pvd, vdev_t *cvd) 297283010Spfg{ 298283010Spfg size_t oldsize, newsize; 299283010Spfg uint64_t id = cvd->vdev_id; 300283010Spfg vdev_t **newchild; 301283010Spfg spa_t *spa = cvd->vdev_spa; 302283010Spfg 303283010Spfg ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 304283010Spfg ASSERT(cvd->vdev_parent == NULL); 305283010Spfg 306283010Spfg cvd->vdev_parent = pvd; 307283010Spfg 308283010Spfg if (pvd == NULL) 309283010Spfg return; 310283010Spfg 311283010Spfg ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 312283010Spfg 313283010Spfg oldsize = pvd->vdev_children * sizeof (vdev_t *); 314283010Spfg pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 315283010Spfg newsize = pvd->vdev_children * sizeof (vdev_t *); 316283010Spfg 317283010Spfg newchild = kmem_zalloc(newsize, KM_SLEEP); 318283010Spfg if (pvd->vdev_child != NULL) { 319283010Spfg bcopy(pvd->vdev_child, newchild, oldsize); 320283010Spfg kmem_free(pvd->vdev_child, oldsize); 321283010Spfg } 322283010Spfg 323283010Spfg pvd->vdev_child = newchild; 324283010Spfg pvd->vdev_child[id] = cvd; 325283010Spfg 326283010Spfg cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 327283010Spfg ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 328283010Spfg 329283010Spfg /* 330283010Spfg * Walk up all ancestors to update guid sum. 
331283010Spfg */ 332283010Spfg for (; pvd != NULL; pvd = pvd->vdev_parent) 333283010Spfg pvd->vdev_guid_sum += cvd->vdev_guid_sum; 334283010Spfg} 335283010Spfg 336283010Spfgvoid 337283010Spfgvdev_remove_child(vdev_t *pvd, vdev_t *cvd) 338283010Spfg{ 339283010Spfg int c; 340283010Spfg uint_t id = cvd->vdev_id; 341283010Spfg 342283010Spfg ASSERT(cvd->vdev_parent == pvd); 343283010Spfg 344283010Spfg if (pvd == NULL) 345283010Spfg return; 346283010Spfg 347283010Spfg ASSERT(id < pvd->vdev_children); 348283010Spfg ASSERT(pvd->vdev_child[id] == cvd); 349283010Spfg 350283010Spfg pvd->vdev_child[id] = NULL; 351283010Spfg cvd->vdev_parent = NULL; 352283010Spfg 353283010Spfg for (c = 0; c < pvd->vdev_children; c++) 354283010Spfg if (pvd->vdev_child[c]) 355283010Spfg break; 356283010Spfg 357283010Spfg if (c == pvd->vdev_children) { 358283010Spfg kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 359283010Spfg pvd->vdev_child = NULL; 360283010Spfg pvd->vdev_children = 0; 361283010Spfg } 362283010Spfg 363283010Spfg /* 364283010Spfg * Walk up all ancestors to update guid sum. 365283010Spfg */ 366283010Spfg for (; pvd != NULL; pvd = pvd->vdev_parent) 367283010Spfg pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 368283010Spfg} 369283010Spfg 370283010Spfg/* 371283010Spfg * Remove any holes in the child array. 
372283010Spfg */ 373283010Spfgvoid 374283010Spfgvdev_compact_children(vdev_t *pvd) 375283010Spfg{ 376283010Spfg vdev_t **newchild, *cvd; 377283010Spfg int oldc = pvd->vdev_children; 378283010Spfg int newc; 379283010Spfg 380283010Spfg ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 381283010Spfg 382283010Spfg for (int c = newc = 0; c < oldc; c++) 383283010Spfg if (pvd->vdev_child[c]) 384283010Spfg newc++; 385283010Spfg 386283010Spfg newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 387283010Spfg 388283010Spfg for (int c = newc = 0; c < oldc; c++) { 389283010Spfg if ((cvd = pvd->vdev_child[c]) != NULL) { 390283010Spfg newchild[newc] = cvd; 391283010Spfg cvd->vdev_id = newc++; 392283010Spfg } 393169695Skan } 394169695Skan 395169695Skan kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 396169695Skan pvd->vdev_child = newchild; 397169695Skan pvd->vdev_children = newc; 398169695Skan} 399169695Skan 400169695Skan/* 401169695Skan * Allocate and minimally initialize a vdev_t. 402169695Skan */ 403169695Skanvdev_t * 404169695Skanvdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 405169695Skan{ 406169695Skan vdev_t *vd; 407169695Skan 408169695Skan vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 409169695Skan 410169695Skan if (spa->spa_root_vdev == NULL) { 411169695Skan ASSERT(ops == &vdev_root_ops); 412169695Skan spa->spa_root_vdev = vd; 413169695Skan spa->spa_load_guid = spa_generate_guid(NULL); 414169695Skan } 415169695Skan 416169695Skan if (guid == 0 && ops != &vdev_hole_ops) { 417169695Skan if (spa->spa_root_vdev == vd) { 418169695Skan /* 419169695Skan * The root vdev's guid will also be the pool guid, 420169695Skan * which must be unique among all pools. 421169695Skan */ 422169695Skan guid = spa_generate_guid(NULL); 423169695Skan } else { 424169695Skan /* 425169695Skan * Any other vdev's guid must be unique within the pool. 
426169695Skan */ 427169695Skan guid = spa_generate_guid(spa); 428169695Skan } 429169695Skan ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 430169695Skan } 431169695Skan 432169695Skan vd->vdev_spa = spa; 433169695Skan vd->vdev_id = id; 434169695Skan vd->vdev_guid = guid; 435169695Skan vd->vdev_guid_sum = guid; 436169695Skan vd->vdev_ops = ops; 437169695Skan vd->vdev_state = VDEV_STATE_CLOSED; 438169695Skan vd->vdev_ishole = (ops == &vdev_hole_ops); 439169695Skan 440169695Skan mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 441169695Skan mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 442169695Skan mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); 443169695Skan for (int t = 0; t < DTL_TYPES; t++) { 444169695Skan vd->vdev_dtl[t] = range_tree_create(NULL, NULL, 445169695Skan &vd->vdev_dtl_lock); 446169695Skan } 447169695Skan txg_list_create(&vd->vdev_ms_list, 448169695Skan offsetof(struct metaslab, ms_txg_node)); 449169695Skan txg_list_create(&vd->vdev_dtl_list, 450169695Skan offsetof(struct vdev, vdev_dtl_node)); 451169695Skan vd->vdev_stat.vs_timestamp = gethrtime(); 452169695Skan vdev_queue_init(vd); 453169695Skan vdev_cache_init(vd); 454169695Skan 455169695Skan return (vd); 456169695Skan} 457169695Skan 458169695Skan/* 459169695Skan * Allocate a new vdev. The 'alloctype' is used to control whether we are 460283010Spfg * creating a new vdev or loading an existing one - the behavior is slightly 461169695Skan * different for each case. 
462169695Skan */ 463169695Skanint 464169695Skanvdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 465169695Skan int alloctype) 466169695Skan{ 467169695Skan vdev_ops_t *ops; 468169695Skan char *type; 469169695Skan uint64_t guid = 0, islog, nparity; 470169695Skan vdev_t *vd; 471169695Skan 472169695Skan ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 473169695Skan 474169695Skan if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 475169695Skan return (SET_ERROR(EINVAL)); 476169695Skan 477169695Skan if ((ops = vdev_getops(type)) == NULL) 478169695Skan return (SET_ERROR(EINVAL)); 479169695Skan 480169695Skan /* 481169695Skan * If this is a load, get the vdev guid from the nvlist. 482169695Skan * Otherwise, vdev_alloc_common() will generate one for us. 483169695Skan */ 484169695Skan if (alloctype == VDEV_ALLOC_LOAD) { 485169695Skan uint64_t label_id; 486169695Skan 487169695Skan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 488169695Skan label_id != id) 489169695Skan return (SET_ERROR(EINVAL)); 490169695Skan 491169695Skan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 492169695Skan return (SET_ERROR(EINVAL)); 493169695Skan } else if (alloctype == VDEV_ALLOC_SPARE) { 494169695Skan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 495169695Skan return (SET_ERROR(EINVAL)); 496169695Skan } else if (alloctype == VDEV_ALLOC_L2CACHE) { 497169695Skan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 498169695Skan return (SET_ERROR(EINVAL)); 499169695Skan } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { 500169695Skan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 501169695Skan return (SET_ERROR(EINVAL)); 502169695Skan } 503169695Skan 504169695Skan /* 505169695Skan * The first allocated vdev must be of type 'root'. 
506169695Skan */ 507169695Skan if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 508169695Skan return (SET_ERROR(EINVAL)); 509169695Skan 510169695Skan /* 511169695Skan * Determine whether we're a log vdev. 512169695Skan */ 513169695Skan islog = 0; 514169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 515169695Skan if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 516169695Skan return (SET_ERROR(ENOTSUP)); 517169695Skan 518169695Skan if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) 519169695Skan return (SET_ERROR(ENOTSUP)); 520169695Skan 521169695Skan /* 522169695Skan * Set the nparity property for RAID-Z vdevs. 523169695Skan */ 524169695Skan nparity = -1ULL; 525169695Skan if (ops == &vdev_raidz_ops) { 526169695Skan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 527169695Skan &nparity) == 0) { 528169695Skan if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 529169695Skan return (SET_ERROR(EINVAL)); 530169695Skan /* 531169695Skan * Previous versions could only support 1 or 2 parity 532169695Skan * device. 533169695Skan */ 534169695Skan if (nparity > 1 && 535169695Skan spa_version(spa) < SPA_VERSION_RAIDZ2) 536169695Skan return (SET_ERROR(ENOTSUP)); 537169695Skan if (nparity > 2 && 538169695Skan spa_version(spa) < SPA_VERSION_RAIDZ3) 539169695Skan return (SET_ERROR(ENOTSUP)); 540169695Skan } else { 541169695Skan /* 542169695Skan * We require the parity to be specified for SPAs that 543169695Skan * support multiple parity levels. 544169695Skan */ 545169695Skan if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 546169695Skan return (SET_ERROR(EINVAL)); 547169695Skan /* 548169695Skan * Otherwise, we default to 1 parity device for RAID-Z. 
549169695Skan */ 550169695Skan nparity = 1; 551169695Skan } 552169695Skan } else { 553169695Skan nparity = 0; 554169695Skan } 555169695Skan ASSERT(nparity != -1ULL); 556169695Skan 557169695Skan vd = vdev_alloc_common(spa, id, guid, ops); 558169695Skan 559169695Skan vd->vdev_islog = islog; 560169695Skan vd->vdev_nparity = nparity; 561169695Skan 562169695Skan if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 563169695Skan vd->vdev_path = spa_strdup(vd->vdev_path); 564169695Skan if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 565169695Skan vd->vdev_devid = spa_strdup(vd->vdev_devid); 566169695Skan if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 567169695Skan &vd->vdev_physpath) == 0) 568169695Skan vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 569169695Skan if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) 570169695Skan vd->vdev_fru = spa_strdup(vd->vdev_fru); 571169695Skan 572169695Skan /* 573169695Skan * Set the whole_disk property. If it's not specified, leave the value 574169695Skan * as -1. 575169695Skan */ 576169695Skan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 577169695Skan &vd->vdev_wholedisk) != 0) 578169695Skan vd->vdev_wholedisk = -1ULL; 579169695Skan 580169695Skan /* 581169695Skan * Look for the 'not present' flag. This will only be set if the device 582169695Skan * was not present at the time of import. 583169695Skan */ 584169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 585169695Skan &vd->vdev_not_present); 586169695Skan 587169695Skan /* 588169695Skan * Get the alignment requirement. 589169695Skan */ 590169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 591169695Skan 592169695Skan /* 593169695Skan * Retrieve the vdev creation time. 
594169695Skan */ 595169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 596169695Skan &vd->vdev_crtxg); 597169695Skan 598169695Skan /* 599169695Skan * If we're a top-level vdev, try to load the allocation parameters. 600169695Skan */ 601169695Skan if (parent && !parent->vdev_parent && 602169695Skan (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { 603169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 604169695Skan &vd->vdev_ms_array); 605169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 606169695Skan &vd->vdev_ms_shift); 607169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 608169695Skan &vd->vdev_asize); 609169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, 610169695Skan &vd->vdev_removing); 611169695Skan } 612169695Skan 613169695Skan if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { 614169695Skan ASSERT(alloctype == VDEV_ALLOC_LOAD || 615169695Skan alloctype == VDEV_ALLOC_ADD || 616169695Skan alloctype == VDEV_ALLOC_SPLIT || 617169695Skan alloctype == VDEV_ALLOC_ROOTPOOL); 618169695Skan vd->vdev_mg = metaslab_group_create(islog ? 619169695Skan spa_log_class(spa) : spa_normal_class(spa), vd); 620169695Skan } 621169695Skan 622169695Skan /* 623169695Skan * If we're a leaf vdev, try to load the DTL object and other state. 
624169695Skan */ 625169695Skan if (vd->vdev_ops->vdev_op_leaf && 626169695Skan (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || 627169695Skan alloctype == VDEV_ALLOC_ROOTPOOL)) { 628169695Skan if (alloctype == VDEV_ALLOC_LOAD) { 629169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 630169695Skan &vd->vdev_dtl_object); 631169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 632169695Skan &vd->vdev_unspare); 633169695Skan } 634169695Skan 635169695Skan if (alloctype == VDEV_ALLOC_ROOTPOOL) { 636169695Skan uint64_t spare = 0; 637169695Skan 638169695Skan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 639169695Skan &spare) == 0 && spare) 640169695Skan spa_spare_add(vd); 641169695Skan } 642169695Skan 643169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 644169695Skan &vd->vdev_offline); 645169695Skan 646169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, 647169695Skan &vd->vdev_resilver_txg); 648169695Skan 649169695Skan /* 650169695Skan * When importing a pool, we want to ignore the persistent fault 651169695Skan * state, as the diagnosis made on another system may not be 652169695Skan * valid in the current context. Local vdevs will 653169695Skan * remain in the faulted state. 
654169695Skan */ 655169695Skan if (spa_load_state(spa) == SPA_LOAD_OPEN) { 656169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 657169695Skan &vd->vdev_faulted); 658169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 659169695Skan &vd->vdev_degraded); 660169695Skan (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 661169695Skan &vd->vdev_removed); 662169695Skan 663169695Skan if (vd->vdev_faulted || vd->vdev_degraded) { 664169695Skan char *aux; 665169695Skan 666169695Skan vd->vdev_label_aux = 667169695Skan VDEV_AUX_ERR_EXCEEDED; 668169695Skan if (nvlist_lookup_string(nv, 669169695Skan ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && 670169695Skan strcmp(aux, "external") == 0) 671169695Skan vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 672169695Skan } 673169695Skan } 674169695Skan } 675169695Skan 676169695Skan /* 677169695Skan * Add ourselves to the parent's list of children. 678169695Skan */ 679169695Skan vdev_add_child(parent, vd); 680169695Skan 681169695Skan *vdp = vd; 682169695Skan 683169695Skan return (0); 684169695Skan} 685169695Skan 686169695Skanvoid 687169695Skanvdev_free(vdev_t *vd) 688169695Skan{ 689169695Skan spa_t *spa = vd->vdev_spa; 690169695Skan 691169695Skan /* 692169695Skan * vdev_free() implies closing the vdev first. This is simpler than 693169695Skan * trying to ensure complicated semantics for all callers. 694169695Skan */ 695169695Skan vdev_close(vd); 696169695Skan 697169695Skan ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 698169695Skan ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 699169695Skan 700169695Skan /* 701169695Skan * Free all children. 702169695Skan */ 703169695Skan for (int c = 0; c < vd->vdev_children; c++) 704169695Skan vdev_free(vd->vdev_child[c]); 705169695Skan 706169695Skan ASSERT(vd->vdev_child == NULL); 707169695Skan ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 708169695Skan 709169695Skan /* 710169695Skan * Discard allocation state. 
711169695Skan */ 712169695Skan if (vd->vdev_mg != NULL) { 713169695Skan vdev_metaslab_fini(vd); 714169695Skan metaslab_group_destroy(vd->vdev_mg); 715169695Skan } 716169695Skan 717169695Skan ASSERT0(vd->vdev_stat.vs_space); 718169695Skan ASSERT0(vd->vdev_stat.vs_dspace); 719169695Skan ASSERT0(vd->vdev_stat.vs_alloc); 720169695Skan 721169695Skan /* 722169695Skan * Remove this vdev from its parent's child list. 723169695Skan */ 724169695Skan vdev_remove_child(vd->vdev_parent, vd); 725169695Skan 726169695Skan ASSERT(vd->vdev_parent == NULL); 727169695Skan 728169695Skan /* 729169695Skan * Clean up vdev structure. 730169695Skan */ 731169695Skan vdev_queue_fini(vd); 732169695Skan vdev_cache_fini(vd); 733169695Skan 734169695Skan if (vd->vdev_path) 735169695Skan spa_strfree(vd->vdev_path); 736169695Skan if (vd->vdev_devid) 737169695Skan spa_strfree(vd->vdev_devid); 738169695Skan if (vd->vdev_physpath) 739169695Skan spa_strfree(vd->vdev_physpath); 740169695Skan if (vd->vdev_fru) 741169695Skan spa_strfree(vd->vdev_fru); 742169695Skan 743169695Skan if (vd->vdev_isspare) 744169695Skan spa_spare_remove(vd); 745169695Skan if (vd->vdev_isl2cache) 746169695Skan spa_l2cache_remove(vd); 747169695Skan 748169695Skan txg_list_destroy(&vd->vdev_ms_list); 749169695Skan txg_list_destroy(&vd->vdev_dtl_list); 750169695Skan 751169695Skan mutex_enter(&vd->vdev_dtl_lock); 752169695Skan space_map_close(vd->vdev_dtl_sm); 753169695Skan for (int t = 0; t < DTL_TYPES; t++) { 754169695Skan range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); 755169695Skan range_tree_destroy(vd->vdev_dtl[t]); 756169695Skan } 757169695Skan mutex_exit(&vd->vdev_dtl_lock); 758169695Skan 759169695Skan mutex_destroy(&vd->vdev_dtl_lock); 760169695Skan mutex_destroy(&vd->vdev_stat_lock); 761169695Skan mutex_destroy(&vd->vdev_probe_lock); 762169695Skan 763169695Skan if (vd == spa->spa_root_vdev) 764169695Skan spa->spa_root_vdev = NULL; 765169695Skan 766169695Skan kmem_free(vd, sizeof (vdev_t)); 767169695Skan} 768169695Skan 
/*
 * Transfer top-level vdev state from svd to tvd.
 *
 * Moves the metaslab array object/geometry, the in-core metaslab array
 * and group, per-txg dirty lists, space accounting, and the config/state
 * dirty markers so that tvd takes over svd's role as a top-level vdev.
 * svd is left with no top-level state.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	/* Hand over the on-disk metaslab array object and its geometry. */
	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	/* If tvd already has a group it must be the same one svd owns. */
	if (tvd->vdev_mg)
		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	/* The metaslab group must point back at its new owning vdev. */
	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	/* Transfer accumulated space accounting. */
	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	/*
	 * Re-home pending per-txg work from svd to tvd: dirty metaslabs,
	 * dirty DTL vdevs, and svd's own entry on the spa's sync list.
	 */
	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	/* If svd was on the config- or state-dirty lists, tvd takes over. */
	if (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;
}

/*
 * Set vd->vdev_top to tvd for vd and every vdev beneath it.
 */
static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 *
 * Allocates a new interior vdev (mvd) with the given ops, splices it
 * into cvd's slot, and makes cvd its only child.  Returns mvd.
 * Caller must hold all of SCL_ALL as writer.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	/* The new interior vdev inherits the child's geometry and state. */
	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_max_asize = cvd->vdev_max_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	/* Splice mvd into cvd's slot, then re-attach cvd under mvd. */
	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);

	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	/* If mvd is now a top-level vdev, move the top-level state to it. */
	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 *
 * cvd's parent (mvd) must be a single-child mirror, replacing, or spare
 * vdev; cvd is promoted into mvd's slot and mvd is freed.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);

	/* cvd inherits the interior vdev's alignment requirements. */
	cvd->vdev_ashift = mvd->vdev_ashift;
	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);

	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
	 */
	if (mvd->vdev_top == mvd) {
		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
		cvd->vdev_orig_guid = cvd->vdev_guid;
		cvd->vdev_guid += guid_delta;
		cvd->vdev_guid_sum += guid_delta;
	}
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

/*
 * Initialize (or grow) the metaslab array for top-level vdev vd.
 *
 * With txg == 0 this runs at pool load: each metaslab's object number
 * is read from the MOS, and SCL_ALLOC is taken around group activation.
 * With txg != 0, new (empty) metaslabs are created for a device being
 * added or grown.  Returns 0 or an errno from dmu_read()/metaslab_init().
 */
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	/*
	 * This vdev is not being allocated from yet or is a hole.
	 */
	if (vd->vdev_ms_shift == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	/*
	 * Compute the raidz-deflation ratio.  Note, we hard-code
	 * in 128k (1 << 17) because it is the "typical" blocksize.
	 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
	 * otherwise it would inconsistently account for existing bp's.
	 */
	vd->vdev_deflate_ratio = (1 << 17) /
	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);

	/* The metaslab count can only grow, never shrink. */
	ASSERT(oldc <= newc);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	/* Preserve any metaslabs we already had (device growth case). */
	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		uint64_t object = 0;

		/* At load time, fetch the metaslab's space-map object id. */
		if (txg == 0) {
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
			    DMU_READ_PREFETCH);
			if (error)
				return (error);
		}

		error = metaslab_init(vd->vdev_mg, m, object, txg,
		    &(vd->vdev_ms[m]));
		if (error)
			return (error);
	}

	if (txg == 0)
		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

	/*
	 * If the vdev is being removed we don't activate
	 * the metaslabs since we want to ensure that no new
	 * allocations are performed on this device.
	 */
	if (oldc == 0 && !vd->vdev_removing)
		metaslab_group_activate(vd->vdev_mg);

	if (txg == 0)
		spa_config_exit(spa, SCL_ALLOC, FTAG);

	return (0);
}
/*
 * Tear down vd's metaslab array: passivate the group so no further
 * allocations land here, fini each metaslab, and free the array.
 * Safe to call when no array was ever initialized.
 */
void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		metaslab_group_passivate(vd->vdev_mg);
		for (m = 0; m < count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			if (msp != NULL)
				metaslab_fini(msp);
		}
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}
}

/*
 * State shared by the read/write/null zios that make up a single probe.
 * Allocated in vdev_probe() and freed by vdev_probe_done() when the
 * parent null zio completes.
 */
typedef struct vdev_probe_stats {
	boolean_t	vps_readable;	/* at least one pad read succeeded */
	boolean_t	vps_writeable;	/* at least one pad write succeeded */
	int		vps_flags;	/* zio flags applied to all probe I/O */
} vdev_probe_stats_t;

/*
 * Completion callback for probe I/O, called once per probe read, once
 * per probe write, and finally for the parent null zio which renders
 * the overall verdict and publishes it via vdev_cant_read/cant_write.
 */
static void
vdev_probe_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	vdev_probe_stats_t *vps = zio->io_private;

	ASSERT(vd->vdev_probe_zio != NULL);

	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_error == 0)
			vps->vps_readable = 1;
		if (zio->io_error == 0 && spa_writeable(spa)) {
			/*
			 * A successful read is followed by writing the same
			 * buffer back to the same pad region; the write's
			 * completion frees the buffer.
			 */
			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
			    zio->io_offset, zio->io_size, zio->io_data,
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
		} else {
			zio_buf_free(zio->io_data, zio->io_size);
		}
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
		if (zio->io_error == 0)
			vps->vps_writeable = 1;
		zio_buf_free(zio->io_data, zio->io_size);
	} else if (zio->io_type == ZIO_TYPE_NULL) {
		zio_t *pio;

		/* These flags only ever go from FALSE to TRUE here. */
		vd->vdev_cant_read |= !vps->vps_readable;
		vd->vdev_cant_write |= !vps->vps_writeable;

		/*
		 * A read-only pool only needs the device to be readable;
		 * otherwise both directions must work.
		 */
		if (vdev_readable(vd) &&
		    (vdev_writeable(vd) || !spa_writeable(spa))) {
			zio->io_error = 0;
		} else {
			ASSERT(zio->io_error != 0);
			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
			    spa, vd, NULL, 0, 0);
			zio->io_error = SET_ERROR(ENXIO);
		}

		/* Clear the one-probe-at-a-time marker under the lock. */
		mutex_enter(&vd->vdev_probe_lock);
		ASSERT(vd->vdev_probe_zio == zio);
		vd->vdev_probe_zio = NULL;
		mutex_exit(&vd->vdev_probe_lock);

		/* Propagate the verdict to every zio waiting on this probe. */
		while ((pio = zio_walk_parents(zio)) != NULL)
			if (!vdev_accessible(vd, pio))
				pio->io_error = SET_ERROR(ENXIO);

		kmem_free(vps, sizeof (*vps));
	}
}

/*
 * Determine whether this device is accessible.
 *
 * Read and write to several known locations: the pad regions of each
 * vdev label but the first, which we leave alone in case it contains
 * a VTOC.
 */
zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
{
	spa_t *spa = vd->vdev_spa;
	vdev_probe_stats_t *vps = NULL;
	zio_t *pio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/*
	 * Don't probe the probe.
	 */
	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
		return (NULL);

	/*
	 * To prevent 'probe storms' when a device fails, we create
	 * just one probe i/o at a time.  All zios that want to probe
	 * this vdev will become parents of the probe io.
	 */
	mutex_enter(&vd->vdev_probe_lock);

	if ((pio = vd->vdev_probe_zio) == NULL) {
		/* No probe in flight: we own creating one. */
		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
		    ZIO_FLAG_TRYHARD;

		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
			/*
			 * vdev_cant_read and vdev_cant_write can only
			 * transition from TRUE to FALSE when we have the
			 * SCL_ZIO lock as writer; otherwise they can only
			 * transition from FALSE to TRUE.  This ensures that
			 * any zio looking at these values can assume that
			 * failures persist for the life of the I/O.  That's
			 * important because when a device has intermittent
			 * connectivity problems, we want to ensure that
			 * they're ascribed to the device (ENXIO) and not
			 * the zio (EIO).
			 *
			 * Since we hold SCL_ZIO as writer here, clear both
			 * values so the probe can reevaluate from first
			 * principles.
			 */
			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
			vd->vdev_cant_read = B_FALSE;
			vd->vdev_cant_write = B_FALSE;
		}

		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
		    vdev_probe_done, vps,
		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

		/*
		 * We can't change the vdev state in this context, so we
		 * kick off an async task to do it on our behalf.
		 */
		if (zio != NULL) {
			vd->vdev_probe_wanted = B_TRUE;
			spa_async_request(spa, SPA_ASYNC_PROBE);
		}
	}

	if (zio != NULL)
		zio_add_child(zio, pio);

	mutex_exit(&vd->vdev_probe_lock);

	/* If another thread owned the probe, we just piggybacked on it. */
	if (vps == NULL) {
		ASSERT(zio != NULL);
		return (NULL);
	}

	/* Issue one pad-region read per label, skipping label 0 (VTOC). */
	for (int l = 1; l < VDEV_LABELS; l++) {
		zio_nowait(zio_read_phys(pio, vd,
		    vdev_label_offset(vd->vdev_psize, l,
		    offsetof(vdev_label_t, vl_pad2)),
		    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
	}

	/* Synchronous caller (zio == NULL) gets the pio to zio_wait() on. */
	if (zio == NULL)
		return (pio);

	zio_nowait(pio);
	return (NULL);
}

/*
 * Taskq callback: open one child vdev, recording the worker thread so
 * vdev_open() can recognize it in its lock assertions.
 */
static void
vdev_open_child(void *arg)
{
	vdev_t *vd = arg;

	vd->vdev_open_thread = curthread;
	vd->vdev_open_error = vdev_open(vd);
	vd->vdev_open_thread = NULL;
}

/*
 * Return B_TRUE if vd or any descendant is backed by a zvol
 * (its path lies under ZVOL_DIR).
 */
boolean_t
vdev_uses_zvols(vdev_t *vd)
{
	if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
	    strlen(ZVOL_DIR)) == 0)
		return (B_TRUE);
	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_uses_zvols(vd->vdev_child[c]))
			return (B_TRUE);
	return (B_FALSE);
}
/*
 * Open all children of vd, recording each child's open error in
 * vdev_open_error rather than failing fast.
 */
void
vdev_open_children(vdev_t *vd)
{
	taskq_t *tq;
	int children = vd->vdev_children;

	/*
	 * in order to handle pools on top of zvols, do the opens
	 * in a single thread so that the same thread holds the
	 * spa_namespace_lock
	 */
	/*
	 * NOTE(review): the B_TRUE below forces the serial path
	 * unconditionally, leaving the taskq code beneath it dead.
	 * This looks like a deliberate platform workaround per the
	 * comment above — confirm before ever re-enabling the taskq.
	 */
	if (B_TRUE || vdev_uses_zvols(vd)) {
		for (int c = 0; c < children; c++)
			vd->vdev_child[c]->vdev_open_error =
			    vdev_open(vd->vdev_child[c]);
		return;
	}
	tq = taskq_create("vdev_open", children, minclsyspri,
	    children, children, TASKQ_PREPOPULATE);

	for (int c = 0; c < children; c++)
		VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
		    TQ_SLEEP) != 0);

	/* taskq_destroy() waits for all dispatched opens to finish. */
	taskq_destroy(tq);
}

/*
 * Prepare a virtual device for access.
 *
 * Validates fault/offline state, calls the vdev-specific open routine,
 * derives psize/asize/ashift from the reported sizes, probes leaf
 * devices for accessibility, and finally requests a resilver if the
 * leaf has a DTL.  Returns 0 or an errno; on failure the vdev state
 * is set to an appropriate CANT_OPEN/FAULTED/OFFLINE state.
 */
int
vdev_open(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	int error;
	uint64_t osize = 0;
	uint64_t max_osize = 0;
	uint64_t asize, max_asize, psize;
	uint64_t logical_ashift = 0;
	uint64_t physical_ashift = 0;

	/* Either we're the taskq open thread or we hold all state locks. */
	ASSERT(vd->vdev_open_thread == curthread ||
	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	vd->vdev_cant_read = B_FALSE;
	vd->vdev_cant_write = B_FALSE;
	vd->vdev_notrim = B_FALSE;
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	/*
	 * If this vdev is not removed, check its fault status.  If it's
	 * faulted, bail out of the open.
	 */
	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (SET_ERROR(ENXIO));
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
	    &logical_ashift, &physical_ashift);

	/*
	 * Reset the vdev_reopening flag so that we actually close
	 * the vdev on error.
	 */
	vd->vdev_reopening = B_FALSE;
	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, NULL, ENXIO);

	if (error) {
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    vd->vdev_stat.vs_aux);
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	/*
	 * Recheck the faulted flag now that we have confirmed that
	 * the vdev is accessible.  If we're faulted, bail.
	 */
	if (vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	}

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
	}

	/*
	 * For hole or missing vdevs we just return success.
	 */
	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
		return (0);

	if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf)
		trim_map_create(vd);

	/* An interior vdev with any unhealthy child is at best degraded. */
	for (int c = 0; c < vd->vdev_children; c++) {
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}
	}

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		/* Leaf vdev: reserve room for the front and back labels. */
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
		    VDEV_LABEL_END_SIZE);
	} else {
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = 0;
		asize = osize;
		max_asize = max_osize;
	}

	vd->vdev_psize = psize;

	/*
	 * Make sure the allocatable size hasn't shrunk.
	 */
	if (asize < vd->vdev_min_asize) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LABEL);
		return (SET_ERROR(EINVAL));
	}

	/* Ashifts are sticky: they may only grow across opens. */
	vd->vdev_physical_ashift =
	    MAX(physical_ashift, vd->vdev_physical_ashift);
	vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
	vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);

	if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_ASHIFT_TOO_BIG);
		/* NOTE(review): bare EINVAL, unlike SET_ERROR() elsewhere. */
		return (EINVAL);
	}

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For testing purposes, a higher ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_max_asize = max_asize;
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
		    vd->vdev_ops->vdev_op_leaf) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			/* NOTE(review): bare EINVAL here as well. */
			return (EINVAL);
		}
		vd->vdev_max_asize = max_asize;
	}

	/*
	 * If all children are healthy and the asize has increased,
	 * then we've experienced dynamic LUN growth.  If automatic
	 * expansion is enabled then use the additional space.
	 */
	if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
	    (vd->vdev_expanding || spa->spa_autoexpand))
		vd->vdev_asize = asize;

	vdev_set_min_asize(vd);

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
		return (error);
	}

	/*
	 * Track the min and max ashift values for normal data devices.
	 */
	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
	    !vd->vdev_islog && vd->vdev_aux == NULL) {
		if (vd->vdev_ashift > spa->spa_max_ashift)
			spa->spa_max_ashift = vd->vdev_ashift;
		if (vd->vdev_ashift < spa->spa_min_ashift)
			spa->spa_min_ashift = vd->vdev_ashift;
	}

	/*
	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
	 * resilver.  But don't do this if we are doing a reopen for a scrub,
	 * since this would just restart the scrub we are already doing.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
	    vdev_resilver_needed(vd, NULL, NULL))
		spa_async_request(spa, SPA_ASYNC_RESILVER);

	return (0);
}
/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents.  This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * If 'strict' is false ignore the spa guid check.  This is necessary because
 * if the machine crashed during a re-guid the new guid might have been written
 * to all of the vdev labels, but not the cached config.  The strict check
 * will be performed when the pool is opened again using the mos config.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd, boolean_t strict)
{
	spa_t *spa = vd->vdev_spa;
	nvlist_t *label;
	uint64_t guid = 0, top_guid;
	uint64_t state;

	/* A single bad child (EBADF) fails the whole subtree. */
	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c], strict) != 0)
			return (SET_ERROR(EBADF));

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation.  Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
		uint64_t aux_guid = 0;
		nvlist_t *nvl;
		/* -1ULL means "best available label" on a never-synced pool. */
		uint64_t txg = spa_last_synced_txg(spa) != 0 ?
		    spa_last_synced_txg(spa) : -1ULL;

		if ((label = vdev_label_read_config(vd, txg)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (0);
		}

		/*
		 * Determine if this vdev has been split off into another
		 * pool.  If so, then refuse to open it.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
		    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_SPLIT_POOL);
			nvlist_free(label);
			return (0);
		}

		/* Pool-guid mismatch => corrupt data (unless non-strict). */
		if (strict && (nvlist_lookup_uint64(label,
		    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
		    guid != spa_guid(spa))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
		    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
		    &aux_guid) != 0)
			aux_guid = 0;

		/*
		 * If this vdev just became a top-level vdev because its
		 * sibling was detached, it will have adopted the parent's
		 * vdev guid -- but the label may or may not be on disk yet.
		 * Fortunately, either version of the label will have the
		 * same top guid, so if we're a top-level vdev, we can
		 * safely compare to that instead.
		 *
		 * If we split this vdev off instead, then we also check the
		 * original pool's guid.  We don't want to consider the vdev
		 * corrupt if it is partway through a split operation.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &guid) != 0 ||
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
		    &top_guid) != 0 ||
		    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		nvlist_free(label);

		/*
		 * If this is a verbatim import, no need to check the
		 * state of the pool.
		 */
		if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
		    spa_load_state(spa) == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)
			return (SET_ERROR(EBADF));

		/*
		 * If we were able to open and validate a vdev that was
		 * previously marked permanently unavailable, clear that state
		 * now.
		 */
		if (vd->vdev_not_present)
			vd->vdev_not_present = 0;
	}

	return (0);
}

/*
 * Close a virtual device.
 *
 * Propagates the reopening flag downward, invokes the vdev-specific
 * close routine, tears down cache/trim state, and records the previous
 * state before setting CLOSED (or OFFLINE).
 */
void
vdev_close(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *pvd = vd->vdev_parent;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/*
	 * If our parent is reopening, then we are as well, unless we are
	 * going offline.
	 */
	if (pvd != NULL && pvd->vdev_reopening)
		vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);

	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	if (vd->vdev_ops->vdev_op_leaf)
		trim_map_destroy(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}
/*
 * Recursively take a hold on every leaf under vd (root pools only,
 * and only once the pool has been initialized).
 */
void
vdev_hold(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
		return;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_hold(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_hold(vd);
}

/*
 * Recursively release the holds taken by vdev_hold().
 */
void
vdev_rele(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_rele(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_rele(vd);
}

/*
 * Reopen all interior vdevs and any unopened leaves.  We don't actually
 * reopen leaf vdevs which had previously been opened as they might deadlock
 * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
 * If the leaf has never been opened then open it, as usual.
 */
void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/* set the reopening flag unless we're taking the vdev offline */
	vd->vdev_reopening = !vd->vdev_offline;
	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		/* Aux vdevs (spares/l2arc) use their own validation path. */
		(void) vdev_validate_aux(vd);
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    vd->vdev_aux == &spa->spa_l2cache &&
		    !l2arc_vdev_present(vd))
			l2arc_add_vdev(spa, vd);
	} else {
		(void) vdev_validate(vd, B_TRUE);
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

/*
 * Open and label a brand-new vdev (or one being used as a replacement).
 *
 * Unlike normal opens, any component that fails to open fails the
 * whole request.  Returns 0 or an errno; the vdev is closed on failure.
 */
int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively load DTLs and initialize all labels.
	 */
	if ((error = vdev_dtl_load(vd)) != 0 ||
	    (error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * Choose vd's metaslab shift from its asize.
 */
void
vdev_metaslab_set_size(vdev_t *vd)
{
	/*
	 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
	/* Never make a metaslab smaller than one maximum-size block. */
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
}

/*
 * Maximize performance by inflating the configured ashift for top level
 * vdevs to be as close to the physical ashift as possible while maintaining
 * administrator defined limits and ensuring it doesn't go below the
 * logical ashift.
 */
void
vdev_ashift_optimize(vdev_t *vd)
{
	if (vd == vd->vdev_top) {
		if (vd->vdev_ashift < vd->vdev_physical_ashift) {
			/* Raise toward physical, capped by the auto limits. */
			vd->vdev_ashift = MIN(
			    MAX(zfs_max_auto_ashift, vd->vdev_ashift),
			    MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
		} else {
			/*
			 * Unusual case where logical ashift > physical ashift
			 * so we can't cap the calculated ashift based on max
			 * ashift as that would cause failures.
			 * We still check if we need to increase it to match
			 * the min ashift.
			 */
			vd->vdev_ashift = MAX(zfs_min_auto_ashift,
			    vd->vdev_ashift);
		}
	}
}

/*
 * Mark top-level vdev vd (and 'arg', per 'flags') dirty in the given txg
 * so it gets visited during sync.
 */
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(!vd->vdev_ishole);
	ASSERT(ISP2(flags));
	ASSERT(spa_writeable(vd->vdev_spa));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

/*
 * Dirty every leaf vdev under vd (via its top-level vdev) for the
 * given txg.
 */
void
vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);

	if (vd->vdev_ops->vdev_op_leaf)
		vdev_dirty(vd->vdev_top, flags, vd, txg);
}

/*
 * DTLs.
 *
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication.  There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in 'maxfaults' or more children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
1808169695Skan */ 1809169695Skanvoid 1810169695Skanvdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1811169695Skan{ 1812169695Skan range_tree_t *rt = vd->vdev_dtl[t]; 1813169695Skan 1814169695Skan ASSERT(t < DTL_TYPES); 1815169695Skan ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1816169695Skan ASSERT(spa_writeable(vd->vdev_spa)); 1817169695Skan 1818169695Skan mutex_enter(rt->rt_lock); 1819169695Skan if (!range_tree_contains(rt, txg, size)) 1820169695Skan range_tree_add(rt, txg, size); 1821169695Skan mutex_exit(rt->rt_lock); 1822169695Skan} 1823169695Skan 1824169695Skanboolean_t 1825169695Skanvdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1826169695Skan{ 1827169695Skan range_tree_t *rt = vd->vdev_dtl[t]; 1828169695Skan boolean_t dirty = B_FALSE; 1829169695Skan 1830169695Skan ASSERT(t < DTL_TYPES); 1831169695Skan ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1832169695Skan 1833169695Skan mutex_enter(rt->rt_lock); 1834169695Skan if (range_tree_space(rt) != 0) 1835169695Skan dirty = range_tree_contains(rt, txg, size); 1836169695Skan mutex_exit(rt->rt_lock); 1837169695Skan 1838169695Skan return (dirty); 1839169695Skan} 1840169695Skan 1841169695Skanboolean_t 1842169695Skanvdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) 1843169695Skan{ 1844169695Skan range_tree_t *rt = vd->vdev_dtl[t]; 1845169695Skan boolean_t empty; 1846169695Skan 1847169695Skan mutex_enter(rt->rt_lock); 1848169695Skan empty = (range_tree_space(rt) == 0); 1849169695Skan mutex_exit(rt->rt_lock); 1850169695Skan 1851169695Skan return (empty); 1852169695Skan} 1853169695Skan 1854169695Skan/* 1855169695Skan * Returns the lowest txg in the DTL range. 
1856169695Skan */ 1857169695Skanstatic uint64_t 1858169695Skanvdev_dtl_min(vdev_t *vd) 1859169695Skan{ 1860169695Skan range_seg_t *rs; 1861169695Skan 1862169695Skan ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 1863169695Skan ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 1864169695Skan ASSERT0(vd->vdev_children); 1865169695Skan 1866169695Skan rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); 1867169695Skan return (rs->rs_start - 1); 1868169695Skan} 1869169695Skan 1870169695Skan/* 1871169695Skan * Returns the highest txg in the DTL. 1872169695Skan */ 1873169695Skanstatic uint64_t 1874169695Skanvdev_dtl_max(vdev_t *vd) 1875169695Skan{ 1876169695Skan range_seg_t *rs; 1877169695Skan 1878169695Skan ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 1879169695Skan ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 1880169695Skan ASSERT0(vd->vdev_children); 1881169695Skan 1882169695Skan rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); 1883169695Skan return (rs->rs_end); 1884169695Skan} 1885169695Skan 1886169695Skan/* 1887169695Skan * Determine if a resilvering vdev should remove any DTL entries from 1888169695Skan * its range. If the vdev was resilvering for the entire duration of the 1889169695Skan * scan then it should excise that range from its DTLs. Otherwise, this 1890169695Skan * vdev is considered partially resilvered and should leave its DTL 1891169695Skan * entries intact. The comment in vdev_dtl_reassess() describes how we 1892169695Skan * excise the DTLs. 
1893169695Skan */ 1894169695Skanstatic boolean_t 1895169695Skanvdev_dtl_should_excise(vdev_t *vd) 1896169695Skan{ 1897169695Skan spa_t *spa = vd->vdev_spa; 1898169695Skan dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 1899169695Skan 1900169695Skan ASSERT0(scn->scn_phys.scn_errors); 1901169695Skan ASSERT0(vd->vdev_children); 1902169695Skan 1903169695Skan if (vd->vdev_resilver_txg == 0 || 1904169695Skan range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) 1905169695Skan return (B_TRUE); 1906169695Skan 1907169695Skan /* 1908169695Skan * When a resilver is initiated the scan will assign the scn_max_txg 1909169695Skan * value to the highest txg value that exists in all DTLs. If this 1910169695Skan * device's max DTL is not part of this scan (i.e. it is not in 1911169695Skan * the range (scn_min_txg, scn_max_txg] then it is not eligible 1912169695Skan * for excision. 1913169695Skan */ 1914169695Skan if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { 1915169695Skan ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); 1916169695Skan ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); 1917169695Skan ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); 1918169695Skan return (B_TRUE); 1919169695Skan } 1920169695Skan return (B_FALSE); 1921169695Skan} 1922169695Skan 1923169695Skan/* 1924169695Skan * Reassess DTLs after a config change or scrub completion. 
1925169695Skan */ 1926169695Skanvoid 1927169695Skanvdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 1928169695Skan{ 1929169695Skan spa_t *spa = vd->vdev_spa; 1930169695Skan avl_tree_t reftree; 1931169695Skan int minref; 1932169695Skan 1933169695Skan ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1934169695Skan 1935169695Skan for (int c = 0; c < vd->vdev_children; c++) 1936169695Skan vdev_dtl_reassess(vd->vdev_child[c], txg, 1937169695Skan scrub_txg, scrub_done); 1938169695Skan 1939169695Skan if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) 1940169695Skan return; 1941169695Skan 1942169695Skan if (vd->vdev_ops->vdev_op_leaf) { 1943169695Skan dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 1944169695Skan 1945169695Skan mutex_enter(&vd->vdev_dtl_lock); 1946169695Skan 1947169695Skan /* 1948169695Skan * If we've completed a scan cleanly then determine 1949169695Skan * if this vdev should remove any DTLs. We only want to 1950169695Skan * excise regions on vdevs that were available during 1951169695Skan * the entire duration of this scan. 1952169695Skan */ 1953169695Skan if (scrub_txg != 0 && 1954169695Skan (spa->spa_scrub_started || 1955169695Skan (scn != NULL && scn->scn_phys.scn_errors == 0)) && 1956169695Skan vdev_dtl_should_excise(vd)) { 1957169695Skan /* 1958169695Skan * We completed a scrub up to scrub_txg. If we 1959169695Skan * did it without rebooting, then the scrub dtl 1960169695Skan * will be valid, so excise the old region and 1961169695Skan * fold in the scrub dtl. Otherwise, leave the 1962169695Skan * dtl as-is if there was an error. 1963169695Skan * 1964169695Skan * There's little trick here: to excise the beginning 1965169695Skan * of the DTL_MISSING map, we put it into a reference 1966169695Skan * tree and then add a segment with refcnt -1 that 1967169695Skan * covers the range [0, scrub_txg). This means 1968169695Skan * that each txg in that range has refcnt -1 or 0. 
1969169695Skan * We then add DTL_SCRUB with a refcnt of 2, so that 1970169695Skan * entries in the range [0, scrub_txg) will have a 1971169695Skan * positive refcnt -- either 1 or 2. We then convert 1972169695Skan * the reference tree into the new DTL_MISSING map. 1973169695Skan */ 1974169695Skan space_reftree_create(&reftree); 1975169695Skan space_reftree_add_map(&reftree, 1976169695Skan vd->vdev_dtl[DTL_MISSING], 1); 1977169695Skan space_reftree_add_seg(&reftree, 0, scrub_txg, -1); 1978169695Skan space_reftree_add_map(&reftree, 1979169695Skan vd->vdev_dtl[DTL_SCRUB], 2); 1980169695Skan space_reftree_generate_map(&reftree, 1981169695Skan vd->vdev_dtl[DTL_MISSING], 1); 1982169695Skan space_reftree_destroy(&reftree); 1983169695Skan } 1984169695Skan range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); 1985169695Skan range_tree_walk(vd->vdev_dtl[DTL_MISSING], 1986169695Skan range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); 1987169695Skan if (scrub_done) 1988169695Skan range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); 1989169695Skan range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); 1990169695Skan if (!vdev_readable(vd)) 1991169695Skan range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); 1992169695Skan else 1993169695Skan range_tree_walk(vd->vdev_dtl[DTL_MISSING], 1994169695Skan range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); 1995169695Skan 1996169695Skan /* 1997169695Skan * If the vdev was resilvering and no longer has any 1998169695Skan * DTLs then reset its resilvering flag and dirty 1999169695Skan * the top level so that we persist the change. 
2000169695Skan */ 2001169695Skan if (vd->vdev_resilver_txg != 0 && 2002169695Skan range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && 2003169695Skan range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) { 2004169695Skan vd->vdev_resilver_txg = 0; 2005169695Skan vdev_config_dirty(vd->vdev_top); 2006169695Skan } 2007169695Skan 2008169695Skan mutex_exit(&vd->vdev_dtl_lock); 2009169695Skan 2010169695Skan if (txg != 0) 2011169695Skan vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 2012169695Skan return; 2013169695Skan } 2014169695Skan 2015169695Skan mutex_enter(&vd->vdev_dtl_lock); 2016169695Skan for (int t = 0; t < DTL_TYPES; t++) { 2017169695Skan /* account for child's outage in parent's missing map */ 2018169695Skan int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; 2019169695Skan if (t == DTL_SCRUB) 2020169695Skan continue; /* leaf vdevs only */ 2021169695Skan if (t == DTL_PARTIAL) 2022169695Skan minref = 1; /* i.e. non-zero */ 2023169695Skan else if (vd->vdev_nparity != 0) 2024169695Skan minref = vd->vdev_nparity + 1; /* RAID-Z */ 2025169695Skan else 2026169695Skan minref = vd->vdev_children; /* any kind of mirror */ 2027169695Skan space_reftree_create(&reftree); 2028169695Skan for (int c = 0; c < vd->vdev_children; c++) { 2029169695Skan vdev_t *cvd = vd->vdev_child[c]; 2030169695Skan mutex_enter(&cvd->vdev_dtl_lock); 2031169695Skan space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); 2032169695Skan mutex_exit(&cvd->vdev_dtl_lock); 2033169695Skan } 2034169695Skan space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); 2035169695Skan space_reftree_destroy(&reftree); 2036169695Skan } 2037169695Skan mutex_exit(&vd->vdev_dtl_lock); 2038169695Skan} 2039169695Skan 2040169695Skanint 2041169695Skanvdev_dtl_load(vdev_t *vd) 2042169695Skan{ 2043169695Skan spa_t *spa = vd->vdev_spa; 2044169695Skan objset_t *mos = spa->spa_meta_objset; 2045169695Skan int error = 0; 2046169695Skan 2047169695Skan if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { 2048169695Skan 
ASSERT(!vd->vdev_ishole); 2049169695Skan 2050169695Skan error = space_map_open(&vd->vdev_dtl_sm, mos, 2051169695Skan vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock); 2052169695Skan if (error) 2053169695Skan return (error); 2054169695Skan ASSERT(vd->vdev_dtl_sm != NULL); 2055169695Skan 2056169695Skan mutex_enter(&vd->vdev_dtl_lock); 2057169695Skan 2058169695Skan /* 2059169695Skan * Now that we've opened the space_map we need to update 2060169695Skan * the in-core DTL. 2061169695Skan */ 2062169695Skan space_map_update(vd->vdev_dtl_sm); 2063169695Skan 2064169695Skan error = space_map_load(vd->vdev_dtl_sm, 2065169695Skan vd->vdev_dtl[DTL_MISSING], SM_ALLOC); 2066169695Skan mutex_exit(&vd->vdev_dtl_lock); 2067169695Skan 2068169695Skan return (error); 2069169695Skan } 2070169695Skan 2071169695Skan for (int c = 0; c < vd->vdev_children; c++) { 2072169695Skan error = vdev_dtl_load(vd->vdev_child[c]); 2073169695Skan if (error != 0) 2074169695Skan break; 2075169695Skan } 2076169695Skan 2077169695Skan return (error); 2078169695Skan} 2079169695Skan 2080169695Skanvoid 2081169695Skanvdev_dtl_sync(vdev_t *vd, uint64_t txg) 2082169695Skan{ 2083169695Skan spa_t *spa = vd->vdev_spa; 2084169695Skan range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; 2085169695Skan objset_t *mos = spa->spa_meta_objset; 2086169695Skan range_tree_t *rtsync; 2087169695Skan kmutex_t rtlock; 2088169695Skan dmu_tx_t *tx; 2089169695Skan uint64_t object = space_map_object(vd->vdev_dtl_sm); 2090169695Skan 2091169695Skan ASSERT(!vd->vdev_ishole); 2092169695Skan ASSERT(vd->vdev_ops->vdev_op_leaf); 2093169695Skan 2094169695Skan tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2095169695Skan 2096169695Skan if (vd->vdev_detached || vd->vdev_top->vdev_removing) { 2097169695Skan mutex_enter(&vd->vdev_dtl_lock); 2098169695Skan space_map_free(vd->vdev_dtl_sm, tx); 2099169695Skan space_map_close(vd->vdev_dtl_sm); 2100169695Skan vd->vdev_dtl_sm = NULL; 2101169695Skan mutex_exit(&vd->vdev_dtl_lock); 2102169695Skan 
dmu_tx_commit(tx); 2103169695Skan return; 2104169695Skan } 2105169695Skan 2106169695Skan if (vd->vdev_dtl_sm == NULL) { 2107169695Skan uint64_t new_object; 2108169695Skan 2109169695Skan new_object = space_map_alloc(mos, tx); 2110169695Skan VERIFY3U(new_object, !=, 0); 2111169695Skan 2112169695Skan VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 2113169695Skan 0, -1ULL, 0, &vd->vdev_dtl_lock)); 2114169695Skan ASSERT(vd->vdev_dtl_sm != NULL); 2115169695Skan } 2116169695Skan 2117169695Skan bzero(&rtlock, sizeof(rtlock)); 2118169695Skan mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL); 2119169695Skan 2120169695Skan rtsync = range_tree_create(NULL, NULL, &rtlock); 2121169695Skan 2122169695Skan mutex_enter(&rtlock); 2123169695Skan 2124169695Skan mutex_enter(&vd->vdev_dtl_lock); 2125169695Skan range_tree_walk(rt, range_tree_add, rtsync); 2126169695Skan mutex_exit(&vd->vdev_dtl_lock); 2127169695Skan 2128169695Skan space_map_truncate(vd->vdev_dtl_sm, tx); 2129169695Skan space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); 2130169695Skan range_tree_vacate(rtsync, NULL, NULL); 2131169695Skan 2132169695Skan range_tree_destroy(rtsync); 2133169695Skan 2134169695Skan mutex_exit(&rtlock); 2135169695Skan mutex_destroy(&rtlock); 2136169695Skan 2137169695Skan /* 2138169695Skan * If the object for the space map has changed then dirty 2139169695Skan * the top level so that we update the config. 
2140169695Skan */ 2141169695Skan if (object != space_map_object(vd->vdev_dtl_sm)) { 2142169695Skan zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, " 2143169695Skan "new object %llu", txg, spa_name(spa), object, 2144169695Skan space_map_object(vd->vdev_dtl_sm)); 2145169695Skan vdev_config_dirty(vd->vdev_top); 2146169695Skan } 2147169695Skan 2148169695Skan dmu_tx_commit(tx); 2149169695Skan 2150169695Skan mutex_enter(&vd->vdev_dtl_lock); 2151169695Skan space_map_update(vd->vdev_dtl_sm); 2152169695Skan mutex_exit(&vd->vdev_dtl_lock); 2153169695Skan} 2154169695Skan 2155169695Skan/* 2156169695Skan * Determine whether the specified vdev can be offlined/detached/removed 2157169695Skan * without losing data. 2158169695Skan */ 2159169695Skanboolean_t 2160169695Skanvdev_dtl_required(vdev_t *vd) 2161169695Skan{ 2162169695Skan spa_t *spa = vd->vdev_spa; 2163169695Skan vdev_t *tvd = vd->vdev_top; 2164169695Skan uint8_t cant_read = vd->vdev_cant_read; 2165169695Skan boolean_t required; 2166169695Skan 2167169695Skan ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2168169695Skan 2169169695Skan if (vd == spa->spa_root_vdev || vd == tvd) 2170169695Skan return (B_TRUE); 2171169695Skan 2172169695Skan /* 2173169695Skan * Temporarily mark the device as unreadable, and then determine 2174169695Skan * whether this results in any DTL outages in the top-level vdev. 2175169695Skan * If not, we can safely offline/detach/remove the device. 
2176169695Skan */ 2177169695Skan vd->vdev_cant_read = B_TRUE; 2178169695Skan vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2179169695Skan required = !vdev_dtl_empty(tvd, DTL_OUTAGE); 2180169695Skan vd->vdev_cant_read = cant_read; 2181169695Skan vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2182169695Skan 2183169695Skan if (!required && zio_injection_enabled) 2184169695Skan required = !!zio_handle_device_injection(vd, NULL, ECHILD); 2185169695Skan 2186169695Skan return (required); 2187169695Skan} 2188169695Skan 2189169695Skan/* 2190169695Skan * Determine if resilver is needed, and if so the txg range. 2191169695Skan */ 2192169695Skanboolean_t 2193169695Skanvdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) 2194169695Skan{ 2195169695Skan boolean_t needed = B_FALSE; 2196169695Skan uint64_t thismin = UINT64_MAX; 2197169695Skan uint64_t thismax = 0; 2198169695Skan 2199169695Skan if (vd->vdev_children == 0) { 2200169695Skan mutex_enter(&vd->vdev_dtl_lock); 2201169695Skan if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && 2202169695Skan vdev_writeable(vd)) { 2203169695Skan 2204169695Skan thismin = vdev_dtl_min(vd); 2205169695Skan thismax = vdev_dtl_max(vd); 2206169695Skan needed = B_TRUE; 2207169695Skan } 2208169695Skan mutex_exit(&vd->vdev_dtl_lock); 2209169695Skan } else { 2210169695Skan for (int c = 0; c < vd->vdev_children; c++) { 2211169695Skan vdev_t *cvd = vd->vdev_child[c]; 2212169695Skan uint64_t cmin, cmax; 2213169695Skan 2214169695Skan if (vdev_resilver_needed(cvd, &cmin, &cmax)) { 2215169695Skan thismin = MIN(thismin, cmin); 2216169695Skan thismax = MAX(thismax, cmax); 2217169695Skan needed = B_TRUE; 2218169695Skan } 2219169695Skan } 2220169695Skan } 2221169695Skan 2222169695Skan if (needed && minp) { 2223169695Skan *minp = thismin; 2224169695Skan *maxp = thismax; 2225169695Skan } 2226169695Skan return (needed); 2227169695Skan} 2228169695Skan 2229169695Skanvoid 2230169695Skanvdev_load(vdev_t *vd) 2231169695Skan{ 2232169695Skan /* 2233169695Skan * 
Recursively load all children. 2234169695Skan */ 2235169695Skan for (int c = 0; c < vd->vdev_children; c++) 2236169695Skan vdev_load(vd->vdev_child[c]); 2237169695Skan 2238169695Skan /* 2239169695Skan * If this is a top-level vdev, initialize its metaslabs. 2240169695Skan */ 2241169695Skan if (vd == vd->vdev_top && !vd->vdev_ishole && 2242169695Skan (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || 2243169695Skan vdev_metaslab_init(vd, 0) != 0)) 2244169695Skan vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2245169695Skan VDEV_AUX_CORRUPT_DATA); 2246169695Skan 2247169695Skan /* 2248169695Skan * If this is a leaf vdev, load its DTL. 2249169695Skan */ 2250169695Skan if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) 2251169695Skan vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2252169695Skan VDEV_AUX_CORRUPT_DATA); 2253169695Skan} 2254169695Skan 2255169695Skan/* 2256169695Skan * The special vdev case is used for hot spares and l2cache devices. Its 2257169695Skan * sole purpose it to set the vdev state for the associated vdev. To do this, 2258169695Skan * we make sure that we can open the underlying device, then try to read the 2259169695Skan * label, and make sure that the label is sane and that it hasn't been 2260169695Skan * repurposed to another pool. 
2261169695Skan */ 2262169695Skanint 2263169695Skanvdev_validate_aux(vdev_t *vd) 2264169695Skan{ 2265169695Skan nvlist_t *label; 2266169695Skan uint64_t guid, version; 2267169695Skan uint64_t state; 2268169695Skan 2269169695Skan if (!vdev_readable(vd)) 2270169695Skan return (0); 2271169695Skan 2272169695Skan if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { 2273169695Skan vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2274169695Skan VDEV_AUX_CORRUPT_DATA); 2275169695Skan return (-1); 2276169695Skan } 2277169695Skan 2278169695Skan if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 2279169695Skan !SPA_VERSION_IS_SUPPORTED(version) || 2280169695Skan nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 2281169695Skan guid != vd->vdev_guid || 2282169695Skan nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 2283169695Skan vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2284169695Skan VDEV_AUX_CORRUPT_DATA); 2285169695Skan nvlist_free(label); 2286169695Skan return (-1); 2287169695Skan } 2288169695Skan 2289169695Skan /* 2290169695Skan * We don't actually check the pool state here. If it's in fact in 2291169695Skan * use by another pool, we update this fact on the fly when requested. 
2292169695Skan */ 2293169695Skan nvlist_free(label); 2294169695Skan return (0); 2295169695Skan} 2296169695Skan 2297169695Skanvoid 2298169695Skanvdev_remove(vdev_t *vd, uint64_t txg) 2299169695Skan{ 2300169695Skan spa_t *spa = vd->vdev_spa; 2301169695Skan objset_t *mos = spa->spa_meta_objset; 2302169695Skan dmu_tx_t *tx; 2303169695Skan 2304169695Skan tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2305169695Skan 2306169695Skan if (vd->vdev_ms != NULL) { 2307169695Skan metaslab_group_t *mg = vd->vdev_mg; 2308169695Skan 2309169695Skan metaslab_group_histogram_verify(mg); 2310169695Skan metaslab_class_histogram_verify(mg->mg_class); 2311169695Skan 2312169695Skan for (int m = 0; m < vd->vdev_ms_count; m++) { 2313169695Skan metaslab_t *msp = vd->vdev_ms[m]; 2314169695Skan 2315169695Skan if (msp == NULL || msp->ms_sm == NULL) 2316169695Skan continue; 2317169695Skan 2318169695Skan mutex_enter(&msp->ms_lock); 2319169695Skan /* 2320169695Skan * If the metaslab was not loaded when the vdev 2321169695Skan * was removed then the histogram accounting may 2322169695Skan * not be accurate. Update the histogram information 2323169695Skan * here so that we ensure that the metaslab group 2324169695Skan * and metaslab class are up-to-date. 
2325169695Skan */ 2326169695Skan metaslab_group_histogram_remove(mg, msp); 2327169695Skan 2328169695Skan VERIFY0(space_map_allocated(msp->ms_sm)); 2329169695Skan space_map_free(msp->ms_sm, tx); 2330169695Skan space_map_close(msp->ms_sm); 2331169695Skan msp->ms_sm = NULL; 2332169695Skan mutex_exit(&msp->ms_lock); 2333169695Skan } 2334169695Skan 2335169695Skan metaslab_group_histogram_verify(mg); 2336169695Skan metaslab_class_histogram_verify(mg->mg_class); 2337169695Skan for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 2338169695Skan ASSERT0(mg->mg_histogram[i]); 2339169695Skan 2340169695Skan } 2341169695Skan 2342169695Skan if (vd->vdev_ms_array) { 2343169695Skan (void) dmu_object_free(mos, vd->vdev_ms_array, tx); 2344169695Skan vd->vdev_ms_array = 0; 2345169695Skan } 2346169695Skan dmu_tx_commit(tx); 2347169695Skan} 2348169695Skan 2349169695Skanvoid 2350169695Skanvdev_sync_done(vdev_t *vd, uint64_t txg) 2351169695Skan{ 2352169695Skan metaslab_t *msp; 2353169695Skan boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 2354169695Skan 2355169695Skan ASSERT(!vd->vdev_ishole); 2356169695Skan 2357169695Skan while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 2358169695Skan metaslab_sync_done(msp, txg); 2359169695Skan 2360169695Skan if (reassess) 2361169695Skan metaslab_sync_reassess(vd->vdev_mg); 2362169695Skan} 2363169695Skan 2364169695Skanvoid 2365169695Skanvdev_sync(vdev_t *vd, uint64_t txg) 2366169695Skan{ 2367169695Skan spa_t *spa = vd->vdev_spa; 2368169695Skan vdev_t *lvd; 2369169695Skan metaslab_t *msp; 2370169695Skan dmu_tx_t *tx; 2371169695Skan 2372169695Skan ASSERT(!vd->vdev_ishole); 2373169695Skan 2374169695Skan if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { 2375169695Skan ASSERT(vd == vd->vdev_top); 2376169695Skan tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2377169695Skan vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 2378169695Skan DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 2379169695Skan 
ASSERT(vd->vdev_ms_array != 0); 2380169695Skan vdev_config_dirty(vd); 2381169695Skan dmu_tx_commit(tx); 2382169695Skan } 2383169695Skan 2384169695Skan /* 2385169695Skan * Remove the metadata associated with this vdev once it's empty. 2386169695Skan */ 2387169695Skan if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) 2388169695Skan vdev_remove(vd, txg); 2389169695Skan 2390169695Skan while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 2391169695Skan metaslab_sync(msp, txg); 2392169695Skan (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 2393169695Skan } 2394169695Skan 2395169695Skan while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 2396169695Skan vdev_dtl_sync(lvd, txg); 2397169695Skan 2398169695Skan (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 2399169695Skan} 2400169695Skan 2401169695Skanuint64_t 2402169695Skanvdev_psize_to_asize(vdev_t *vd, uint64_t psize) 2403169695Skan{ 2404169695Skan return (vd->vdev_ops->vdev_op_asize(vd, psize)); 2405169695Skan} 2406169695Skan 2407169695Skan/* 2408169695Skan * Mark the given vdev faulted. A faulted vdev behaves as if the device could 2409169695Skan * not be opened, and no I/O is attempted. 2410169695Skan */ 2411169695Skanint 2412169695Skanvdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2413169695Skan{ 2414169695Skan vdev_t *vd, *tvd; 2415169695Skan 2416169695Skan spa_vdev_state_enter(spa, SCL_NONE); 2417169695Skan 2418169695Skan if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2419169695Skan return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2420169695Skan 2421169695Skan if (!vd->vdev_ops->vdev_op_leaf) 2422169695Skan return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2423169695Skan 2424169695Skan tvd = vd->vdev_top; 2425169695Skan 2426169695Skan /* 2427169695Skan * We don't directly use the aux state here, but if we do a 2428169695Skan * vdev_reopen(), we need this value to be present to remember why we 2429169695Skan * were faulted. 
2430169695Skan */ 2431169695Skan vd->vdev_label_aux = aux; 2432169695Skan 2433169695Skan /* 2434169695Skan * Faulted state takes precedence over degraded. 2435169695Skan */ 2436169695Skan vd->vdev_delayed_close = B_FALSE; 2437169695Skan vd->vdev_faulted = 1ULL; 2438169695Skan vd->vdev_degraded = 0ULL; 2439169695Skan vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 2440169695Skan 2441169695Skan /* 2442169695Skan * If this device has the only valid copy of the data, then 2443169695Skan * back off and simply mark the vdev as degraded instead. 2444169695Skan */ 2445169695Skan if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 2446169695Skan vd->vdev_degraded = 1ULL; 2447169695Skan vd->vdev_faulted = 0ULL; 2448169695Skan 2449169695Skan /* 2450169695Skan * If we reopen the device and it's not dead, only then do we 2451169695Skan * mark it degraded. 2452169695Skan */ 2453169695Skan vdev_reopen(tvd); 2454169695Skan 2455169695Skan if (vdev_readable(vd)) 2456169695Skan vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 2457169695Skan } 2458169695Skan 2459169695Skan return (spa_vdev_state_exit(spa, vd, 0)); 2460169695Skan} 2461169695Skan 2462169695Skan/* 2463169695Skan * Mark the given vdev degraded. A degraded vdev is purely an indication to the 2464169695Skan * user that something is wrong. The vdev continues to operate as normal as far 2465169695Skan * as I/O is concerned. 
2466169695Skan */ 2467169695Skanint 2468169695Skanvdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2469169695Skan{ 2470169695Skan vdev_t *vd; 2471169695Skan 2472169695Skan spa_vdev_state_enter(spa, SCL_NONE); 2473169695Skan 2474169695Skan if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2475169695Skan return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2476169695Skan 2477169695Skan if (!vd->vdev_ops->vdev_op_leaf) 2478169695Skan return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2479169695Skan 2480169695Skan /* 2481169695Skan * If the vdev is already faulted, then don't do anything. 2482169695Skan */ 2483169695Skan if (vd->vdev_faulted || vd->vdev_degraded) 2484169695Skan return (spa_vdev_state_exit(spa, NULL, 0)); 2485169695Skan 2486169695Skan vd->vdev_degraded = 1ULL; 2487169695Skan if (!vdev_is_dead(vd)) 2488169695Skan vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 2489169695Skan aux); 2490169695Skan 2491169695Skan return (spa_vdev_state_exit(spa, vd, 0)); 2492169695Skan} 2493169695Skan 2494169695Skan/* 2495169695Skan * Online the given vdev. 2496169695Skan * 2497169695Skan * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached 2498169695Skan * spare device should be detached when the device finishes resilvering. 2499169695Skan * Second, the online should be treated like a 'test' online case, so no FMA 2500169695Skan * events are generated if the device fails to open. 
 */
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	/* Online/offline only make sense on leaf vdevs. */
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;
	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);

	/* XXX - L2ARC 1.0 does not support expansion */
	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
	}

	vdev_reopen(tvd);
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = B_FALSE;
	}

	if (newstate)
		*newstate = vd->vdev_state;
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {

		/* XXX - L2ARC 1.0 does not support expansion */
		if (vd->vdev_aux)
			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}
	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Offline the given leaf vdev.  May drop and re-take the vdev state lock
 * (goto top) while evacuating log data, so the lookup is redone each pass.
 * Callers must hold spa_vdev_top_lock; see vdev_offline() below.
 */
static int
vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
	vdev_t *vd, *tvd;
	int error = 0;
	uint64_t generation;
	metaslab_group_t *mg;

top:
	spa_vdev_state_enter(spa, SCL_ALLOC);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;
	mg = tvd->vdev_mg;
	generation = spa->spa_config_generation + 1;

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device has the only valid copy of some data,
		 * don't allow it to be offlined. Log devices are always
		 * expendable.
		 */
		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_dtl_required(vd))
			return (spa_vdev_state_exit(spa, NULL, EBUSY));

		/*
		 * If the top-level is a slog and it has had allocations
		 * then proceed.  We check that the vdev's metaslab group
		 * is not NULL since it's possible that we may have just
		 * added this vdev but not yet initialized its metaslabs.
		 */
		if (tvd->vdev_islog && mg != NULL) {
			/*
			 * Prevent any future allocations.
			 */
			metaslab_group_passivate(mg);
			(void) spa_vdev_state_exit(spa, vd, 0);

			error = spa_offline_log(spa);

			spa_vdev_state_enter(spa, SCL_ALLOC);

			/*
			 * Check to see if the config has changed while the
			 * lock was dropped; if so, undo and retry from 'top'.
			 */
			if (error || generation != spa->spa_config_generation) {
				metaslab_group_activate(mg);
				if (error)
					return (spa_vdev_state_exit(spa,
					    vd, error));
				(void) spa_vdev_state_exit(spa, vd, 0);
				goto top;
			}
			ASSERT0(tvd->vdev_stat.vs_alloc);
		}

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If the top-level vdev is a log device then just offline
		 * it. Otherwise, if this action results in the top-level
		 * vdev becoming unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(tvd);

		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_is_dead(tvd)) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(tvd);
			return (spa_vdev_state_exit(spa, NULL, EBUSY));
		}

		/*
		 * Add the device back into the metaslab rotor so that
		 * once we online the device it's open for business.
		 */
		if (tvd->vdev_islog && mg != NULL)
			metaslab_group_activate(mg);
	}

	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Public wrapper: serialize offline requests on spa_vdev_top_lock.
 */
int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
	int error;

	mutex_enter(&spa->spa_vdev_top_lock);
	error = vdev_offline_locked(spa, guid, flags);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked.  We also clear all
 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
 */
void
vdev_clear(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == NULL)
		vd = rvd;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	/* Recurse into all children, then the aux (cache/spare) vdevs. */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c]);

	if (vd == rvd) {
		for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
			vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);

		for (int c = 0; c < spa->spa_spares.sav_count; c++)
			vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
	}

	/*
	 * If
we're in the FAULTED state or have experienced failed I/O, then
	 * clear the persistent state and attempt to reopen the device.  We
	 * also mark the vdev config dirty, so that the new faulted state is
	 * written out to disk.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded ||
	    !vdev_readable(vd) || !vdev_writeable(vd)) {

		/*
		 * When reopening in response to a clear event, it may be due to
		 * a fmadm repair request.  In this case, if the device is
		 * still broken, we want to still post the ereport again.
		 */
		vd->vdev_forcefault = B_TRUE;

		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
		vd->vdev_cant_read = B_FALSE;
		vd->vdev_cant_write = B_FALSE;

		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);

		vd->vdev_forcefault = B_FALSE;

		if (vd != rvd && vdev_writeable(vd->vdev_top))
			vdev_state_dirty(vd->vdev_top);

		/* A cleared, usable leaf may now need resilvering. */
		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
	}

	/*
	 * When clearing a FMA-diagnosed fault, we always want to
	 * unspare the device, as we assume that the original spare was
	 * done in response to the FMA fault.
	 */
	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;
}

boolean_t
vdev_is_dead(vdev_t *vd)
{
	/*
	 * Holes and missing devices are always considered "dead".
	 * This simplifies the code since we don't have to check for
	 * these types of devices in the various code paths.
	 * Instead we rely on the fact that we skip over dead devices
	 * before issuing I/O to them.
	 */
	return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
	    vd->vdev_ops == &vdev_missing_ops);
}

/* A vdev is readable if it is alive and not flagged unable to read. */
boolean_t
vdev_readable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
}

/* A vdev is writeable if it is alive and not flagged unable to write. */
boolean_t
vdev_writeable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
}

boolean_t
vdev_allocatable(vdev_t *vd)
{
	uint64_t state = vd->vdev_state;

	/*
	 * We currently allow allocations from vdevs which may be in the
	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
	 * fails to reopen then we'll catch it later when we're holding
	 * the proper locks.  Note that we have to get the vdev state
	 * in a local variable because although it changes atomically,
	 * we're asking two separate questions about it.
 */
	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
	    !vd->vdev_cant_write && !vd->vdev_ishole);
}

/*
 * Decide whether the given zio may be issued to 'vd', based on the vdev's
 * liveness and its cant_read/cant_write flags for the zio's direction.
 */
boolean_t
vdev_accessible(vdev_t *vd, zio_t *zio)
{
	ASSERT(zio->io_vd == vd);

	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
		return (B_FALSE);

	if (zio->io_type == ZIO_TYPE_READ)
		return (!vd->vdev_cant_read);

	if (zio->io_type == ZIO_TYPE_WRITE)
		return (!vd->vdev_cant_write);

	return (B_TRUE);
}

/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	mutex_enter(&vd->vdev_stat_lock);
	/* Snapshot the raw counters, then fill in the derived fields. */
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_min_asize(vd);
	if (vd->vdev_ops->vdev_op_leaf)
		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
	vs->vs_configured_ashift = vd->vdev_top != NULL
	    ?
 vd->vdev_top->vdev_ashift : vd->vdev_ashift;
	vs->vs_logical_ashift = vd->vdev_logical_ashift;
	vs->vs_physical_ashift = vd->vdev_physical_ashift;
	/* Fragmentation is only tracked for non-aux top-level vdevs. */
	if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
		vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
	}

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
	if (vd == rvd) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			for (int t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			}
			cvs->vs_scan_removing = cvd->vdev_removing;
		}
	}
	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Reset the in-core space accounting for this vdev to zero.
 */
void
vdev_clear_stats(vdev_t *vd)
{
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space = 0;
	vd->vdev_stat.vs_dspace = 0;
	vd->vdev_stat.vs_alloc = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Recursively reset the scan progress counter on this vdev and all children.
 */
void
vdev_scan_stat_init(vdev_t *vd)
{
	vdev_stat_t *vs = &vd->vdev_stat;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_scan_stat_init(vd->vdev_child[c]);

	mutex_enter(&vd->vdev_stat_lock);
	vs->vs_scan_processed = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Update I/O, error, and DTL statistics for the vdev associated with 'zio'.
 */
void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
	spa_t *spa = zio->io_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	/*
	 * If this i/o is a gang leader, it didn't do any actual work.
	 */
	if (zio->io_gang_tree)
		return;

	if (zio->io_error == 0) {
		/*
		 * If this is a root i/o, don't count it -- we've already
		 * counted the top-level vdevs, and vdev_get_stats() will
		 * aggregate them when asked.  This reduces contention on
		 * the root vdev_stat_lock and implicitly handles blocks
		 * that compress away to holes, for which there is no i/o.
		 * (Holes never create vdev children, so all the counters
		 * remain zero, which is what we want.)
		 *
		 * Note: this only applies to successful i/o (io_error == 0)
		 * because unlike i/o counts, errors are not additive.
		 * When reading a ditto block, for example, failure of
		 * one top-level vdev does not imply a root-level error.
		 */
		if (vd == rvd)
			return;

		ASSERT(vd == zio->io_vd);

		if (flags & ZIO_FLAG_IO_BYPASS)
			return;

		mutex_enter(&vd->vdev_stat_lock);

		if (flags & ZIO_FLAG_IO_REPAIR) {
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				dsl_scan_phys_t *scn_phys =
				    &spa->spa_dsl_pool->dp_scan->scn_phys;
				uint64_t *processed = &scn_phys->scn_processed;

				/* XXX cleanup? */
				if (vd->vdev_ops->vdev_op_leaf)
					atomic_add_64(processed, psize);
				vs->vs_scan_processed += psize;
			}

			if (flags & ZIO_FLAG_SELF_HEAL)
				vs->vs_self_healed += psize;
		}

		vs->vs_ops[type]++;
		vs->vs_bytes[type] += psize;

		mutex_exit(&vd->vdev_stat_lock);
		return;
	}

	/* Speculative (prefetch) failures are not real errors. */
	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	/*
	 * If this is an I/O error that is going to be retried, then ignore the
	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
	 * hard errors, when in reality they can happen for any number of
	 * innocuous reasons (bus resets, MPxIO link failure, etc).
	 */
	if (zio->io_error == EIO &&
	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
		return;

	/*
	 * Intent logs writes won't propagate their error to the root
	 * I/O so don't mark these types of failures as pool-level
	 * errors.
	 */
	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		return;

	mutex_enter(&vd->vdev_stat_lock);
	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
		if (zio->io_error == ECKSUM)
			vs->vs_checksum_errors++;
		else
			vs->vs_read_errors++;
	}
	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
		vs->vs_write_errors++;
	mutex_exit(&vd->vdev_stat_lock);

	if (type == ZIO_TYPE_WRITE && txg != 0 &&
	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
	    (flags & ZIO_FLAG_SCAN_THREAD) ||
	    spa->spa_claiming)) {
		/*
		 * This is either a normal write (not a repair), or it's
		 * a repair induced by the scrub thread, or it's a repair
		 * made by zil_claim() during spa_load() in the first txg.
		 * In the normal case, we commit the DTL change in the same
		 * txg as the block was born.  In the scrub-induced repair
		 * case, we know that scrubs run in first-pass syncing context,
		 * so we commit the DTL change in spa_syncing_txg(spa).
		 * In the zil_claim() case, we commit in spa_first_txg(spa).
		 *
		 * We currently do not make DTL entries for failed spontaneous
		 * self-healing writes triggered by normal (non-scrubbing)
		 * reads, because we have no transactional context in which to
		 * do so -- and it's not clear that it'd be desirable anyway.
		 */
		if (vd->vdev_ops->vdev_op_leaf) {
			uint64_t commit_txg = txg;
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				ASSERT(spa_sync_pass(spa) == 1);
				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
				commit_txg = spa_syncing_txg(spa);
			} else if (spa->spa_claiming) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				commit_txg = spa_first_txg(spa);
			}
			ASSERT(commit_txg >= spa_syncing_txg(spa));
			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
				return;
			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
		}
		if (vd != rvd)
			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
	}
}

/*
 * Update the in-core space usage stats for this vdev, its metaslab class,
 * and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
    int64_t space_delta)
{
	int64_t dspace_delta = space_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_group_t *mg = vd->vdev_mg;
	metaslab_class_t *mc = mg ? mg->mg_class : NULL;

	ASSERT(vd == vd->vdev_top);

	/*
	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
	 * factor.  We must calculate this here and not at the root vdev
	 * because the root vdev's psize-to-asize is simply the max of its
	 * childrens', thus not accurate enough for us.
 */
	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
	    vd->vdev_deflate_ratio;

	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_alloc += alloc_delta;
	vd->vdev_stat.vs_space += space_delta;
	vd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&vd->vdev_stat_lock);

	/* The root vdev mirrors the aggregate of the normal class. */
	if (mc == spa_normal_class(spa)) {
		mutex_enter(&rvd->vdev_stat_lock);
		rvd->vdev_stat.vs_alloc += alloc_delta;
		rvd->vdev_stat.vs_space += space_delta;
		rvd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&rvd->vdev_stat_lock);
	}

	if (mc != NULL) {
		ASSERT(rvd == vd->vdev_parent);
		ASSERT(vd->vdev_ms_count != 0);

		metaslab_class_space_update(mc,
		    alloc_delta, defer_delta, space_delta, dspace_delta);
	}
}

/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	ASSERT(spa_writeable(spa));

	/*
	 * If this is an aux vdev (as with l2cache and spare devices), then we
	 * update the vdev config manually and set the sync flag.
 */
	if (vd->vdev_aux != NULL) {
		spa_aux_vdev_t *sav = vd->vdev_aux;
		nvlist_t **aux;
		uint_t naux;

		/* Find our slot in the aux vdev array. */
		for (c = 0; c < sav->sav_count; c++) {
			if (sav->sav_vdevs[c] == vd)
				break;
		}

		if (c == sav->sav_count) {
			/*
			 * We're being removed.  There's nothing more to do.
			 */
			ASSERT(sav->sav_sync == B_TRUE);
			return;
		}

		sav->sav_sync = B_TRUE;

		/* The aux config holds either the l2cache or the spare list. */
		if (nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
		}

		ASSERT(c < naux);

		/*
		 * Setting the nvlist in the middle of the array is a little
		 * sketchy, but it will work.
		 */
		nvlist_free(aux[c]);
		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);

		return;
	}

	/*
	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
	 * must either hold SCL_CONFIG as writer, or must be the sync thread
	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!list_link_active(&vd->vdev_config_dirty_node) &&
		    !vd->vdev_ishole)
			list_insert_head(&spa->spa_config_dirty_list, vd);
	}
}

/*
 * Remove a top-level vdev from the config dirty list; it must be on it.
 */
void
vdev_config_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
	list_remove(&spa->spa_config_dirty_list, vd);
}

/*
 * Mark a top-level vdev's state as dirty, so that the next pass of
 * spa_sync() can convert this into vdev_config_dirty().  We distinguish
 * the state changes from larger config changes because they require
 * much less locking, and are often needed for administrative actions.
 */
void
vdev_state_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_writeable(spa));
	ASSERT(vd == vd->vdev_top);

	/*
	 * The state list is protected by the SCL_STATE lock. 
The caller
	 * must either hold SCL_STATE as writer, or must be the sync thread
	 * (which holds SCL_STATE as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
		list_insert_head(&spa->spa_state_dirty_list, vd);
}

/*
 * Remove a top-level vdev from the state dirty list; it must be on it.
 */
void
vdev_state_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
	list_remove(&spa->spa_state_dirty_list, vd);
}

/*
 * Propagate vdev state up from children to parent.
 */
void
vdev_propagate_state(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		for (int c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];

			/*
			 * Don't factor holes into the decision.
 */
			if (child->vdev_ishole)
				continue;

			if (!vdev_readable(child) ||
			    (!vdev_writeable(child) && spa_writeable(spa))) {
				/*
				 * Root special: if there is a top-level log
				 * device, treat the root vdev as if it were
				 * degraded.
				 */
				if (child->vdev_islog && vd == rvd)
					degraded++;
				else
					faulted++;
			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
				degraded++;
			}

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		/* Let the vdev type decide its own state from the tallies. */
		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	/* Recurse upward so ancestors reflect the new child state. */
	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Set a vdev's state.  If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport is
 * generated.
3250169695Skan */ 3251169695Skanvoid 3252169695Skanvdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 3253169695Skan{ 3254169695Skan uint64_t save_state; 3255169695Skan spa_t *spa = vd->vdev_spa; 3256169695Skan 3257169695Skan if (state == vd->vdev_state) { 3258169695Skan vd->vdev_stat.vs_aux = aux; 3259169695Skan return; 3260169695Skan } 3261169695Skan 3262169695Skan save_state = vd->vdev_state; 3263169695Skan 3264169695Skan vd->vdev_state = state; 3265169695Skan vd->vdev_stat.vs_aux = aux; 3266169695Skan 3267169695Skan /* 3268169695Skan * If we are setting the vdev state to anything but an open state, then 3269169695Skan * always close the underlying device unless the device has requested 3270169695Skan * a delayed close (i.e. we're about to remove or fault the device). 3271169695Skan * Otherwise, we keep accessible but invalid devices open forever. 3272169695Skan * We don't call vdev_close() itself, because that implies some extra 3273169695Skan * checks (offline, etc) that we don't want here. This is limited to 3274169695Skan * leaf devices, because otherwise closing the device will affect other 3275169695Skan * children. 3276169695Skan */ 3277169695Skan if (!vd->vdev_delayed_close && vdev_is_dead(vd) && 3278169695Skan vd->vdev_ops->vdev_op_leaf) 3279169695Skan vd->vdev_ops->vdev_op_close(vd); 3280169695Skan 3281169695Skan /* 3282169695Skan * If we have brought this vdev back into service, we need 3283169695Skan * to notify fmd so that it can gracefully repair any outstanding 3284169695Skan * cases due to a missing device. We do this in all cases, even those 3285169695Skan * that probably don't correlate to a repaired fault. This is sure to 3286169695Skan * catch all cases, and we let the zfs-retire agent sort it out. If 3287169695Skan * this is a transient state it's OK, as the retire agent will 3288169695Skan * double-check the state of the vdev before repairing it. 
3289169695Skan */ 3290169695Skan if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && 3291169695Skan vd->vdev_prevstate != state) 3292169695Skan zfs_post_state_change(spa, vd); 3293169695Skan 3294169695Skan if (vd->vdev_removed && 3295169695Skan state == VDEV_STATE_CANT_OPEN && 3296169695Skan (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 3297169695Skan /* 3298169695Skan * If the previous state is set to VDEV_STATE_REMOVED, then this 3299169695Skan * device was previously marked removed and someone attempted to 3300169695Skan * reopen it. If this failed due to a nonexistent device, then 3301169695Skan * keep the device in the REMOVED state. We also let this be if 3302169695Skan * it is one of our special test online cases, which is only 3303169695Skan * attempting to online the device and shouldn't generate an FMA 3304169695Skan * fault. 3305169695Skan */ 3306169695Skan vd->vdev_state = VDEV_STATE_REMOVED; 3307169695Skan vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 3308169695Skan } else if (state == VDEV_STATE_REMOVED) { 3309169695Skan vd->vdev_removed = B_TRUE; 3310169695Skan } else if (state == VDEV_STATE_CANT_OPEN) { 3311169695Skan /* 3312169695Skan * If we fail to open a vdev during an import or recovery, we 3313169695Skan * mark it as "not available", which signifies that it was 3314169695Skan * never there to begin with. Failure to open such a device 3315169695Skan * is not considered an error. 3316169695Skan */ 3317169695Skan if ((spa_load_state(spa) == SPA_LOAD_IMPORT || 3318169695Skan spa_load_state(spa) == SPA_LOAD_RECOVER) && 3319169695Skan vd->vdev_ops->vdev_op_leaf) 3320169695Skan vd->vdev_not_present = 1; 3321169695Skan 3322169695Skan /* 3323169695Skan * Post the appropriate ereport. If the 'prevstate' field is 3324169695Skan * set to something other than VDEV_STATE_UNKNOWN, it indicates 3325169695Skan * that this is part of a vdev_reopen(). 
In this case, we don't 3326169695Skan * want to post the ereport if the device was already in the 3327169695Skan * CANT_OPEN state beforehand. 3328169695Skan * 3329169695Skan * If the 'checkremove' flag is set, then this is an attempt to 3330169695Skan * online the device in response to an insertion event. If we 3331169695Skan * hit this case, then we have detected an insertion event for a 3332169695Skan * faulted or offline device that wasn't in the removed state. 3333169695Skan * In this scenario, we don't post an ereport because we are 3334169695Skan * about to replace the device, or attempt an online with 3335169695Skan * vdev_forcefault, which will generate the fault for us. 3336169695Skan */ 3337169695Skan if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 3338169695Skan !vd->vdev_not_present && !vd->vdev_checkremove && 3339169695Skan vd != spa->spa_root_vdev) { 3340169695Skan const char *class; 3341169695Skan 3342169695Skan switch (aux) { 3343169695Skan case VDEV_AUX_OPEN_FAILED: 3344169695Skan class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 3345169695Skan break; 3346169695Skan case VDEV_AUX_CORRUPT_DATA: 3347169695Skan class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 3348169695Skan break; 3349169695Skan case VDEV_AUX_NO_REPLICAS: 3350169695Skan class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 3351169695Skan break; 3352169695Skan case VDEV_AUX_BAD_GUID_SUM: 3353169695Skan class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 3354169695Skan break; 3355169695Skan case VDEV_AUX_TOO_SMALL: 3356169695Skan class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 3357169695Skan break; 3358169695Skan case VDEV_AUX_BAD_LABEL: 3359169695Skan class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 3360169695Skan break; 3361169695Skan default: 3362169695Skan class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 3363169695Skan } 3364169695Skan 3365169695Skan zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 3366169695Skan } 3367169695Skan 3368169695Skan /* Erase any notion of persistent removed state */ 3369169695Skan 
vd->vdev_removed = B_FALSE; 3370169695Skan } else { 3371169695Skan vd->vdev_removed = B_FALSE; 3372169695Skan } 3373169695Skan 3374169695Skan if (!isopen && vd->vdev_parent) 3375169695Skan vdev_propagate_state(vd->vdev_parent); 3376169695Skan} 3377169695Skan 3378169695Skan/* 3379169695Skan * Check the vdev configuration to ensure that it's capable of supporting 3380169695Skan * a root pool. 3381169695Skan * 3382169695Skan * On Solaris, we do not support RAID-Z or partial configuration. In 3383169695Skan * addition, only a single top-level vdev is allowed and none of the 3384169695Skan * leaves can be wholedisks. 3385169695Skan * 3386169695Skan * For FreeBSD, we can boot from any configuration. There is a 3387169695Skan * limitation that the boot filesystem must be either uncompressed or 3388169695Skan * compresses with lzjb compression but I'm not sure how to enforce 3389169695Skan * that here. 3390169695Skan */ 3391169695Skanboolean_t 3392169695Skanvdev_is_bootable(vdev_t *vd) 3393169695Skan{ 3394169695Skan#ifdef sun 3395169695Skan if (!vd->vdev_ops->vdev_op_leaf) { 3396169695Skan char *vdev_type = vd->vdev_ops->vdev_op_type; 3397169695Skan 3398169695Skan if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 3399169695Skan vd->vdev_children > 1) { 3400169695Skan return (B_FALSE); 3401169695Skan } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 3402169695Skan strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 3403169695Skan return (B_FALSE); 3404169695Skan } 3405169695Skan } else if (vd->vdev_wholedisk == 1) { 3406169695Skan return (B_FALSE); 3407169695Skan } 3408169695Skan 3409169695Skan for (int c = 0; c < vd->vdev_children; c++) { 3410169695Skan if (!vdev_is_bootable(vd->vdev_child[c])) 3411169695Skan return (B_FALSE); 3412169695Skan } 3413169695Skan#endif /* sun */ 3414169695Skan return (B_TRUE); 3415169695Skan} 3416169695Skan 3417169695Skan/* 3418169695Skan * Load the state from the original vdev tree (ovd) which 3419169695Skan * we've retrieved from the MOS config 
object. If the original 3420169695Skan * vdev was offline or faulted then we transfer that state to the 3421169695Skan * device in the current vdev tree (nvd). 3422169695Skan */ 3423169695Skanvoid 3424169695Skanvdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3425169695Skan{ 3426169695Skan spa_t *spa = nvd->vdev_spa; 3427169695Skan 3428169695Skan ASSERT(nvd->vdev_top->vdev_islog); 3429169695Skan ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3430169695Skan ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3431169695Skan 3432169695Skan for (int c = 0; c < nvd->vdev_children; c++) 3433169695Skan vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3434169695Skan 3435169695Skan if (nvd->vdev_ops->vdev_op_leaf) { 3436169695Skan /* 3437169695Skan * Restore the persistent vdev state 3438169695Skan */ 3439169695Skan nvd->vdev_offline = ovd->vdev_offline; 3440169695Skan nvd->vdev_faulted = ovd->vdev_faulted; 3441169695Skan nvd->vdev_degraded = ovd->vdev_degraded; 3442169695Skan nvd->vdev_removed = ovd->vdev_removed; 3443169695Skan } 3444169695Skan} 3445169695Skan 3446169695Skan/* 3447169695Skan * Determine if a log device has valid content. If the vdev was 3448169695Skan * removed or faulted in the MOS config then we know that 3449169695Skan * the content on the log device has already been written to the pool. 3450169695Skan */ 3451169695Skanboolean_t 3452169695Skanvdev_log_state_valid(vdev_t *vd) 3453169695Skan{ 3454169695Skan if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3455169695Skan !vd->vdev_removed) 3456169695Skan return (B_TRUE); 3457169695Skan 3458169695Skan for (int c = 0; c < vd->vdev_children; c++) 3459169695Skan if (vdev_log_state_valid(vd->vdev_child[c])) 3460169695Skan return (B_TRUE); 3461169695Skan 3462169695Skan return (B_FALSE); 3463169695Skan} 3464169695Skan 3465169695Skan/* 3466169695Skan * Expand a vdev if possible. 
3467169695Skan */ 3468169695Skanvoid 3469169695Skanvdev_expand(vdev_t *vd, uint64_t txg) 3470169695Skan{ 3471169695Skan ASSERT(vd->vdev_top == vd); 3472169695Skan ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3473169695Skan 3474169695Skan if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { 3475169695Skan VERIFY(vdev_metaslab_init(vd, txg) == 0); 3476169695Skan vdev_config_dirty(vd); 3477169695Skan } 3478169695Skan} 3479169695Skan 3480169695Skan/* 3481169695Skan * Split a vdev. 3482169695Skan */ 3483169695Skanvoid 3484169695Skanvdev_split(vdev_t *vd) 3485169695Skan{ 3486169695Skan vdev_t *cvd, *pvd = vd->vdev_parent; 3487169695Skan 3488169695Skan vdev_remove_child(pvd, vd); 3489169695Skan vdev_compact_children(pvd); 3490169695Skan 3491169695Skan cvd = pvd->vdev_child[0]; 3492169695Skan if (pvd->vdev_children == 1) { 3493169695Skan vdev_remove_parent(cvd); 3494169695Skan cvd->vdev_splitting = B_TRUE; 3495169695Skan } 3496169695Skan vdev_propagate_state(cvd); 3497169695Skan} 3498169695Skan 3499169695Skanvoid 3500169695Skanvdev_deadman(vdev_t *vd) 3501169695Skan{ 3502169695Skan for (int c = 0; c < vd->vdev_children; c++) { 3503169695Skan vdev_t *cvd = vd->vdev_child[c]; 3504169695Skan 3505169695Skan vdev_deadman(cvd); 3506169695Skan } 3507169695Skan 3508169695Skan if (vd->vdev_ops->vdev_op_leaf) { 3509169695Skan vdev_queue_t *vq = &vd->vdev_queue; 3510169695Skan 3511169695Skan mutex_enter(&vq->vq_lock); 3512169695Skan if (avl_numnodes(&vq->vq_active_tree) > 0) { 3513169695Skan spa_t *spa = vd->vdev_spa; 3514169695Skan zio_t *fio; 3515169695Skan uint64_t delta; 3516169695Skan 3517169695Skan /* 3518169695Skan * Look at the head of all the pending queues, 3519169695Skan * if any I/O has been outstanding for longer than 3520169695Skan * the spa_deadman_synctime we panic the system. 
3521169695Skan */ 3522169695Skan fio = avl_first(&vq->vq_active_tree); 3523169695Skan delta = gethrtime() - fio->io_timestamp; 3524169695Skan if (delta > spa_deadman_synctime(spa)) { 3525169695Skan zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " 3526169695Skan "delta %lluns, last io %lluns", 3527169695Skan fio->io_timestamp, delta, 3528169695Skan vq->vq_io_complete_ts); 3529169695Skan fm_panic("I/O to pool '%s' appears to be " 3530169695Skan "hung on vdev guid %llu at '%s'.", 3531169695Skan spa_name(spa), 3532169695Skan (long long unsigned int) vd->vdev_guid, 3533169695Skan vd->vdev_path); 3534169695Skan } 3535169695Skan } 3536169695Skan mutex_exit(&vq->vq_lock); 3537169695Skan } 3538169695Skan} 3539169695Skan