/* vdev.c — reconstructed from an `svn annotate` snapshot at revision 288569 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
27168404Spjd */ 28168404Spjd 29168404Spjd#include <sys/zfs_context.h> 30168404Spjd#include <sys/fm/fs/zfs.h> 31168404Spjd#include <sys/spa.h> 32168404Spjd#include <sys/spa_impl.h> 33168404Spjd#include <sys/dmu.h> 34168404Spjd#include <sys/dmu_tx.h> 35168404Spjd#include <sys/vdev_impl.h> 36168404Spjd#include <sys/uberblock_impl.h> 37168404Spjd#include <sys/metaslab.h> 38168404Spjd#include <sys/metaslab_impl.h> 39168404Spjd#include <sys/space_map.h> 40262093Savg#include <sys/space_reftree.h> 41168404Spjd#include <sys/zio.h> 42168404Spjd#include <sys/zap.h> 43168404Spjd#include <sys/fs/zfs.h> 44185029Spjd#include <sys/arc.h> 45213197Smm#include <sys/zil.h> 46219089Spjd#include <sys/dsl_scan.h> 47240868Spjd#include <sys/trim_map.h> 48168404Spjd 49168404SpjdSYSCTL_DECL(_vfs_zfs); 50168404SpjdSYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); 51168404Spjd 52168404Spjd/* 53168404Spjd * Virtual device management. 54168404Spjd */ 55168404Spjd 56266122Ssmh/* 57254591Sgibbs * The limit for ZFS to automatically increase a top-level vdev's ashift 58254591Sgibbs * from logical ashift to physical ashift. 59254591Sgibbs * 60254591Sgibbs * Example: one or more 512B emulation child vdevs 61254591Sgibbs * child->vdev_ashift = 9 (512 bytes) 62254591Sgibbs * child->vdev_physical_ashift = 12 (4096 bytes) 63254591Sgibbs * zfs_max_auto_ashift = 11 (2048 bytes) 64266122Ssmh * zfs_min_auto_ashift = 9 (512 bytes) 65254591Sgibbs * 66266122Ssmh * On pool creation or the addition of a new top-level vdev, ZFS will 67266122Ssmh * increase the ashift of the top-level vdev to 2048 as limited by 68266122Ssmh * zfs_max_auto_ashift. 
69254591Sgibbs * 70254591Sgibbs * Example: one or more 512B emulation child vdevs 71254591Sgibbs * child->vdev_ashift = 9 (512 bytes) 72254591Sgibbs * child->vdev_physical_ashift = 12 (4096 bytes) 73254591Sgibbs * zfs_max_auto_ashift = 13 (8192 bytes) 74266122Ssmh * zfs_min_auto_ashift = 9 (512 bytes) 75254591Sgibbs * 76266122Ssmh * On pool creation or the addition of a new top-level vdev, ZFS will 77266122Ssmh * increase the ashift of the top-level vdev to 4096 to match the 78266122Ssmh * max vdev_physical_ashift. 79266122Ssmh * 80266122Ssmh * Example: one or more 512B emulation child vdevs 81266122Ssmh * child->vdev_ashift = 9 (512 bytes) 82266122Ssmh * child->vdev_physical_ashift = 9 (512 bytes) 83266122Ssmh * zfs_max_auto_ashift = 13 (8192 bytes) 84266122Ssmh * zfs_min_auto_ashift = 12 (4096 bytes) 85266122Ssmh * 86266122Ssmh * On pool creation or the addition of a new top-level vdev, ZFS will 87266122Ssmh * increase the ashift of the top-level vdev to 4096 to match the 88266122Ssmh * zfs_min_auto_ashift. 
89254591Sgibbs */ 90254591Sgibbsstatic uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; 91266122Ssmhstatic uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; 92254591Sgibbs 93254591Sgibbsstatic int 94254591Sgibbssysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) 95254591Sgibbs{ 96254591Sgibbs uint64_t val; 97254591Sgibbs int err; 98254591Sgibbs 99254591Sgibbs val = zfs_max_auto_ashift; 100254591Sgibbs err = sysctl_handle_64(oidp, &val, 0, req); 101254591Sgibbs if (err != 0 || req->newptr == NULL) 102254591Sgibbs return (err); 103254591Sgibbs 104266122Ssmh if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift) 105266122Ssmh return (EINVAL); 106254591Sgibbs 107254591Sgibbs zfs_max_auto_ashift = val; 108254591Sgibbs 109254591Sgibbs return (0); 110254591Sgibbs} 111254591SgibbsSYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, 112254591Sgibbs CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 113254591Sgibbs sysctl_vfs_zfs_max_auto_ashift, "QU", 114266122Ssmh "Max ashift used when optimising for logical -> physical sectors size on " 115266122Ssmh "new top-level vdevs."); 116254591Sgibbs 117266122Ssmhstatic int 118266122Ssmhsysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) 119266122Ssmh{ 120266122Ssmh uint64_t val; 121266122Ssmh int err; 122266122Ssmh 123266122Ssmh val = zfs_min_auto_ashift; 124266122Ssmh err = sysctl_handle_64(oidp, &val, 0, req); 125266122Ssmh if (err != 0 || req->newptr == NULL) 126266122Ssmh return (err); 127266122Ssmh 128266122Ssmh if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift) 129266122Ssmh return (EINVAL); 130266122Ssmh 131266122Ssmh zfs_min_auto_ashift = val; 132266122Ssmh 133266122Ssmh return (0); 134266122Ssmh} 135266122SsmhSYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, 136266122Ssmh CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 137266122Ssmh sysctl_vfs_zfs_min_auto_ashift, "QU", 138266122Ssmh "Min ashift used when creating new top-level vdevs."); 139266122Ssmh 140168404Spjdstatic vdev_ops_t *vdev_ops_table[] = { 
141168404Spjd &vdev_root_ops, 142168404Spjd &vdev_raidz_ops, 143168404Spjd &vdev_mirror_ops, 144168404Spjd &vdev_replacing_ops, 145168404Spjd &vdev_spare_ops, 146168404Spjd#ifdef _KERNEL 147168404Spjd &vdev_geom_ops, 148168404Spjd#else 149168404Spjd &vdev_disk_ops, 150185029Spjd#endif 151168404Spjd &vdev_file_ops, 152168404Spjd &vdev_missing_ops, 153219089Spjd &vdev_hole_ops, 154168404Spjd NULL 155168404Spjd}; 156168404Spjd 157168404Spjd 158168404Spjd/* 159273343Sdelphij * When a vdev is added, it will be divided into approximately (but no 160273343Sdelphij * more than) this number of metaslabs. 161273343Sdelphij */ 162273343Sdelphijint metaslabs_per_vdev = 200; 163273343SdelphijSYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN, 164273343Sdelphij &metaslabs_per_vdev, 0, 165273343Sdelphij "When a vdev is added, how many metaslabs the vdev should be divided into"); 166273343Sdelphij 167273343Sdelphij/* 168168404Spjd * Given a vdev type, return the appropriate ops vector. 169168404Spjd */ 170168404Spjdstatic vdev_ops_t * 171168404Spjdvdev_getops(const char *type) 172168404Spjd{ 173168404Spjd vdev_ops_t *ops, **opspp; 174168404Spjd 175168404Spjd for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 176168404Spjd if (strcmp(ops->vdev_op_type, type) == 0) 177168404Spjd break; 178168404Spjd 179168404Spjd return (ops); 180168404Spjd} 181168404Spjd 182168404Spjd/* 183168404Spjd * Default asize function: return the MAX of psize with the asize of 184168404Spjd * all children. This is what's used by anything other than RAID-Z. 
185168404Spjd */ 186168404Spjduint64_t 187168404Spjdvdev_default_asize(vdev_t *vd, uint64_t psize) 188168404Spjd{ 189168404Spjd uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 190168404Spjd uint64_t csize; 191168404Spjd 192219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 193168404Spjd csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 194168404Spjd asize = MAX(asize, csize); 195168404Spjd } 196168404Spjd 197168404Spjd return (asize); 198168404Spjd} 199168404Spjd 200168404Spjd/* 201219089Spjd * Get the minimum allocatable size. We define the allocatable size as 202219089Spjd * the vdev's asize rounded to the nearest metaslab. This allows us to 203219089Spjd * replace or attach devices which don't have the same physical size but 204219089Spjd * can still satisfy the same number of allocations. 205168404Spjd */ 206168404Spjduint64_t 207219089Spjdvdev_get_min_asize(vdev_t *vd) 208168404Spjd{ 209219089Spjd vdev_t *pvd = vd->vdev_parent; 210168404Spjd 211219089Spjd /* 212236155Smm * If our parent is NULL (inactive spare or cache) or is the root, 213219089Spjd * just return our own asize. 214219089Spjd */ 215219089Spjd if (pvd == NULL) 216219089Spjd return (vd->vdev_asize); 217168404Spjd 218168404Spjd /* 219219089Spjd * The top-level vdev just returns the allocatable size rounded 220219089Spjd * to the nearest metaslab. 221168404Spjd */ 222219089Spjd if (vd == vd->vdev_top) 223219089Spjd return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); 224168404Spjd 225219089Spjd /* 226219089Spjd * The allocatable space for a raidz vdev is N * sizeof(smallest child), 227219089Spjd * so each child must provide at least 1/Nth of its asize. 
228219089Spjd */ 229219089Spjd if (pvd->vdev_ops == &vdev_raidz_ops) 230219089Spjd return (pvd->vdev_min_asize / pvd->vdev_children); 231168404Spjd 232219089Spjd return (pvd->vdev_min_asize); 233219089Spjd} 234168404Spjd 235219089Spjdvoid 236219089Spjdvdev_set_min_asize(vdev_t *vd) 237219089Spjd{ 238219089Spjd vd->vdev_min_asize = vdev_get_min_asize(vd); 239219089Spjd 240219089Spjd for (int c = 0; c < vd->vdev_children; c++) 241219089Spjd vdev_set_min_asize(vd->vdev_child[c]); 242168404Spjd} 243168404Spjd 244168404Spjdvdev_t * 245168404Spjdvdev_lookup_top(spa_t *spa, uint64_t vdev) 246168404Spjd{ 247168404Spjd vdev_t *rvd = spa->spa_root_vdev; 248168404Spjd 249185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 250185029Spjd 251185029Spjd if (vdev < rvd->vdev_children) { 252185029Spjd ASSERT(rvd->vdev_child[vdev] != NULL); 253168404Spjd return (rvd->vdev_child[vdev]); 254185029Spjd } 255168404Spjd 256168404Spjd return (NULL); 257168404Spjd} 258168404Spjd 259168404Spjdvdev_t * 260168404Spjdvdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 261168404Spjd{ 262168404Spjd vdev_t *mvd; 263168404Spjd 264168404Spjd if (vd->vdev_guid == guid) 265168404Spjd return (vd); 266168404Spjd 267219089Spjd for (int c = 0; c < vd->vdev_children; c++) 268168404Spjd if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 269168404Spjd NULL) 270168404Spjd return (mvd); 271168404Spjd 272168404Spjd return (NULL); 273168404Spjd} 274168404Spjd 275288569Smavstatic int 276288569Smavvdev_count_leaves_impl(vdev_t *vd) 277288569Smav{ 278288569Smav int n = 0; 279288569Smav 280288569Smav if (vd->vdev_ops->vdev_op_leaf) 281288569Smav return (1); 282288569Smav 283288569Smav for (int c = 0; c < vd->vdev_children; c++) 284288569Smav n += vdev_count_leaves_impl(vd->vdev_child[c]); 285288569Smav 286288569Smav return (n); 287288569Smav} 288288569Smav 289288569Smavint 290288569Smavvdev_count_leaves(spa_t *spa) 291288569Smav{ 292288569Smav return (vdev_count_leaves_impl(spa->spa_root_vdev)); 
293288569Smav} 294288569Smav 295168404Spjdvoid 296168404Spjdvdev_add_child(vdev_t *pvd, vdev_t *cvd) 297168404Spjd{ 298168404Spjd size_t oldsize, newsize; 299168404Spjd uint64_t id = cvd->vdev_id; 300168404Spjd vdev_t **newchild; 301285001Savg spa_t *spa = cvd->vdev_spa; 302168404Spjd 303285001Savg ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 304168404Spjd ASSERT(cvd->vdev_parent == NULL); 305168404Spjd 306168404Spjd cvd->vdev_parent = pvd; 307168404Spjd 308168404Spjd if (pvd == NULL) 309168404Spjd return; 310168404Spjd 311168404Spjd ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 312168404Spjd 313168404Spjd oldsize = pvd->vdev_children * sizeof (vdev_t *); 314168404Spjd pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 315168404Spjd newsize = pvd->vdev_children * sizeof (vdev_t *); 316168404Spjd 317168404Spjd newchild = kmem_zalloc(newsize, KM_SLEEP); 318168404Spjd if (pvd->vdev_child != NULL) { 319168404Spjd bcopy(pvd->vdev_child, newchild, oldsize); 320168404Spjd kmem_free(pvd->vdev_child, oldsize); 321168404Spjd } 322168404Spjd 323168404Spjd pvd->vdev_child = newchild; 324168404Spjd pvd->vdev_child[id] = cvd; 325168404Spjd 326168404Spjd cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 327168404Spjd ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 328168404Spjd 329168404Spjd /* 330168404Spjd * Walk up all ancestors to update guid sum. 
331168404Spjd */ 332168404Spjd for (; pvd != NULL; pvd = pvd->vdev_parent) 333168404Spjd pvd->vdev_guid_sum += cvd->vdev_guid_sum; 334168404Spjd} 335168404Spjd 336168404Spjdvoid 337168404Spjdvdev_remove_child(vdev_t *pvd, vdev_t *cvd) 338168404Spjd{ 339168404Spjd int c; 340168404Spjd uint_t id = cvd->vdev_id; 341168404Spjd 342168404Spjd ASSERT(cvd->vdev_parent == pvd); 343168404Spjd 344168404Spjd if (pvd == NULL) 345168404Spjd return; 346168404Spjd 347168404Spjd ASSERT(id < pvd->vdev_children); 348168404Spjd ASSERT(pvd->vdev_child[id] == cvd); 349168404Spjd 350168404Spjd pvd->vdev_child[id] = NULL; 351168404Spjd cvd->vdev_parent = NULL; 352168404Spjd 353168404Spjd for (c = 0; c < pvd->vdev_children; c++) 354168404Spjd if (pvd->vdev_child[c]) 355168404Spjd break; 356168404Spjd 357168404Spjd if (c == pvd->vdev_children) { 358168404Spjd kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 359168404Spjd pvd->vdev_child = NULL; 360168404Spjd pvd->vdev_children = 0; 361168404Spjd } 362168404Spjd 363168404Spjd /* 364168404Spjd * Walk up all ancestors to update guid sum. 365168404Spjd */ 366168404Spjd for (; pvd != NULL; pvd = pvd->vdev_parent) 367168404Spjd pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 368168404Spjd} 369168404Spjd 370168404Spjd/* 371168404Spjd * Remove any holes in the child array. 
372168404Spjd */ 373168404Spjdvoid 374168404Spjdvdev_compact_children(vdev_t *pvd) 375168404Spjd{ 376168404Spjd vdev_t **newchild, *cvd; 377168404Spjd int oldc = pvd->vdev_children; 378219089Spjd int newc; 379168404Spjd 380185029Spjd ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 381168404Spjd 382219089Spjd for (int c = newc = 0; c < oldc; c++) 383168404Spjd if (pvd->vdev_child[c]) 384168404Spjd newc++; 385168404Spjd 386168404Spjd newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 387168404Spjd 388219089Spjd for (int c = newc = 0; c < oldc; c++) { 389168404Spjd if ((cvd = pvd->vdev_child[c]) != NULL) { 390168404Spjd newchild[newc] = cvd; 391168404Spjd cvd->vdev_id = newc++; 392168404Spjd } 393168404Spjd } 394168404Spjd 395168404Spjd kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 396168404Spjd pvd->vdev_child = newchild; 397168404Spjd pvd->vdev_children = newc; 398168404Spjd} 399168404Spjd 400168404Spjd/* 401168404Spjd * Allocate and minimally initialize a vdev_t. 402168404Spjd */ 403219089Spjdvdev_t * 404168404Spjdvdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 405168404Spjd{ 406168404Spjd vdev_t *vd; 407168404Spjd 408168404Spjd vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 409168404Spjd 410168404Spjd if (spa->spa_root_vdev == NULL) { 411168404Spjd ASSERT(ops == &vdev_root_ops); 412168404Spjd spa->spa_root_vdev = vd; 413228103Smm spa->spa_load_guid = spa_generate_guid(NULL); 414168404Spjd } 415168404Spjd 416219089Spjd if (guid == 0 && ops != &vdev_hole_ops) { 417168404Spjd if (spa->spa_root_vdev == vd) { 418168404Spjd /* 419168404Spjd * The root vdev's guid will also be the pool guid, 420168404Spjd * which must be unique among all pools. 421168404Spjd */ 422219089Spjd guid = spa_generate_guid(NULL); 423168404Spjd } else { 424168404Spjd /* 425168404Spjd * Any other vdev's guid must be unique within the pool. 
426168404Spjd */ 427219089Spjd guid = spa_generate_guid(spa); 428168404Spjd } 429168404Spjd ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 430168404Spjd } 431168404Spjd 432168404Spjd vd->vdev_spa = spa; 433168404Spjd vd->vdev_id = id; 434168404Spjd vd->vdev_guid = guid; 435168404Spjd vd->vdev_guid_sum = guid; 436168404Spjd vd->vdev_ops = ops; 437168404Spjd vd->vdev_state = VDEV_STATE_CLOSED; 438219089Spjd vd->vdev_ishole = (ops == &vdev_hole_ops); 439168404Spjd 440168404Spjd mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 441168404Spjd mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 442185029Spjd mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); 443209962Smm for (int t = 0; t < DTL_TYPES; t++) { 444262093Savg vd->vdev_dtl[t] = range_tree_create(NULL, NULL, 445209962Smm &vd->vdev_dtl_lock); 446209962Smm } 447168404Spjd txg_list_create(&vd->vdev_ms_list, 448168404Spjd offsetof(struct metaslab, ms_txg_node)); 449168404Spjd txg_list_create(&vd->vdev_dtl_list, 450168404Spjd offsetof(struct vdev, vdev_dtl_node)); 451168404Spjd vd->vdev_stat.vs_timestamp = gethrtime(); 452185029Spjd vdev_queue_init(vd); 453185029Spjd vdev_cache_init(vd); 454168404Spjd 455168404Spjd return (vd); 456168404Spjd} 457168404Spjd 458168404Spjd/* 459168404Spjd * Allocate a new vdev. The 'alloctype' is used to control whether we are 460168404Spjd * creating a new vdev or loading an existing one - the behavior is slightly 461168404Spjd * different for each case. 
462168404Spjd */ 463168404Spjdint 464168404Spjdvdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 465168404Spjd int alloctype) 466168404Spjd{ 467168404Spjd vdev_ops_t *ops; 468168404Spjd char *type; 469185029Spjd uint64_t guid = 0, islog, nparity; 470168404Spjd vdev_t *vd; 471168404Spjd 472185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 473168404Spjd 474168404Spjd if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 475249195Smm return (SET_ERROR(EINVAL)); 476168404Spjd 477168404Spjd if ((ops = vdev_getops(type)) == NULL) 478249195Smm return (SET_ERROR(EINVAL)); 479168404Spjd 480168404Spjd /* 481168404Spjd * If this is a load, get the vdev guid from the nvlist. 482168404Spjd * Otherwise, vdev_alloc_common() will generate one for us. 483168404Spjd */ 484168404Spjd if (alloctype == VDEV_ALLOC_LOAD) { 485168404Spjd uint64_t label_id; 486168404Spjd 487168404Spjd if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 488168404Spjd label_id != id) 489249195Smm return (SET_ERROR(EINVAL)); 490168404Spjd 491168404Spjd if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 492249195Smm return (SET_ERROR(EINVAL)); 493168404Spjd } else if (alloctype == VDEV_ALLOC_SPARE) { 494168404Spjd if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 495249195Smm return (SET_ERROR(EINVAL)); 496185029Spjd } else if (alloctype == VDEV_ALLOC_L2CACHE) { 497185029Spjd if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 498249195Smm return (SET_ERROR(EINVAL)); 499219089Spjd } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { 500219089Spjd if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 501249195Smm return (SET_ERROR(EINVAL)); 502168404Spjd } 503168404Spjd 504168404Spjd /* 505168404Spjd * The first allocated vdev must be of type 'root'. 
506168404Spjd */ 507168404Spjd if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 508249195Smm return (SET_ERROR(EINVAL)); 509168404Spjd 510185029Spjd /* 511185029Spjd * Determine whether we're a log vdev. 512185029Spjd */ 513185029Spjd islog = 0; 514185029Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 515185029Spjd if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 516249195Smm return (SET_ERROR(ENOTSUP)); 517168404Spjd 518219089Spjd if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) 519249195Smm return (SET_ERROR(ENOTSUP)); 520219089Spjd 521168404Spjd /* 522185029Spjd * Set the nparity property for RAID-Z vdevs. 523168404Spjd */ 524185029Spjd nparity = -1ULL; 525168404Spjd if (ops == &vdev_raidz_ops) { 526168404Spjd if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 527185029Spjd &nparity) == 0) { 528219089Spjd if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 529249195Smm return (SET_ERROR(EINVAL)); 530168404Spjd /* 531219089Spjd * Previous versions could only support 1 or 2 parity 532219089Spjd * device. 533168404Spjd */ 534219089Spjd if (nparity > 1 && 535219089Spjd spa_version(spa) < SPA_VERSION_RAIDZ2) 536249195Smm return (SET_ERROR(ENOTSUP)); 537219089Spjd if (nparity > 2 && 538219089Spjd spa_version(spa) < SPA_VERSION_RAIDZ3) 539249195Smm return (SET_ERROR(ENOTSUP)); 540168404Spjd } else { 541168404Spjd /* 542168404Spjd * We require the parity to be specified for SPAs that 543168404Spjd * support multiple parity levels. 544168404Spjd */ 545219089Spjd if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 546249195Smm return (SET_ERROR(EINVAL)); 547168404Spjd /* 548168404Spjd * Otherwise, we default to 1 parity device for RAID-Z. 
549168404Spjd */ 550185029Spjd nparity = 1; 551168404Spjd } 552168404Spjd } else { 553185029Spjd nparity = 0; 554168404Spjd } 555185029Spjd ASSERT(nparity != -1ULL); 556168404Spjd 557185029Spjd vd = vdev_alloc_common(spa, id, guid, ops); 558185029Spjd 559185029Spjd vd->vdev_islog = islog; 560185029Spjd vd->vdev_nparity = nparity; 561185029Spjd 562185029Spjd if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 563185029Spjd vd->vdev_path = spa_strdup(vd->vdev_path); 564185029Spjd if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 565185029Spjd vd->vdev_devid = spa_strdup(vd->vdev_devid); 566185029Spjd if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 567185029Spjd &vd->vdev_physpath) == 0) 568185029Spjd vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 569209962Smm if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) 570209962Smm vd->vdev_fru = spa_strdup(vd->vdev_fru); 571185029Spjd 572168404Spjd /* 573168404Spjd * Set the whole_disk property. If it's not specified, leave the value 574168404Spjd * as -1. 575168404Spjd */ 576168404Spjd if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 577168404Spjd &vd->vdev_wholedisk) != 0) 578168404Spjd vd->vdev_wholedisk = -1ULL; 579168404Spjd 580168404Spjd /* 581168404Spjd * Look for the 'not present' flag. This will only be set if the device 582168404Spjd * was not present at the time of import. 583168404Spjd */ 584209962Smm (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 585209962Smm &vd->vdev_not_present); 586168404Spjd 587168404Spjd /* 588168404Spjd * Get the alignment requirement. 589168404Spjd */ 590168404Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 591168404Spjd 592168404Spjd /* 593219089Spjd * Retrieve the vdev creation time. 
594219089Spjd */ 595219089Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 596219089Spjd &vd->vdev_crtxg); 597219089Spjd 598219089Spjd /* 599168404Spjd * If we're a top-level vdev, try to load the allocation parameters. 600168404Spjd */ 601219089Spjd if (parent && !parent->vdev_parent && 602219089Spjd (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { 603168404Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 604168404Spjd &vd->vdev_ms_array); 605168404Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 606168404Spjd &vd->vdev_ms_shift); 607168404Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 608168404Spjd &vd->vdev_asize); 609219089Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, 610219089Spjd &vd->vdev_removing); 611168404Spjd } 612168404Spjd 613230514Smm if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { 614219089Spjd ASSERT(alloctype == VDEV_ALLOC_LOAD || 615219089Spjd alloctype == VDEV_ALLOC_ADD || 616219089Spjd alloctype == VDEV_ALLOC_SPLIT || 617219089Spjd alloctype == VDEV_ALLOC_ROOTPOOL); 618219089Spjd vd->vdev_mg = metaslab_group_create(islog ? 619219089Spjd spa_log_class(spa) : spa_normal_class(spa), vd); 620219089Spjd } 621219089Spjd 622168404Spjd /* 623185029Spjd * If we're a leaf vdev, try to load the DTL object and other state. 
624168404Spjd */ 625185029Spjd if (vd->vdev_ops->vdev_op_leaf && 626219089Spjd (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || 627219089Spjd alloctype == VDEV_ALLOC_ROOTPOOL)) { 628185029Spjd if (alloctype == VDEV_ALLOC_LOAD) { 629185029Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 630262093Savg &vd->vdev_dtl_object); 631185029Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 632185029Spjd &vd->vdev_unspare); 633185029Spjd } 634219089Spjd 635219089Spjd if (alloctype == VDEV_ALLOC_ROOTPOOL) { 636219089Spjd uint64_t spare = 0; 637219089Spjd 638219089Spjd if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 639219089Spjd &spare) == 0 && spare) 640219089Spjd spa_spare_add(vd); 641219089Spjd } 642219089Spjd 643168404Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 644168404Spjd &vd->vdev_offline); 645185029Spjd 646254112Sdelphij (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, 647254112Sdelphij &vd->vdev_resilver_txg); 648219089Spjd 649185029Spjd /* 650185029Spjd * When importing a pool, we want to ignore the persistent fault 651185029Spjd * state, as the diagnosis made on another system may not be 652219089Spjd * valid in the current context. Local vdevs will 653219089Spjd * remain in the faulted state. 
654185029Spjd */ 655219089Spjd if (spa_load_state(spa) == SPA_LOAD_OPEN) { 656185029Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 657185029Spjd &vd->vdev_faulted); 658185029Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 659185029Spjd &vd->vdev_degraded); 660185029Spjd (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 661185029Spjd &vd->vdev_removed); 662219089Spjd 663219089Spjd if (vd->vdev_faulted || vd->vdev_degraded) { 664219089Spjd char *aux; 665219089Spjd 666219089Spjd vd->vdev_label_aux = 667219089Spjd VDEV_AUX_ERR_EXCEEDED; 668219089Spjd if (nvlist_lookup_string(nv, 669219089Spjd ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && 670219089Spjd strcmp(aux, "external") == 0) 671219089Spjd vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 672219089Spjd } 673185029Spjd } 674168404Spjd } 675168404Spjd 676168404Spjd /* 677168404Spjd * Add ourselves to the parent's list of children. 678168404Spjd */ 679168404Spjd vdev_add_child(parent, vd); 680168404Spjd 681168404Spjd *vdp = vd; 682168404Spjd 683168404Spjd return (0); 684168404Spjd} 685168404Spjd 686168404Spjdvoid 687168404Spjdvdev_free(vdev_t *vd) 688168404Spjd{ 689185029Spjd spa_t *spa = vd->vdev_spa; 690168404Spjd 691168404Spjd /* 692168404Spjd * vdev_free() implies closing the vdev first. This is simpler than 693168404Spjd * trying to ensure complicated semantics for all callers. 694168404Spjd */ 695168404Spjd vdev_close(vd); 696168404Spjd 697185029Spjd ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 698219089Spjd ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 699168404Spjd 700168404Spjd /* 701168404Spjd * Free all children. 702168404Spjd */ 703219089Spjd for (int c = 0; c < vd->vdev_children; c++) 704168404Spjd vdev_free(vd->vdev_child[c]); 705168404Spjd 706168404Spjd ASSERT(vd->vdev_child == NULL); 707168404Spjd ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 708168404Spjd 709168404Spjd /* 710168404Spjd * Discard allocation state. 
711168404Spjd */ 712219089Spjd if (vd->vdev_mg != NULL) { 713168404Spjd vdev_metaslab_fini(vd); 714219089Spjd metaslab_group_destroy(vd->vdev_mg); 715219089Spjd } 716168404Spjd 717240415Smm ASSERT0(vd->vdev_stat.vs_space); 718240415Smm ASSERT0(vd->vdev_stat.vs_dspace); 719240415Smm ASSERT0(vd->vdev_stat.vs_alloc); 720168404Spjd 721168404Spjd /* 722168404Spjd * Remove this vdev from its parent's child list. 723168404Spjd */ 724168404Spjd vdev_remove_child(vd->vdev_parent, vd); 725168404Spjd 726168404Spjd ASSERT(vd->vdev_parent == NULL); 727168404Spjd 728185029Spjd /* 729185029Spjd * Clean up vdev structure. 730185029Spjd */ 731185029Spjd vdev_queue_fini(vd); 732185029Spjd vdev_cache_fini(vd); 733185029Spjd 734185029Spjd if (vd->vdev_path) 735185029Spjd spa_strfree(vd->vdev_path); 736185029Spjd if (vd->vdev_devid) 737185029Spjd spa_strfree(vd->vdev_devid); 738185029Spjd if (vd->vdev_physpath) 739185029Spjd spa_strfree(vd->vdev_physpath); 740209962Smm if (vd->vdev_fru) 741209962Smm spa_strfree(vd->vdev_fru); 742185029Spjd 743185029Spjd if (vd->vdev_isspare) 744185029Spjd spa_spare_remove(vd); 745185029Spjd if (vd->vdev_isl2cache) 746185029Spjd spa_l2cache_remove(vd); 747185029Spjd 748185029Spjd txg_list_destroy(&vd->vdev_ms_list); 749185029Spjd txg_list_destroy(&vd->vdev_dtl_list); 750209962Smm 751185029Spjd mutex_enter(&vd->vdev_dtl_lock); 752262093Savg space_map_close(vd->vdev_dtl_sm); 753209962Smm for (int t = 0; t < DTL_TYPES; t++) { 754262093Savg range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); 755262093Savg range_tree_destroy(vd->vdev_dtl[t]); 756209962Smm } 757185029Spjd mutex_exit(&vd->vdev_dtl_lock); 758209962Smm 759185029Spjd mutex_destroy(&vd->vdev_dtl_lock); 760185029Spjd mutex_destroy(&vd->vdev_stat_lock); 761185029Spjd mutex_destroy(&vd->vdev_probe_lock); 762185029Spjd 763185029Spjd if (vd == spa->spa_root_vdev) 764185029Spjd spa->spa_root_vdev = NULL; 765185029Spjd 766185029Spjd kmem_free(vd, sizeof (vdev_t)); 767168404Spjd} 768168404Spjd 769168404Spjd/* 
770168404Spjd * Transfer top-level vdev state from svd to tvd. 771168404Spjd */ 772168404Spjdstatic void 773168404Spjdvdev_top_transfer(vdev_t *svd, vdev_t *tvd) 774168404Spjd{ 775168404Spjd spa_t *spa = svd->vdev_spa; 776168404Spjd metaslab_t *msp; 777168404Spjd vdev_t *vd; 778168404Spjd int t; 779168404Spjd 780168404Spjd ASSERT(tvd == tvd->vdev_top); 781168404Spjd 782168404Spjd tvd->vdev_ms_array = svd->vdev_ms_array; 783168404Spjd tvd->vdev_ms_shift = svd->vdev_ms_shift; 784168404Spjd tvd->vdev_ms_count = svd->vdev_ms_count; 785168404Spjd 786168404Spjd svd->vdev_ms_array = 0; 787168404Spjd svd->vdev_ms_shift = 0; 788168404Spjd svd->vdev_ms_count = 0; 789168404Spjd 790230514Smm if (tvd->vdev_mg) 791230514Smm ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); 792168404Spjd tvd->vdev_mg = svd->vdev_mg; 793168404Spjd tvd->vdev_ms = svd->vdev_ms; 794168404Spjd 795168404Spjd svd->vdev_mg = NULL; 796168404Spjd svd->vdev_ms = NULL; 797168404Spjd 798168404Spjd if (tvd->vdev_mg != NULL) 799168404Spjd tvd->vdev_mg->mg_vd = tvd; 800168404Spjd 801168404Spjd tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 802168404Spjd tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 803168404Spjd tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 804168404Spjd 805168404Spjd svd->vdev_stat.vs_alloc = 0; 806168404Spjd svd->vdev_stat.vs_space = 0; 807168404Spjd svd->vdev_stat.vs_dspace = 0; 808168404Spjd 809168404Spjd for (t = 0; t < TXG_SIZE; t++) { 810168404Spjd while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 811168404Spjd (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 812168404Spjd while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 813168404Spjd (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 814168404Spjd if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 815168404Spjd (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 816168404Spjd } 817168404Spjd 818185029Spjd if (list_link_active(&svd->vdev_config_dirty_node)) { 819168404Spjd vdev_config_clean(svd); 
820168404Spjd vdev_config_dirty(tvd); 821168404Spjd } 822168404Spjd 823185029Spjd if (list_link_active(&svd->vdev_state_dirty_node)) { 824185029Spjd vdev_state_clean(svd); 825185029Spjd vdev_state_dirty(tvd); 826185029Spjd } 827168404Spjd 828168404Spjd tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 829168404Spjd svd->vdev_deflate_ratio = 0; 830185029Spjd 831185029Spjd tvd->vdev_islog = svd->vdev_islog; 832185029Spjd svd->vdev_islog = 0; 833168404Spjd} 834168404Spjd 835168404Spjdstatic void 836168404Spjdvdev_top_update(vdev_t *tvd, vdev_t *vd) 837168404Spjd{ 838168404Spjd if (vd == NULL) 839168404Spjd return; 840168404Spjd 841168404Spjd vd->vdev_top = tvd; 842168404Spjd 843219089Spjd for (int c = 0; c < vd->vdev_children; c++) 844168404Spjd vdev_top_update(tvd, vd->vdev_child[c]); 845168404Spjd} 846168404Spjd 847168404Spjd/* 848168404Spjd * Add a mirror/replacing vdev above an existing vdev. 849168404Spjd */ 850168404Spjdvdev_t * 851168404Spjdvdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 852168404Spjd{ 853168404Spjd spa_t *spa = cvd->vdev_spa; 854168404Spjd vdev_t *pvd = cvd->vdev_parent; 855168404Spjd vdev_t *mvd; 856168404Spjd 857185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 858168404Spjd 859168404Spjd mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 860168404Spjd 861168404Spjd mvd->vdev_asize = cvd->vdev_asize; 862219089Spjd mvd->vdev_min_asize = cvd->vdev_min_asize; 863236155Smm mvd->vdev_max_asize = cvd->vdev_max_asize; 864168404Spjd mvd->vdev_ashift = cvd->vdev_ashift; 865254591Sgibbs mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; 866254591Sgibbs mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; 867168404Spjd mvd->vdev_state = cvd->vdev_state; 868219089Spjd mvd->vdev_crtxg = cvd->vdev_crtxg; 869168404Spjd 870168404Spjd vdev_remove_child(pvd, cvd); 871168404Spjd vdev_add_child(pvd, mvd); 872168404Spjd cvd->vdev_id = mvd->vdev_children; 873168404Spjd vdev_add_child(mvd, cvd); 874168404Spjd 
        vdev_top_update(cvd->vdev_top, cvd->vdev_top);

        /* If mvd became the top-level vdev, move top-level state onto it. */
        if (mvd == mvd->vdev_top)
                vdev_top_transfer(cvd, mvd);

        return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
        vdev_t *mvd = cvd->vdev_parent;
        vdev_t *pvd = mvd->vdev_parent;

        ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

        /* Only a single-child mirror/replacing/spare vdev may be collapsed. */
        ASSERT(mvd->vdev_children == 1);
        ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
            mvd->vdev_ops == &vdev_replacing_ops ||
            mvd->vdev_ops == &vdev_spare_ops);
        cvd->vdev_ashift = mvd->vdev_ashift;
        cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
        cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;

        vdev_remove_child(mvd, cvd);
        vdev_remove_child(pvd, mvd);

        /*
         * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
         * Otherwise, we could have detached an offline device, and when we
         * go to import the pool we'll think we have two top-level vdevs,
         * instead of a different version of the same top-level vdev.
         */
        if (mvd->vdev_top == mvd) {
                /* Shift cvd's guid (and guid sum) so it matches mvd's. */
                uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
                cvd->vdev_orig_guid = cvd->vdev_guid;
                cvd->vdev_guid += guid_delta;
                cvd->vdev_guid_sum += guid_delta;
        }
        cvd->vdev_id = mvd->vdev_id;
        vdev_add_child(pvd, cvd);
        vdev_top_update(cvd->vdev_top, cvd->vdev_top);

        if (cvd == cvd->vdev_top)
                vdev_top_transfer(mvd, cvd);

        ASSERT(mvd->vdev_children == 0);
        vdev_free(mvd);
}

/*
 * (Re)build the metaslab array for a top-level vdev, growing it from
 * vdev_ms_count to the count implied by vdev_asize >> vdev_ms_shift.
 * For txg == 0 (pool load), metaslab object numbers are read from the
 * on-disk ms_array; otherwise new metaslabs are created in this txg.
 * Returns 0 or an error from dmu_read()/metaslab_init().
 */
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
        spa_t *spa = vd->vdev_spa;
        objset_t *mos = spa->spa_meta_objset;
        uint64_t m;
        uint64_t oldc = vd->vdev_ms_count;
        uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
        metaslab_t **mspp;
        int error;

        ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

        /*
         * This vdev is not being allocated from yet or is a hole.
         */
        if (vd->vdev_ms_shift == 0)
                return (0);

        ASSERT(!vd->vdev_ishole);

        /*
         * Compute the raidz-deflation ratio.  Note, we hard-code
         * in 128k (1 << 17) because it is the "typical" blocksize.
         * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
         * otherwise it would inconsistently account for existing bp's.
         */
        vd->vdev_deflate_ratio = (1 << 17) /
            (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);

        ASSERT(oldc <= newc);

        /* Grow the metaslab pointer array, preserving the old entries. */
        mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

        if (oldc != 0) {
                bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
                kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
        }

        vd->vdev_ms = mspp;
        vd->vdev_ms_count = newc;

        for (m = oldc; m < newc; m++) {
                uint64_t object = 0;

                /* On load, look up the existing space map object number. */
                if (txg == 0) {
                        error = dmu_read(mos, vd->vdev_ms_array,
                            m * sizeof (uint64_t), sizeof (uint64_t), &object,
                            DMU_READ_PREFETCH);
                        if (error)
                                return (error);
                }

                error = metaslab_init(vd->vdev_mg, m, object, txg,
                    &(vd->vdev_ms[m]));
                if (error)
                        return (error);
        }

        if (txg == 0)
                spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

        /*
         * If the vdev is being removed we don't activate
         * the metaslabs since we want to ensure that no new
         * allocations are performed on this device.
         */
        if (oldc == 0 && !vd->vdev_removing)
                metaslab_group_activate(vd->vdev_mg);

        if (txg == 0)
                spa_config_exit(spa, SCL_ALLOC, FTAG);

        return (0);
}

/*
 * Tear down a vdev's metaslab array: passivate the group, fini each
 * metaslab, and free the pointer array.
 */
void
vdev_metaslab_fini(vdev_t *vd)
{
        uint64_t m;
        uint64_t count = vd->vdev_ms_count;

        if (vd->vdev_ms != NULL) {
                metaslab_group_passivate(vd->vdev_mg);
                for (m = 0; m < count; m++) {
                        metaslab_t *msp = vd->vdev_ms[m];

                        if (msp != NULL)
                                metaslab_fini(msp);
                }
                kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
                vd->vdev_ms = NULL;
        }
}

/* Per-probe state shared by the read/write/null stages of vdev_probe(). */
typedef struct vdev_probe_stats {
        boolean_t       vps_readable;
        boolean_t       vps_writeable;
        int             vps_flags;
} vdev_probe_stats_t;

/*
 * Completion callback for the probe zios issued by vdev_probe().
 * Reads that succeed are turned around into writes of the same data;
 * the final ZIO_TYPE_NULL stage aggregates the results.
 */
static void
vdev_probe_done(zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        vdev_t *vd = zio->io_vd;
        vdev_probe_stats_t *vps = zio->io_private;

        ASSERT(vd->vdev_probe_zio != NULL);

        if (zio->io_type == ZIO_TYPE_READ) {
                if (zio->io_error == 0)
                        vps->vps_readable = 1;
                /* Echo a successful read back as a write (r/w pools only). */
                if (zio->io_error == 0 && spa_writeable(spa)) {
                        zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
                            zio->io_offset, zio->io_size, zio->io_data,
                            ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
                            ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
                } else {
                        zio_buf_free(zio->io_data, zio->io_size);
                }
        } else if (zio->io_type ==
ZIO_TYPE_WRITE) {
                if (zio->io_error == 0)
                        vps->vps_writeable = 1;
                zio_buf_free(zio->io_data, zio->io_size);
        } else if (zio->io_type == ZIO_TYPE_NULL) {
                /* Final stage: fold the probe results into the vdev state. */
                zio_t *pio;

                vd->vdev_cant_read |= !vps->vps_readable;
                vd->vdev_cant_write |= !vps->vps_writeable;

                if (vdev_readable(vd) &&
                    (vdev_writeable(vd) || !spa_writeable(spa))) {
                        zio->io_error = 0;
                } else {
                        ASSERT(zio->io_error != 0);
                        zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
                            spa, vd, NULL, 0, 0);
                        zio->io_error = SET_ERROR(ENXIO);
                }

                mutex_enter(&vd->vdev_probe_lock);
                ASSERT(vd->vdev_probe_zio == zio);
                vd->vdev_probe_zio = NULL;
                mutex_exit(&vd->vdev_probe_lock);

                /* Propagate ENXIO to parents that can no longer reach vd. */
                while ((pio = zio_walk_parents(zio)) != NULL)
                        if (!vdev_accessible(vd, pio))
                                pio->io_error = SET_ERROR(ENXIO);

                kmem_free(vps, sizeof (*vps));
        }
}

/*
 * Determine whether this device is accessible.
 *
 * Read and write to several known locations: the pad regions of each
 * vdev label but the first, which we leave alone in case it contains
 * a VTOC.
 */
zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
{
        spa_t *spa = vd->vdev_spa;
        vdev_probe_stats_t *vps = NULL;
        zio_t *pio;

        ASSERT(vd->vdev_ops->vdev_op_leaf);

        /*
         * Don't probe the probe.
         */
        if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
                return (NULL);

        /*
         * To prevent 'probe storms' when a device fails, we create
         * just one probe i/o at a time.  All zios that want to probe
         * this vdev will become parents of the probe io.
         */
        mutex_enter(&vd->vdev_probe_lock);

        if ((pio = vd->vdev_probe_zio) == NULL) {
                vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

                vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
                    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
                    ZIO_FLAG_TRYHARD;

                if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
                        /*
                         * vdev_cant_read and vdev_cant_write can only
                         * transition from TRUE to FALSE when we have the
                         * SCL_ZIO lock as writer; otherwise they can only
                         * transition from FALSE to TRUE.  This ensures that
                         * any zio looking at these values can assume that
                         * failures persist for the life of the I/O.  That's
                         * important because when a device has intermittent
                         * connectivity problems, we want to ensure that
                         * they're ascribed to the device (ENXIO) and not
                         * the zio (EIO).
                         *
                         * Since we hold SCL_ZIO as writer here, clear both
                         * values so the probe can reevaluate from first
                         * principles.
                         */
                        vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
                        vd->vdev_cant_read = B_FALSE;
                        vd->vdev_cant_write = B_FALSE;
                }

                vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
                    vdev_probe_done, vps,
                    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

                /*
                 * We can't change the vdev state in this context, so we
                 * kick off an async task to do it on our behalf.
                 */
                if (zio != NULL) {
                        vd->vdev_probe_wanted = B_TRUE;
                        spa_async_request(spa, SPA_ASYNC_PROBE);
                }
        }

        if (zio != NULL)
                zio_add_child(zio, pio);

        mutex_exit(&vd->vdev_probe_lock);

        /* vps == NULL means another thread's probe is already in flight. */
        if (vps == NULL) {
                ASSERT(zio != NULL);
                return (NULL);
        }

        /* Kick off reads of the pad area of every label but the first. */
        for (int l = 1; l < VDEV_LABELS; l++) {
                zio_nowait(zio_read_phys(pio, vd,
                    vdev_label_offset(vd->vdev_psize, l,
                    offsetof(vdev_label_t, vl_pad2)),
                    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
                    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
                    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
        }

        if (zio == NULL)
                return (pio);

        zio_nowait(pio);
        return (NULL);
}

/* Taskq wrapper: open one child vdev, recording the error on the vdev. */
static void
vdev_open_child(void *arg)
{
        vdev_t *vd = arg;

        vd->vdev_open_thread = curthread;
        vd->vdev_open_error = vdev_open(vd);
        vd->vdev_open_thread = NULL;
}

/*
 * Returns B_TRUE if this vdev or any descendant is backed by a zvol
 * (path under ZVOL_DIR).
 */
boolean_t
vdev_uses_zvols(vdev_t *vd)
{
        if (vd->vdev_path &&
 strncmp(vd->vdev_path, ZVOL_DIR,
            strlen(ZVOL_DIR)) == 0)
                return (B_TRUE);
        for (int c = 0; c < vd->vdev_children; c++)
                if (vdev_uses_zvols(vd->vdev_child[c]))
                        return (B_TRUE);
        return (B_FALSE);
}

/*
 * Open all children of vd, either serially or via a taskq.
 */
void
vdev_open_children(vdev_t *vd)
{
        taskq_t *tq;
        int children = vd->vdev_children;

        /*
         * in order to handle pools on top of zvols, do the opens
         * in a single thread so that the same thread holds the
         * spa_namespace_lock
         */
        /*
         * NOTE(review): the B_TRUE short-circuits this condition, so the
         * serial path is always taken and the taskq code below is dead.
         * This looks deliberate (see comment above) — confirm before
         * re-enabling parallel opens.
         */
        if (B_TRUE || vdev_uses_zvols(vd)) {
                for (int c = 0; c < children; c++)
                        vd->vdev_child[c]->vdev_open_error =
                            vdev_open(vd->vdev_child[c]);
                return;
        }
        tq = taskq_create("vdev_open", children, minclsyspri,
            children, children, TASKQ_PREPOPULATE);

        for (int c = 0; c < children; c++)
                VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
                    TQ_SLEEP) != 0);

        taskq_destroy(tq);
}

/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
        spa_t *spa = vd->vdev_spa;
        int error;
        uint64_t osize = 0;
        uint64_t max_osize = 0;
        uint64_t asize, max_asize, psize;
        uint64_t logical_ashift = 0;
        uint64_t physical_ashift = 0;

        ASSERT(vd->vdev_open_thread == curthread ||
            spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
        ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
            vd->vdev_state == VDEV_STATE_CANT_OPEN ||
            vd->vdev_state == VDEV_STATE_OFFLINE);

        /* Start from a clean slate; the open below recomputes these. */
        vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
        vd->vdev_cant_read = B_FALSE;
        vd->vdev_cant_write = B_FALSE;
        vd->vdev_notrim = B_FALSE;
        vd->vdev_min_asize = vdev_get_min_asize(vd);

        /*
         * If this vdev is not removed, check its fault status.  If it's
         * faulted, bail out of the open.
         */
        if (!vd->vdev_removed && vd->vdev_faulted) {
                ASSERT(vd->vdev_children == 0);
                ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
                    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
                vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
                    vd->vdev_label_aux);
                return (SET_ERROR(ENXIO));
        } else if (vd->vdev_offline) {
                ASSERT(vd->vdev_children == 0);
                vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
                return (SET_ERROR(ENXIO));
        }

        /* Delegate the physical open to the vdev-type-specific method. */
        error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
            &logical_ashift, &physical_ashift);

        /*
         * Reset the vdev_reopening flag so that we actually close
         * the vdev on error.
         */
        vd->vdev_reopening = B_FALSE;
        if (zio_injection_enabled && error == 0)
                error = zio_handle_device_injection(vd, NULL, ENXIO);

        if (error) {
                if (vd->vdev_removed &&
                    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
                        vd->vdev_removed = B_FALSE;

                vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    vd->vdev_stat.vs_aux);
                return (error);
        }

        vd->vdev_removed = B_FALSE;

        /*
         * Recheck the faulted flag now that we have confirmed that
         * the vdev is accessible.  If we're faulted, bail.
         */
        if (vd->vdev_faulted) {
                ASSERT(vd->vdev_children == 0);
                ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
                    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
                vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
                    vd->vdev_label_aux);
                return (SET_ERROR(ENXIO));
        }

        if (vd->vdev_degraded) {
                ASSERT(vd->vdev_children == 0);
                vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
                    VDEV_AUX_ERR_EXCEEDED);
        } else {
                vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
        }

        /*
         * For hole or missing vdevs we just return success.
         */
        if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
                return (0);

        if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf)
                trim_map_create(vd);

        /* An interior vdev with any unhealthy child is at best degraded. */
        for (int c = 0; c < vd->vdev_children; c++) {
                if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
                        vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
                            VDEV_AUX_NONE);
                        break;
                }
        }

        osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
        max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));

        /* Leaves reserve label space; interior vdevs report raw size. */
        if (vd->vdev_children == 0) {
                if (osize < SPA_MINDEVSIZE) {
                        vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_TOO_SMALL);
                        return (SET_ERROR(EOVERFLOW));
                }
                psize = osize;
                asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
                max_asize = max_osize - (VDEV_LABEL_START_SIZE +
                    VDEV_LABEL_END_SIZE);
        } else {
                if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
                    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
                        vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_TOO_SMALL);
                        return (SET_ERROR(EOVERFLOW));
                }
                psize = 0;
                asize = osize;
                max_asize = max_osize;
        }

        vd->vdev_psize = psize;

        /*
         * Make sure the allocatable size hasn't shrunk.
         */
        if (asize < vd->vdev_min_asize) {
                vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_BAD_LABEL);
                return (SET_ERROR(EINVAL));
        }

        /* Ashifts only ever ratchet upward across opens. */
        vd->vdev_physical_ashift =
            MAX(physical_ashift, vd->vdev_physical_ashift);
        vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
        vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);

        if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
                vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_ASHIFT_TOO_BIG);
                return (EINVAL);
        }

        if (vd->vdev_asize == 0) {
                /*
                 * This is the first-ever open, so use the computed values.
                 * For testing purposes, a higher ashift can be requested.
                 */
                vd->vdev_asize = asize;
                vd->vdev_max_asize = max_asize;
        } else {
                /*
                 * Make sure the alignment requirement hasn't increased.
  But don't do this if we are doing a reopen for a scrub,
         * since this would just restart the scrub we are already doing.
         */
        if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
            vdev_resilver_needed(vd, NULL, NULL))
                spa_async_request(spa, SPA_ASYNC_RESILVER);

        return (0);
}

/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents.  This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * If 'strict' is false ignore the spa guid check. This is necessary because
 * if the machine crashed during a re-guid the new guid might have been written
 * to all of the vdev labels, but not the cached config. The strict check
 * will be performed when the pool is opened again using the mos config.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd, boolean_t strict)
{
        spa_t *spa = vd->vdev_spa;
        nvlist_t *label;
        uint64_t guid = 0, top_guid;
        uint64_t state;

        /* Validate the whole subtree; any EBADF child fails the parent. */
        for (int c = 0; c < vd->vdev_children; c++)
                if (vdev_validate(vd->vdev_child[c], strict) != 0)
                        return (SET_ERROR(EBADF));

        /*
         * If the device has already failed, or was marked offline, don't do
         * any further validation.  Otherwise, label I/O will fail and we will
         * overwrite the previous state.
         */
        if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
                uint64_t aux_guid = 0;
                nvlist_t *nvl;
                uint64_t txg = spa_last_synced_txg(spa) != 0 ?
                    spa_last_synced_txg(spa) : -1ULL;

                if ((label = vdev_label_read_config(vd, txg)) == NULL) {
                        vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_BAD_LABEL);
                        return (0);
                }

                /*
                 * Determine if this vdev has been split off into another
                 * pool.  If so, then refuse to open it.
                 */
                if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
                    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
                        vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_SPLIT_POOL);
                        nvlist_free(label);
                        return (0);
                }

                if (strict && (nvlist_lookup_uint64(label,
                    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
                    guid != spa_guid(spa))) {
                        vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_CORRUPT_DATA);
                        nvlist_free(label);
                        return (0);
                }

                if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
                    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
                    &aux_guid) != 0)
                        aux_guid = 0;

                /*
                 * If this vdev just became a top-level vdev because its
                 * sibling was detached, it will have adopted the parent's
                 * vdev guid -- but the label may or may not be on disk yet.
                 * Fortunately, either version of the label will have the
                 * same top guid, so if we're a top-level vdev, we can
                 * safely compare to that instead.
                 *
                 * If we split this vdev off instead, then we also check the
                 * original pool's guid.  We don't want to consider the vdev
                 * corrupt if it is partway through a split operation.
                 */
                if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
                    &guid) != 0 ||
                    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
                    &top_guid) != 0 ||
                    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
                    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
                        vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_CORRUPT_DATA);
                        nvlist_free(label);
                        return (0);
                }

                if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
                    &state) != 0) {
                        vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_CORRUPT_DATA);
                        nvlist_free(label);
                        return (0);
                }

                nvlist_free(label);

                /*
                 * If this is a verbatim import, no need to check the
                 * state of the pool.
                 */
                if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
                    spa_load_state(spa) == SPA_LOAD_OPEN &&
                    state != POOL_STATE_ACTIVE)
                        return (SET_ERROR(EBADF));

                /*
                 * If we were able to open and validate a vdev that was
                 * previously marked permanently unavailable, clear that state
                 * now.
                 */
                if (vd->vdev_not_present)
                        vd->vdev_not_present = 0;
        }

        return (0);
}

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
        spa_t *spa = vd->vdev_spa;
        vdev_t *pvd = vd->vdev_parent;

        ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

        /*
         * If our parent is reopening, then we are as well, unless we are
         * going offline.
         */
        if (pvd != NULL && pvd->vdev_reopening)
                vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);

        vd->vdev_ops->vdev_op_close(vd);

        vdev_cache_purge(vd);

        if (vd->vdev_ops->vdev_op_leaf)
                trim_map_destroy(vd);

        /*
         * We record the previous state before we close it, so that if we are
         * doing a reopen(), we don't generate FMA ereports if we notice that
         * it's still faulted.
         */
        vd->vdev_prevstate = vd->vdev_state;

        if (vd->vdev_offline)
                vd->vdev_state = VDEV_STATE_OFFLINE;
        else
                vd->vdev_state = VDEV_STATE_CLOSED;
        vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

/*
 * Recursively take a hold on every leaf in this subtree (root pools only;
 * no-op before the pool is initialized).
 */
void
vdev_hold(vdev_t *vd)
{
        spa_t *spa = vd->vdev_spa;

        ASSERT(spa_is_root(spa));
        if (spa->spa_state == POOL_STATE_UNINITIALIZED)
                return;

        for (int c = 0; c < vd->vdev_children; c++)
                vdev_hold(vd->vdev_child[c]);

        if (vd->vdev_ops->vdev_op_leaf)
                vd->vdev_ops->vdev_op_hold(vd);
}

/*
 * Recursively release the holds taken by vdev_hold().
 */
void
vdev_rele(vdev_t *vd)
{
        spa_t *spa = vd->vdev_spa;

        ASSERT(spa_is_root(spa));
        for (int c = 0; c < vd->vdev_children; c++)
                vdev_rele(vd->vdev_child[c]);

        if (vd->vdev_ops->vdev_op_leaf)
                vd->vdev_ops->vdev_op_rele(vd);
}

/*
 * Reopen all interior vdevs and any unopened leaves.  We don't actually
 * reopen leaf vdevs which had previously been opened as they might deadlock
 * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
 * If the leaf has never been opened then open it, as usual.
 */
void
vdev_reopen(vdev_t *vd)
{
        spa_t *spa = vd->vdev_spa;

        ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

        /* set the reopening flag unless we're taking the vdev offline */
        vd->vdev_reopening = !vd->vdev_offline;
        vdev_close(vd);
        (void) vdev_open(vd);

        /*
         * Call vdev_validate() here to make sure we have the same device.
         * Otherwise, a device with an invalid label could be successfully
         * opened in response to vdev_reopen().
         */
        if (vd->vdev_aux) {
                (void) vdev_validate_aux(vd);
                /* A readable/writable L2ARC device is (re)added to the ARC. */
                if (vdev_readable(vd) && vdev_writeable(vd) &&
                    vd->vdev_aux == &spa->spa_l2cache &&
                    !l2arc_vdev_present(vd))
                        l2arc_add_vdev(spa, vd);
        } else {
                (void) vdev_validate(vd, B_TRUE);
        }

        /*
         * Reassess parent vdev's health.
         */
        vdev_propagate_state(vd);
}

/*
 * Open and label a brand-new (or replacement) vdev for use in txg 'txg'.
 * Returns 0, or an error if any component could not be opened or labeled.
 */
int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
        int error;

        /*
         * Normally, partial opens (e.g. of a mirror) are allowed.
         * For a create, however, we want to fail the request if
         * there are any components we can't open.
         */
        error = vdev_open(vd);

        if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
                vdev_close(vd);
                return (error ? error : ENXIO);
        }

        /*
         * Recursively load DTLs and initialize all labels.
1695168404Spjd */ 1696262093Savg if ((error = vdev_dtl_load(vd)) != 0 || 1697262093Savg (error = vdev_label_init(vd, txg, isreplacing ? 1698168404Spjd VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1699168404Spjd vdev_close(vd); 1700168404Spjd return (error); 1701168404Spjd } 1702168404Spjd 1703168404Spjd return (0); 1704168404Spjd} 1705168404Spjd 1706168404Spjdvoid 1707219089Spjdvdev_metaslab_set_size(vdev_t *vd) 1708168404Spjd{ 1709168404Spjd /* 1710273343Sdelphij * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. 1711168404Spjd */ 1712273343Sdelphij vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); 1713168404Spjd vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1714168404Spjd} 1715168404Spjd 1716254591Sgibbs/* 1717266122Ssmh * Maximize performance by inflating the configured ashift for top level 1718266122Ssmh * vdevs to be as close to the physical ashift as possible while maintaining 1719266122Ssmh * administrator defined limits and ensuring it doesn't go below the 1720266122Ssmh * logical ashift. 1721254591Sgibbs */ 1722168404Spjdvoid 1723254591Sgibbsvdev_ashift_optimize(vdev_t *vd) 1724254591Sgibbs{ 1725266122Ssmh if (vd == vd->vdev_top) { 1726266122Ssmh if (vd->vdev_ashift < vd->vdev_physical_ashift) { 1727266122Ssmh vd->vdev_ashift = MIN( 1728266122Ssmh MAX(zfs_max_auto_ashift, vd->vdev_ashift), 1729266122Ssmh MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); 1730266122Ssmh } else { 1731266122Ssmh /* 1732266122Ssmh * Unusual case where logical ashift > physical ashift 1733266122Ssmh * so we can't cap the calculated ashift based on max 1734266122Ssmh * ashift as that would cause failures. 1735266122Ssmh * We still check if we need to increase it to match 1736266122Ssmh * the min ashift. 
			 */
			vd->vdev_ashift = MAX(zfs_min_auto_ashift,
			    vd->vdev_ashift);
		}
	}
}

/*
 * Mark state on a top-level vdev as dirty for the given txg: 'arg' is
 * queued on the metaslab and/or DTL txg lists (per 'flags'), and the
 * vdev itself is queued on the spa's per-txg vdev list.
 */
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(!vd->vdev_ishole);
	ASSERT(ISP2(flags));
	ASSERT(spa_writeable(vd->vdev_spa));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

/*
 * Dirty every leaf vdev beneath 'vd' (each leaf is dirtied against its
 * own top-level vdev).
 */
void
vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);

	if (vd->vdev_ops->vdev_op_leaf)
		vdev_dirty(vd->vdev_top, flags, vd, txg);
}

/*
 * DTLs.
 *
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication.  There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in 'maxfaults' or more children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
 */

/*
 * Add the range [txg, txg + size) to DTL 't' of this vdev, if it is not
 * already present.  The DTL's own range-tree lock serializes access.
 */
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	range_tree_t *rt = vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
	ASSERT(spa_writeable(vd->vdev_spa));

	mutex_enter(rt->rt_lock);
	if (!range_tree_contains(rt, txg, size))
		range_tree_add(rt, txg, size);
	mutex_exit(rt->rt_lock);
}

/*
 * Return B_TRUE if DTL 't' contains the range [txg, txg + size).
 * An empty DTL trivially returns B_FALSE.
 */
boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	mutex_enter(rt->rt_lock);
	if (range_tree_space(rt) != 0)
		dirty = range_tree_contains(rt, txg, size);
	mutex_exit(rt->rt_lock);

	return (dirty);
}

/*
 * Return B_TRUE if DTL 't' is empty (contains no space at all).
 */
boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(rt->rt_lock);
	empty = (range_tree_space(rt) == 0);
	mutex_exit(rt->rt_lock);

	return (empty);
}

/*
 * Returns the lowest txg in the DTL range.
 */
static uint64_t
vdev_dtl_min(vdev_t *vd)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	return (rs->rs_start - 1);
}

/*
 * Returns the highest txg in the DTL.
 */
static uint64_t
vdev_dtl_max(vdev_t *vd)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	return (rs->rs_end);
}

/*
 * Determine if a resilvering vdev should remove any DTL entries from
 * its range.  If the vdev was resilvering for the entire duration of the
 * scan then it should excise that range from its DTLs.  Otherwise, this
 * vdev is considered partially resilvered and should leave its DTL
 * entries intact.  The comment in vdev_dtl_reassess() describes how we
 * excise the DTLs.
 */
static boolean_t
vdev_dtl_should_excise(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

	ASSERT0(scn->scn_phys.scn_errors);
	ASSERT0(vd->vdev_children);

	/* Not resilvering, or nothing missing: always safe to excise. */
	if (vd->vdev_resilver_txg == 0 ||
	    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
		return (B_TRUE);

	/*
	 * When a resilver is initiated the scan will assign the scn_max_txg
	 * value to the highest txg value that exists in all DTLs. If this
	 * device's max DTL is not part of this scan (i.e. it is not in
	 * the range (scn_min_txg, scn_max_txg] then it is not eligible
	 * for excision.
	 */
	if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
		ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
		ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
		ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	/* Bottom-up: children are reassessed before their parent. */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dtl_reassess(vd->vdev_child[c], txg,
		    scrub_txg, scrub_done);

	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

		mutex_enter(&vd->vdev_dtl_lock);

		/*
		 * If we've completed a scan cleanly then determine
		 * if this vdev should remove any DTLs. We only want to
		 * excise regions on vdevs that were available during
		 * the entire duration of this scan.
		 */
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started ||
		    (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
		    vdev_dtl_should_excise(vd)) {
			/*
			 * We completed a scrub up to scrub_txg.  If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl.  Otherwise, leave the
			 * dtl as-is if there was an error.
			 *
			 * There's a little trick here: to excise the beginning
			 * of the DTL_MISSING map, we put it into a reference
			 * tree and then add a segment with refcnt -1 that
			 * covers the range [0, scrub_txg).  This means
			 * that each txg in that range has refcnt -1 or 0.
			 * We then add DTL_SCRUB with a refcnt of 2, so that
			 * entries in the range [0, scrub_txg) will have a
			 * positive refcnt -- either 1 or 2.  We then convert
			 * the reference tree into the new DTL_MISSING map.
			 */
			space_reftree_create(&reftree);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_SCRUB], 2);
			space_reftree_generate_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_destroy(&reftree);
		}
		/* For a leaf, DTL_PARTIAL is simply a copy of DTL_MISSING. */
		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
		if (scrub_done)
			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
		if (!vdev_readable(vd))
			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
		else
			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);

		/*
		 * If the vdev was resilvering and no longer has any
		 * DTLs then reset its resilvering flag and dirty
		 * the top level so that we persist the change.
		 */
		if (vd->vdev_resilver_txg != 0 &&
		    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
		    range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
			vd->vdev_resilver_txg = 0;
			vdev_config_dirty(vd->vdev_top);
		}

		mutex_exit(&vd->vdev_dtl_lock);

		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	/*
	 * Interior vdev: rebuild each DTL from the children's maps using a
	 * reference tree, keeping only ranges referenced by at least
	 * 'minref' children (see the DTL block comment above).
	 */
	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		/* account for child's outage in parent's missing map */
		int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
		if (t == DTL_SCRUB)
			continue;			/* leaf vdevs only */
		if (t == DTL_PARTIAL)
			minref = 1;			/* i.e. non-zero */
		else if (vd->vdev_nparity != 0)
			minref = vd->vdev_nparity + 1;	/* RAID-Z */
		else
			minref = vd->vdev_children;	/* any kind of mirror */
		space_reftree_create(&reftree);
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			mutex_enter(&cvd->vdev_dtl_lock);
			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
			mutex_exit(&cvd->vdev_dtl_lock);
		}
		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
		space_reftree_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
}

/*
 * Load the on-disk DTL space map of a leaf vdev into its in-core
 * DTL_MISSING range tree; for interior vdevs, recurse over the children.
 * Returns 0 on success or the first error encountered.
 */
int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	int error = 0;

	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
		ASSERT(!vd->vdev_ishole);

		error = space_map_open(&vd->vdev_dtl_sm, mos,
		    vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
		if (error)
			return (error);
		ASSERT(vd->vdev_dtl_sm != NULL);

		mutex_enter(&vd->vdev_dtl_lock);

		/*
		 * Now that we've opened the space_map we need to update
		 * the in-core DTL.
		 */
		space_map_update(vd->vdev_dtl_sm);

		error = space_map_load(vd->vdev_dtl_sm,
		    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
		mutex_exit(&vd->vdev_dtl_lock);

		return (error);
	}

	for (int c = 0; c < vd->vdev_children; c++) {
		error = vdev_dtl_load(vd->vdev_child[c]);
		if (error != 0)
			break;
	}

	return (error);
}

/*
 * Write a leaf vdev's in-core DTL_MISSING range tree out to its space map
 * in the MOS for the given txg.  If the vdev is being detached or its
 * top-level is being removed, the space map is freed instead.  A new space
 * map object is allocated on first use; if the object number changes, the
 * top-level config is dirtied so the new object is persisted.
 */
void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
	objset_t *mos = spa->spa_meta_objset;
	range_tree_t *rtsync;
	kmutex_t rtlock;
	dmu_tx_t *tx;
	uint64_t object = space_map_object(vd->vdev_dtl_sm);

	ASSERT(!vd->vdev_ishole);
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_free(vd->vdev_dtl_sm, tx);
		space_map_close(vd->vdev_dtl_sm);
		vd->vdev_dtl_sm = NULL;
		mutex_exit(&vd->vdev_dtl_lock);
		dmu_tx_commit(tx);
		return;
	}

	if (vd->vdev_dtl_sm == NULL) {
		uint64_t new_object;

		new_object = space_map_alloc(mos, tx);
		VERIFY3U(new_object, !=, 0);

		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
		    0, -1ULL, 0, &vd->vdev_dtl_lock));
		ASSERT(vd->vdev_dtl_sm != NULL);
	}

	/*
	 * Snapshot the DTL into a private range tree (under a local lock)
	 * so the space map can be written without holding vdev_dtl_lock.
	 */
	bzero(&rtlock, sizeof(rtlock));
	mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);

	rtsync = range_tree_create(NULL, NULL, &rtlock);

	mutex_enter(&rtlock);

	mutex_enter(&vd->vdev_dtl_lock);
	range_tree_walk(rt, range_tree_add, rtsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(vd->vdev_dtl_sm, tx);
	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
	range_tree_vacate(rtsync, NULL, NULL);

	range_tree_destroy(rtsync);

	mutex_exit(&rtlock);
	mutex_destroy(&rtlock);

	/*
	 * If the object for the space map has changed then dirty
	 * the top level so that we update the config.
	 */
	if (object != space_map_object(vd->vdev_dtl_sm)) {
		zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
		    "new object %llu", txg, spa_name(spa), object,
		    space_map_object(vd->vdev_dtl_sm));
		vdev_config_dirty(vd->vdev_top);
	}

	dmu_tx_commit(tx);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_update(vd->vdev_dtl_sm);
	mutex_exit(&vd->vdev_dtl_lock);
}

/*
 * Determine whether the specified vdev can be offlined/detached/removed
 * without losing data.
 */
boolean_t
vdev_dtl_required(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *tvd = vd->vdev_top;
	uint8_t cant_read = vd->vdev_cant_read;
	boolean_t required;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == spa->spa_root_vdev || vd == tvd)
		return (B_TRUE);

	/*
	 * Temporarily mark the device as unreadable, and then determine
	 * whether this results in any DTL outages in the top-level vdev.
	 * If not, we can safely offline/detach/remove the device.
	 */
	vd->vdev_cant_read = B_TRUE;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
	vd->vdev_cant_read = cant_read;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);

	if (!required && zio_injection_enabled)
		required = !!zio_handle_device_injection(vd, NULL, ECHILD);

	return (required);
}

/*
 * Determine if resilver is needed, and if so the txg range.
 */
boolean_t
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
{
	boolean_t needed = B_FALSE;
	uint64_t thismin = UINT64_MAX;
	uint64_t thismax = 0;

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
		    vdev_writeable(vd)) {

			thismin = vdev_dtl_min(vd);
			thismax = vdev_dtl_max(vd);
			needed = B_TRUE;
		}
		mutex_exit(&vd->vdev_dtl_lock);
	} else {
		/* Interior vdev: the range is the union over all children. */
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			uint64_t cmin, cmax;

			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
				thismin = MIN(thismin, cmin);
				thismax = MAX(thismax, cmax);
				needed = B_TRUE;
			}
		}
	}

	if (needed && minp) {
		*minp = thismin;
		*maxp = thismax;
	}
	return (needed);
}

/*
 * Load the state of a vdev tree after pool import or config change:
 * metaslabs for top-level vdevs and DTLs for leaves.
 */
void
vdev_load(vdev_t *vd)
{
	/*
	 * Recursively
 load all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_load(vd->vdev_child[c]);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top && !vd->vdev_ishole &&
	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	    vdev_metaslab_init(vd, 0) != 0))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
}

/*
 * The special vdev case is used for hot spares and l2cache devices.  Its
 * sole purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
 */
int
vdev_validate_aux(vdev_t *vd)
{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if (!vdev_readable(vd))
		return (0);

	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    !SPA_VERSION_IS_SUPPORTED(version) ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (-1);
	}

	/*
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	nvlist_free(label);
	return (0);
}

/*
 * Free all of a (top-level) vdev's on-disk metaslab metadata in the given
 * txg: each metaslab's space map (which must have no allocated space) and
 * the metaslab array object itself.
 */
void
vdev_remove(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	dmu_tx_t *tx;

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (vd->vdev_ms != NULL) {
		metaslab_group_t *mg = vd->vdev_mg;

		metaslab_group_histogram_verify(mg);
		metaslab_class_histogram_verify(mg->mg_class);

		for (int m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			if (msp == NULL || msp->ms_sm == NULL)
				continue;

			mutex_enter(&msp->ms_lock);
			/*
			 * If the metaslab was not loaded when the vdev
			 * was removed then the histogram accounting may
			 * not be accurate. Update the histogram information
			 * here so that we ensure that the metaslab group
			 * and metaslab class are up-to-date.
			 */
			metaslab_group_histogram_remove(mg, msp);

			VERIFY0(space_map_allocated(msp->ms_sm));
			space_map_free(msp->ms_sm, tx);
			space_map_close(msp->ms_sm);
			msp->ms_sm = NULL;
			mutex_exit(&msp->ms_lock);
		}

		metaslab_group_histogram_verify(mg);
		metaslab_class_histogram_verify(mg->mg_class);
		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			ASSERT0(mg->mg_histogram[i]);

	}

	if (vd->vdev_ms_array) {
		(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
		vd->vdev_ms_array = 0;
	}
	dmu_tx_commit(tx);
}

/*
 * Called after a txg syncs: run metaslab_sync_done() for every metaslab
 * dirtied in that txg and, if any were, reassess the metaslab group.
 */
void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;
	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));

	ASSERT(!vd->vdev_ishole);

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);

	if (reassess)
		metaslab_sync_reassess(vd->vdev_mg);
}

/*
 * Sync a top-level vdev's dirty state for the given txg: allocate the
 * metaslab array on first use, sync every dirty metaslab and the DTLs of
 * every dirty leaf, and requeue the vdev for the clean-up pass.
 */
void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
		ASSERT(vd == vd->vdev_top);
		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);
		dmu_tx_commit(tx);
	}

	/*
	 * Remove the metadata associated with this vdev once it's empty.
	 */
	if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
		vdev_remove(vd, txg);

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
		metaslab_sync(msp, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
	}

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}

/*
 * Convert a physical size to an allocatable size via the vdev's asize op.
 */
uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

/*
 * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
 * not be opened, and no I/O is attempted.
2410185029Spjd */ 2411185029Spjdint 2412219089Spjdvdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2413168404Spjd{ 2414219089Spjd vdev_t *vd, *tvd; 2415168404Spjd 2416219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 2417185029Spjd 2418185029Spjd if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2419185029Spjd return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2420185029Spjd 2421185029Spjd if (!vd->vdev_ops->vdev_op_leaf) 2422185029Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2423185029Spjd 2424219089Spjd tvd = vd->vdev_top; 2425219089Spjd 2426185029Spjd /* 2427219089Spjd * We don't directly use the aux state here, but if we do a 2428219089Spjd * vdev_reopen(), we need this value to be present to remember why we 2429219089Spjd * were faulted. 2430219089Spjd */ 2431219089Spjd vd->vdev_label_aux = aux; 2432219089Spjd 2433219089Spjd /* 2434185029Spjd * Faulted state takes precedence over degraded. 2435185029Spjd */ 2436219089Spjd vd->vdev_delayed_close = B_FALSE; 2437185029Spjd vd->vdev_faulted = 1ULL; 2438185029Spjd vd->vdev_degraded = 0ULL; 2439219089Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 2440185029Spjd 2441185029Spjd /* 2442219089Spjd * If this device has the only valid copy of the data, then 2443219089Spjd * back off and simply mark the vdev as degraded instead. 2444185029Spjd */ 2445219089Spjd if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 2446185029Spjd vd->vdev_degraded = 1ULL; 2447185029Spjd vd->vdev_faulted = 0ULL; 2448185029Spjd 2449185029Spjd /* 2450185029Spjd * If we reopen the device and it's not dead, only then do we 2451185029Spjd * mark it degraded. 
2452185029Spjd */ 2453219089Spjd vdev_reopen(tvd); 2454185029Spjd 2455219089Spjd if (vdev_readable(vd)) 2456219089Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 2457185029Spjd } 2458185029Spjd 2459185029Spjd return (spa_vdev_state_exit(spa, vd, 0)); 2460168404Spjd} 2461168404Spjd 2462185029Spjd/* 2463185029Spjd * Mark the given vdev degraded. A degraded vdev is purely an indication to the 2464185029Spjd * user that something is wrong. The vdev continues to operate as normal as far 2465185029Spjd * as I/O is concerned. 2466185029Spjd */ 2467185029Spjdint 2468219089Spjdvdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2469168404Spjd{ 2470185029Spjd vdev_t *vd; 2471168404Spjd 2472219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 2473168404Spjd 2474185029Spjd if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2475185029Spjd return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2476168404Spjd 2477185029Spjd if (!vd->vdev_ops->vdev_op_leaf) 2478185029Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2479185029Spjd 2480185029Spjd /* 2481185029Spjd * If the vdev is already faulted, then don't do anything. 2482185029Spjd */ 2483185029Spjd if (vd->vdev_faulted || vd->vdev_degraded) 2484185029Spjd return (spa_vdev_state_exit(spa, NULL, 0)); 2485185029Spjd 2486185029Spjd vd->vdev_degraded = 1ULL; 2487185029Spjd if (!vdev_is_dead(vd)) 2488185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 2489219089Spjd aux); 2490185029Spjd 2491185029Spjd return (spa_vdev_state_exit(spa, vd, 0)); 2492168404Spjd} 2493168404Spjd 2494185029Spjd/* 2495251631Sdelphij * Online the given vdev. 2496251631Sdelphij * 2497251631Sdelphij * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached 2498251631Sdelphij * spare device should be detached when the device finishes resilvering. 2499251631Sdelphij * Second, the online should be treated like a 'test' online case, so no FMA 2500251631Sdelphij * events are generated if the device fails to open. 
2501185029Spjd */ 2502168404Spjdint 2503185029Spjdvdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) 2504168404Spjd{ 2505219089Spjd vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; 2506168404Spjd 2507219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 2508168404Spjd 2509185029Spjd if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2510185029Spjd return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2511168404Spjd 2512168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 2513185029Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2514168404Spjd 2515219089Spjd tvd = vd->vdev_top; 2516168404Spjd vd->vdev_offline = B_FALSE; 2517168404Spjd vd->vdev_tmpoffline = B_FALSE; 2518185029Spjd vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 2519185029Spjd vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 2520219089Spjd 2521219089Spjd /* XXX - L2ARC 1.0 does not support expansion */ 2522219089Spjd if (!vd->vdev_aux) { 2523219089Spjd for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2524219089Spjd pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); 2525219089Spjd } 2526219089Spjd 2527219089Spjd vdev_reopen(tvd); 2528185029Spjd vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 2529168404Spjd 2530219089Spjd if (!vd->vdev_aux) { 2531219089Spjd for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2532219089Spjd pvd->vdev_expanding = B_FALSE; 2533219089Spjd } 2534219089Spjd 2535185029Spjd if (newstate) 2536185029Spjd *newstate = vd->vdev_state; 2537185029Spjd if ((flags & ZFS_ONLINE_UNSPARE) && 2538185029Spjd !vdev_is_dead(vd) && vd->vdev_parent && 2539185029Spjd vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2540185029Spjd vd->vdev_parent->vdev_child[0] == vd) 2541185029Spjd vd->vdev_unspare = B_TRUE; 2542168404Spjd 2543219089Spjd if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { 2544219089Spjd 2545219089Spjd /* XXX - L2ARC 1.0 does not support expansion */ 2546219089Spjd if (vd->vdev_aux) 2547219089Spjd return 
(spa_vdev_state_exit(spa, vd, ENOTSUP)); 2548219089Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2549219089Spjd } 2550209962Smm return (spa_vdev_state_exit(spa, vd, 0)); 2551168404Spjd} 2552168404Spjd 2553219089Spjdstatic int 2554219089Spjdvdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) 2555168404Spjd{ 2556213197Smm vdev_t *vd, *tvd; 2557219089Spjd int error = 0; 2558219089Spjd uint64_t generation; 2559219089Spjd metaslab_group_t *mg; 2560168404Spjd 2561219089Spjdtop: 2562219089Spjd spa_vdev_state_enter(spa, SCL_ALLOC); 2563168404Spjd 2564185029Spjd if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2565185029Spjd return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2566168404Spjd 2567168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 2568185029Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2569168404Spjd 2570213197Smm tvd = vd->vdev_top; 2571219089Spjd mg = tvd->vdev_mg; 2572219089Spjd generation = spa->spa_config_generation + 1; 2573213197Smm 2574168404Spjd /* 2575168404Spjd * If the device isn't already offline, try to offline it. 2576168404Spjd */ 2577168404Spjd if (!vd->vdev_offline) { 2578168404Spjd /* 2579209962Smm * If this device has the only valid copy of some data, 2580213197Smm * don't allow it to be offlined. Log devices are always 2581213197Smm * expendable. 2582168404Spjd */ 2583213197Smm if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2584213197Smm vdev_dtl_required(vd)) 2585185029Spjd return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2586168404Spjd 2587168404Spjd /* 2588219089Spjd * If the top-level is a slog and it has had allocations 2589219089Spjd * then proceed. We check that the vdev's metaslab group 2590219089Spjd * is not NULL since it's possible that we may have just 2591219089Spjd * added this vdev but not yet initialized its metaslabs. 2592219089Spjd */ 2593219089Spjd if (tvd->vdev_islog && mg != NULL) { 2594219089Spjd /* 2595219089Spjd * Prevent any future allocations. 
2596219089Spjd */ 2597219089Spjd metaslab_group_passivate(mg); 2598219089Spjd (void) spa_vdev_state_exit(spa, vd, 0); 2599219089Spjd 2600219089Spjd error = spa_offline_log(spa); 2601219089Spjd 2602219089Spjd spa_vdev_state_enter(spa, SCL_ALLOC); 2603219089Spjd 2604219089Spjd /* 2605219089Spjd * Check to see if the config has changed. 2606219089Spjd */ 2607219089Spjd if (error || generation != spa->spa_config_generation) { 2608219089Spjd metaslab_group_activate(mg); 2609219089Spjd if (error) 2610219089Spjd return (spa_vdev_state_exit(spa, 2611219089Spjd vd, error)); 2612219089Spjd (void) spa_vdev_state_exit(spa, vd, 0); 2613219089Spjd goto top; 2614219089Spjd } 2615240415Smm ASSERT0(tvd->vdev_stat.vs_alloc); 2616219089Spjd } 2617219089Spjd 2618219089Spjd /* 2619168404Spjd * Offline this device and reopen its top-level vdev. 2620213197Smm * If the top-level vdev is a log device then just offline 2621213197Smm * it. Otherwise, if this action results in the top-level 2622213197Smm * vdev becoming unusable, undo it and fail the request. 2623168404Spjd */ 2624168404Spjd vd->vdev_offline = B_TRUE; 2625213197Smm vdev_reopen(tvd); 2626213197Smm 2627213197Smm if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2628213197Smm vdev_is_dead(tvd)) { 2629168404Spjd vd->vdev_offline = B_FALSE; 2630213197Smm vdev_reopen(tvd); 2631185029Spjd return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2632168404Spjd } 2633219089Spjd 2634219089Spjd /* 2635219089Spjd * Add the device back into the metaslab rotor so that 2636219089Spjd * once we online the device it's open for business. 
2637219089Spjd */ 2638219089Spjd if (tvd->vdev_islog && mg != NULL) 2639219089Spjd metaslab_group_activate(mg); 2640168404Spjd } 2641168404Spjd 2642185029Spjd vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 2643168404Spjd 2644219089Spjd return (spa_vdev_state_exit(spa, vd, 0)); 2645219089Spjd} 2646213197Smm 2647219089Spjdint 2648219089Spjdvdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 2649219089Spjd{ 2650219089Spjd int error; 2651213197Smm 2652219089Spjd mutex_enter(&spa->spa_vdev_top_lock); 2653219089Spjd error = vdev_offline_locked(spa, guid, flags); 2654219089Spjd mutex_exit(&spa->spa_vdev_top_lock); 2655219089Spjd 2656219089Spjd return (error); 2657168404Spjd} 2658168404Spjd 2659168404Spjd/* 2660168404Spjd * Clear the error counts associated with this vdev. Unlike vdev_online() and 2661168404Spjd * vdev_offline(), we assume the spa config is locked. We also clear all 2662168404Spjd * children. If 'vd' is NULL, then the user wants to clear all vdevs. 2663168404Spjd */ 2664168404Spjdvoid 2665168404Spjdvdev_clear(spa_t *spa, vdev_t *vd) 2666168404Spjd{ 2667185029Spjd vdev_t *rvd = spa->spa_root_vdev; 2668168404Spjd 2669185029Spjd ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2670185029Spjd 2671168404Spjd if (vd == NULL) 2672185029Spjd vd = rvd; 2673168404Spjd 2674168404Spjd vd->vdev_stat.vs_read_errors = 0; 2675168404Spjd vd->vdev_stat.vs_write_errors = 0; 2676168404Spjd vd->vdev_stat.vs_checksum_errors = 0; 2677168404Spjd 2678185029Spjd for (int c = 0; c < vd->vdev_children; c++) 2679168404Spjd vdev_clear(spa, vd->vdev_child[c]); 2680185029Spjd 2681253991Smav if (vd == rvd) { 2682253991Smav for (int c = 0; c < spa->spa_l2cache.sav_count; c++) 2683253991Smav vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); 2684253991Smav 2685253991Smav for (int c = 0; c < spa->spa_spares.sav_count; c++) 2686253991Smav vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); 2687253991Smav } 2688253991Smav 2689185029Spjd /* 2690185029Spjd * If 
we're in the FAULTED state or have experienced failed I/O, then 2691185029Spjd * clear the persistent state and attempt to reopen the device. We 2692185029Spjd * also mark the vdev config dirty, so that the new faulted state is 2693185029Spjd * written out to disk. 2694185029Spjd */ 2695185029Spjd if (vd->vdev_faulted || vd->vdev_degraded || 2696185029Spjd !vdev_readable(vd) || !vdev_writeable(vd)) { 2697185029Spjd 2698219089Spjd /* 2699219089Spjd * When reopening in reponse to a clear event, it may be due to 2700219089Spjd * a fmadm repair request. In this case, if the device is 2701219089Spjd * still broken, we want to still post the ereport again. 2702219089Spjd */ 2703219089Spjd vd->vdev_forcefault = B_TRUE; 2704219089Spjd 2705219089Spjd vd->vdev_faulted = vd->vdev_degraded = 0ULL; 2706185029Spjd vd->vdev_cant_read = B_FALSE; 2707185029Spjd vd->vdev_cant_write = B_FALSE; 2708185029Spjd 2709219089Spjd vdev_reopen(vd == rvd ? rvd : vd->vdev_top); 2710185029Spjd 2711219089Spjd vd->vdev_forcefault = B_FALSE; 2712219089Spjd 2713219089Spjd if (vd != rvd && vdev_writeable(vd->vdev_top)) 2714185029Spjd vdev_state_dirty(vd->vdev_top); 2715185029Spjd 2716185029Spjd if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) 2717185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 2718185029Spjd 2719185029Spjd spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); 2720185029Spjd } 2721219089Spjd 2722219089Spjd /* 2723219089Spjd * When clearing a FMA-diagnosed fault, we always want to 2724219089Spjd * unspare the device, as we assume that the original spare was 2725219089Spjd * done in response to the FMA fault. 
2726219089Spjd */ 2727219089Spjd if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && 2728219089Spjd vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2729219089Spjd vd->vdev_parent->vdev_child[0] == vd) 2730219089Spjd vd->vdev_unspare = B_TRUE; 2731168404Spjd} 2732168404Spjd 2733185029Spjdboolean_t 2734168404Spjdvdev_is_dead(vdev_t *vd) 2735168404Spjd{ 2736219089Spjd /* 2737219089Spjd * Holes and missing devices are always considered "dead". 2738219089Spjd * This simplifies the code since we don't have to check for 2739219089Spjd * these types of devices in the various code paths. 2740219089Spjd * Instead we rely on the fact that we skip over dead devices 2741219089Spjd * before issuing I/O to them. 2742219089Spjd */ 2743219089Spjd return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || 2744219089Spjd vd->vdev_ops == &vdev_missing_ops); 2745168404Spjd} 2746168404Spjd 2747185029Spjdboolean_t 2748185029Spjdvdev_readable(vdev_t *vd) 2749168404Spjd{ 2750185029Spjd return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 2751185029Spjd} 2752168404Spjd 2753185029Spjdboolean_t 2754185029Spjdvdev_writeable(vdev_t *vd) 2755185029Spjd{ 2756185029Spjd return (!vdev_is_dead(vd) && !vd->vdev_cant_write); 2757185029Spjd} 2758168404Spjd 2759185029Spjdboolean_t 2760208370Smmvdev_allocatable(vdev_t *vd) 2761208370Smm{ 2762209962Smm uint64_t state = vd->vdev_state; 2763209962Smm 2764208370Smm /* 2765209962Smm * We currently allow allocations from vdevs which may be in the 2766208370Smm * process of reopening (i.e. VDEV_STATE_CLOSED). If the device 2767208370Smm * fails to reopen then we'll catch it later when we're holding 2768209962Smm * the proper locks. Note that we have to get the vdev state 2769209962Smm * in a local variable because although it changes atomically, 2770209962Smm * we're asking two separate questions about it. 
2771208370Smm */ 2772209962Smm return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 2773219089Spjd !vd->vdev_cant_write && !vd->vdev_ishole); 2774208370Smm} 2775208370Smm 2776208370Smmboolean_t 2777185029Spjdvdev_accessible(vdev_t *vd, zio_t *zio) 2778185029Spjd{ 2779185029Spjd ASSERT(zio->io_vd == vd); 2780168404Spjd 2781185029Spjd if (vdev_is_dead(vd) || vd->vdev_remove_wanted) 2782185029Spjd return (B_FALSE); 2783168404Spjd 2784185029Spjd if (zio->io_type == ZIO_TYPE_READ) 2785185029Spjd return (!vd->vdev_cant_read); 2786168404Spjd 2787185029Spjd if (zio->io_type == ZIO_TYPE_WRITE) 2788185029Spjd return (!vd->vdev_cant_write); 2789168404Spjd 2790185029Spjd return (B_TRUE); 2791168404Spjd} 2792168404Spjd 2793168404Spjd/* 2794168404Spjd * Get statistics for the given vdev. 2795168404Spjd */ 2796168404Spjdvoid 2797168404Spjdvdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 2798168404Spjd{ 2799269773Sdelphij spa_t *spa = vd->vdev_spa; 2800269773Sdelphij vdev_t *rvd = spa->spa_root_vdev; 2801168404Spjd 2802269773Sdelphij ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 2803269773Sdelphij 2804168404Spjd mutex_enter(&vd->vdev_stat_lock); 2805168404Spjd bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 2806168404Spjd vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 2807168404Spjd vs->vs_state = vd->vdev_state; 2808219089Spjd vs->vs_rsize = vdev_get_min_asize(vd); 2809219089Spjd if (vd->vdev_ops->vdev_op_leaf) 2810219089Spjd vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 2811236155Smm vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; 2812254591Sgibbs vs->vs_configured_ashift = vd->vdev_top != NULL 2813254591Sgibbs ? 
vd->vdev_top->vdev_ashift : vd->vdev_ashift; 2814254591Sgibbs vs->vs_logical_ashift = vd->vdev_logical_ashift; 2815254591Sgibbs vs->vs_physical_ashift = vd->vdev_physical_ashift; 2816270128Sdelphij if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { 2817269773Sdelphij vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; 2818270128Sdelphij } 2819168404Spjd 2820168404Spjd /* 2821168404Spjd * If we're getting stats on the root vdev, aggregate the I/O counts 2822168404Spjd * over all top-level vdevs (i.e. the direct children of the root). 2823168404Spjd */ 2824168404Spjd if (vd == rvd) { 2825185029Spjd for (int c = 0; c < rvd->vdev_children; c++) { 2826168404Spjd vdev_t *cvd = rvd->vdev_child[c]; 2827168404Spjd vdev_stat_t *cvs = &cvd->vdev_stat; 2828168404Spjd 2829185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 2830168404Spjd vs->vs_ops[t] += cvs->vs_ops[t]; 2831168404Spjd vs->vs_bytes[t] += cvs->vs_bytes[t]; 2832168404Spjd } 2833219089Spjd cvs->vs_scan_removing = cvd->vdev_removing; 2834168404Spjd } 2835168404Spjd } 2836269773Sdelphij mutex_exit(&vd->vdev_stat_lock); 2837168404Spjd} 2838168404Spjd 2839168404Spjdvoid 2840185029Spjdvdev_clear_stats(vdev_t *vd) 2841168404Spjd{ 2842185029Spjd mutex_enter(&vd->vdev_stat_lock); 2843185029Spjd vd->vdev_stat.vs_space = 0; 2844185029Spjd vd->vdev_stat.vs_dspace = 0; 2845185029Spjd vd->vdev_stat.vs_alloc = 0; 2846185029Spjd mutex_exit(&vd->vdev_stat_lock); 2847185029Spjd} 2848185029Spjd 2849185029Spjdvoid 2850219089Spjdvdev_scan_stat_init(vdev_t *vd) 2851219089Spjd{ 2852219089Spjd vdev_stat_t *vs = &vd->vdev_stat; 2853219089Spjd 2854219089Spjd for (int c = 0; c < vd->vdev_children; c++) 2855219089Spjd vdev_scan_stat_init(vd->vdev_child[c]); 2856219089Spjd 2857219089Spjd mutex_enter(&vd->vdev_stat_lock); 2858219089Spjd vs->vs_scan_processed = 0; 2859219089Spjd mutex_exit(&vd->vdev_stat_lock); 2860219089Spjd} 2861219089Spjd 2862219089Spjdvoid 2863185029Spjdvdev_stat_update(zio_t *zio, uint64_t psize) 
2864185029Spjd{ 2865209962Smm spa_t *spa = zio->io_spa; 2866209962Smm vdev_t *rvd = spa->spa_root_vdev; 2867185029Spjd vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; 2868168404Spjd vdev_t *pvd; 2869168404Spjd uint64_t txg = zio->io_txg; 2870168404Spjd vdev_stat_t *vs = &vd->vdev_stat; 2871168404Spjd zio_type_t type = zio->io_type; 2872168404Spjd int flags = zio->io_flags; 2873168404Spjd 2874185029Spjd /* 2875185029Spjd * If this i/o is a gang leader, it didn't do any actual work. 2876185029Spjd */ 2877185029Spjd if (zio->io_gang_tree) 2878185029Spjd return; 2879185029Spjd 2880168404Spjd if (zio->io_error == 0) { 2881185029Spjd /* 2882185029Spjd * If this is a root i/o, don't count it -- we've already 2883185029Spjd * counted the top-level vdevs, and vdev_get_stats() will 2884185029Spjd * aggregate them when asked. This reduces contention on 2885185029Spjd * the root vdev_stat_lock and implicitly handles blocks 2886185029Spjd * that compress away to holes, for which there is no i/o. 2887185029Spjd * (Holes never create vdev children, so all the counters 2888185029Spjd * remain zero, which is what we want.) 2889185029Spjd * 2890185029Spjd * Note: this only applies to successful i/o (io_error == 0) 2891185029Spjd * because unlike i/o counts, errors are not additive. 2892185029Spjd * When reading a ditto block, for example, failure of 2893185029Spjd * one top-level vdev does not imply a root-level error. 
2894185029Spjd */ 2895185029Spjd if (vd == rvd) 2896185029Spjd return; 2897185029Spjd 2898185029Spjd ASSERT(vd == zio->io_vd); 2899209962Smm 2900209962Smm if (flags & ZIO_FLAG_IO_BYPASS) 2901209962Smm return; 2902209962Smm 2903209962Smm mutex_enter(&vd->vdev_stat_lock); 2904209962Smm 2905185029Spjd if (flags & ZIO_FLAG_IO_REPAIR) { 2906219089Spjd if (flags & ZIO_FLAG_SCAN_THREAD) { 2907219089Spjd dsl_scan_phys_t *scn_phys = 2908219089Spjd &spa->spa_dsl_pool->dp_scan->scn_phys; 2909219089Spjd uint64_t *processed = &scn_phys->scn_processed; 2910219089Spjd 2911219089Spjd /* XXX cleanup? */ 2912219089Spjd if (vd->vdev_ops->vdev_op_leaf) 2913219089Spjd atomic_add_64(processed, psize); 2914219089Spjd vs->vs_scan_processed += psize; 2915219089Spjd } 2916219089Spjd 2917209962Smm if (flags & ZIO_FLAG_SELF_HEAL) 2918185029Spjd vs->vs_self_healed += psize; 2919168404Spjd } 2920209962Smm 2921209962Smm vs->vs_ops[type]++; 2922209962Smm vs->vs_bytes[type] += psize; 2923209962Smm 2924209962Smm mutex_exit(&vd->vdev_stat_lock); 2925168404Spjd return; 2926168404Spjd } 2927168404Spjd 2928168404Spjd if (flags & ZIO_FLAG_SPECULATIVE) 2929168404Spjd return; 2930168404Spjd 2931213198Smm /* 2932213198Smm * If this is an I/O error that is going to be retried, then ignore the 2933213198Smm * error. Otherwise, the user may interpret B_FAILFAST I/O errors as 2934213198Smm * hard errors, when in reality they can happen for any number of 2935213198Smm * innocuous reasons (bus resets, MPxIO link failure, etc). 2936213198Smm */ 2937213198Smm if (zio->io_error == EIO && 2938213198Smm !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 2939213198Smm return; 2940213198Smm 2941219089Spjd /* 2942219089Spjd * Intent logs writes won't propagate their error to the root 2943219089Spjd * I/O so don't mark these types of failures as pool-level 2944219089Spjd * errors. 
2945219089Spjd */ 2946219089Spjd if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 2947219089Spjd return; 2948219089Spjd 2949185029Spjd mutex_enter(&vd->vdev_stat_lock); 2950209962Smm if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { 2951185029Spjd if (zio->io_error == ECKSUM) 2952185029Spjd vs->vs_checksum_errors++; 2953185029Spjd else 2954185029Spjd vs->vs_read_errors++; 2955168404Spjd } 2956209962Smm if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) 2957185029Spjd vs->vs_write_errors++; 2958185029Spjd mutex_exit(&vd->vdev_stat_lock); 2959168404Spjd 2960209962Smm if (type == ZIO_TYPE_WRITE && txg != 0 && 2961209962Smm (!(flags & ZIO_FLAG_IO_REPAIR) || 2962219089Spjd (flags & ZIO_FLAG_SCAN_THREAD) || 2963219089Spjd spa->spa_claiming)) { 2964209962Smm /* 2965219089Spjd * This is either a normal write (not a repair), or it's 2966219089Spjd * a repair induced by the scrub thread, or it's a repair 2967219089Spjd * made by zil_claim() during spa_load() in the first txg. 2968219089Spjd * In the normal case, we commit the DTL change in the same 2969219089Spjd * txg as the block was born. In the scrub-induced repair 2970219089Spjd * case, we know that scrubs run in first-pass syncing context, 2971219089Spjd * so we commit the DTL change in spa_syncing_txg(spa). 2972219089Spjd * In the zil_claim() case, we commit in spa_first_txg(spa). 2973209962Smm * 2974209962Smm * We currently do not make DTL entries for failed spontaneous 2975209962Smm * self-healing writes triggered by normal (non-scrubbing) 2976209962Smm * reads, because we have no transactional context in which to 2977209962Smm * do so -- and it's not clear that it'd be desirable anyway. 
2978209962Smm */ 2979209962Smm if (vd->vdev_ops->vdev_op_leaf) { 2980209962Smm uint64_t commit_txg = txg; 2981219089Spjd if (flags & ZIO_FLAG_SCAN_THREAD) { 2982209962Smm ASSERT(flags & ZIO_FLAG_IO_REPAIR); 2983209962Smm ASSERT(spa_sync_pass(spa) == 1); 2984209962Smm vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); 2985219089Spjd commit_txg = spa_syncing_txg(spa); 2986219089Spjd } else if (spa->spa_claiming) { 2987219089Spjd ASSERT(flags & ZIO_FLAG_IO_REPAIR); 2988219089Spjd commit_txg = spa_first_txg(spa); 2989209962Smm } 2990219089Spjd ASSERT(commit_txg >= spa_syncing_txg(spa)); 2991209962Smm if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) 2992168404Spjd return; 2993209962Smm for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2994209962Smm vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); 2995209962Smm vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); 2996168404Spjd } 2997209962Smm if (vd != rvd) 2998209962Smm vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); 2999168404Spjd } 3000168404Spjd} 3001168404Spjd 3002168404Spjd/* 3003219089Spjd * Update the in-core space usage stats for this vdev, its metaslab class, 3004219089Spjd * and the root vdev. 3005168404Spjd */ 3006168404Spjdvoid 3007219089Spjdvdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, 3008219089Spjd int64_t space_delta) 3009168404Spjd{ 3010168404Spjd int64_t dspace_delta = space_delta; 3011185029Spjd spa_t *spa = vd->vdev_spa; 3012185029Spjd vdev_t *rvd = spa->spa_root_vdev; 3013219089Spjd metaslab_group_t *mg = vd->vdev_mg; 3014219089Spjd metaslab_class_t *mc = mg ? mg->mg_class : NULL; 3015168404Spjd 3016185029Spjd ASSERT(vd == vd->vdev_top); 3017168404Spjd 3018185029Spjd /* 3019185029Spjd * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 3020185029Spjd * factor. We must calculate this here and not at the root vdev 3021185029Spjd * because the root vdev's psize-to-asize is simply the max of its 3022185029Spjd * childrens', thus not accurate enough for us. 
3023185029Spjd */ 3024185029Spjd ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 3025213197Smm ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); 3026185029Spjd dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 3027185029Spjd vd->vdev_deflate_ratio; 3028185029Spjd 3029185029Spjd mutex_enter(&vd->vdev_stat_lock); 3030219089Spjd vd->vdev_stat.vs_alloc += alloc_delta; 3031185029Spjd vd->vdev_stat.vs_space += space_delta; 3032185029Spjd vd->vdev_stat.vs_dspace += dspace_delta; 3033185029Spjd mutex_exit(&vd->vdev_stat_lock); 3034185029Spjd 3035219089Spjd if (mc == spa_normal_class(spa)) { 3036185029Spjd mutex_enter(&rvd->vdev_stat_lock); 3037219089Spjd rvd->vdev_stat.vs_alloc += alloc_delta; 3038185029Spjd rvd->vdev_stat.vs_space += space_delta; 3039185029Spjd rvd->vdev_stat.vs_dspace += dspace_delta; 3040185029Spjd mutex_exit(&rvd->vdev_stat_lock); 3041185029Spjd } 3042219089Spjd 3043219089Spjd if (mc != NULL) { 3044219089Spjd ASSERT(rvd == vd->vdev_parent); 3045219089Spjd ASSERT(vd->vdev_ms_count != 0); 3046219089Spjd 3047219089Spjd metaslab_class_space_update(mc, 3048219089Spjd alloc_delta, defer_delta, space_delta, dspace_delta); 3049219089Spjd } 3050168404Spjd} 3051168404Spjd 3052168404Spjd/* 3053168404Spjd * Mark a top-level vdev's config as dirty, placing it on the dirty list 3054168404Spjd * so that it will be written out next time the vdev configuration is synced. 3055168404Spjd * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 3056168404Spjd */ 3057168404Spjdvoid 3058168404Spjdvdev_config_dirty(vdev_t *vd) 3059168404Spjd{ 3060168404Spjd spa_t *spa = vd->vdev_spa; 3061168404Spjd vdev_t *rvd = spa->spa_root_vdev; 3062168404Spjd int c; 3063168404Spjd 3064219089Spjd ASSERT(spa_writeable(spa)); 3065219089Spjd 3066168404Spjd /* 3067209962Smm * If this is an aux vdev (as with l2cache and spare devices), then we 3068209962Smm * update the vdev config manually and set the sync flag. 
3069185029Spjd */ 3070185029Spjd if (vd->vdev_aux != NULL) { 3071185029Spjd spa_aux_vdev_t *sav = vd->vdev_aux; 3072185029Spjd nvlist_t **aux; 3073185029Spjd uint_t naux; 3074185029Spjd 3075185029Spjd for (c = 0; c < sav->sav_count; c++) { 3076185029Spjd if (sav->sav_vdevs[c] == vd) 3077185029Spjd break; 3078185029Spjd } 3079185029Spjd 3080185029Spjd if (c == sav->sav_count) { 3081185029Spjd /* 3082185029Spjd * We're being removed. There's nothing more to do. 3083185029Spjd */ 3084185029Spjd ASSERT(sav->sav_sync == B_TRUE); 3085185029Spjd return; 3086185029Spjd } 3087185029Spjd 3088185029Spjd sav->sav_sync = B_TRUE; 3089185029Spjd 3090209962Smm if (nvlist_lookup_nvlist_array(sav->sav_config, 3091209962Smm ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { 3092209962Smm VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 3093209962Smm ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); 3094209962Smm } 3095185029Spjd 3096185029Spjd ASSERT(c < naux); 3097185029Spjd 3098185029Spjd /* 3099185029Spjd * Setting the nvlist in the middle if the array is a little 3100185029Spjd * sketchy, but it will work. 3101185029Spjd */ 3102185029Spjd nvlist_free(aux[c]); 3103219089Spjd aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); 3104185029Spjd 3105185029Spjd return; 3106185029Spjd } 3107185029Spjd 3108185029Spjd /* 3109185029Spjd * The dirty list is protected by the SCL_CONFIG lock. The caller 3110185029Spjd * must either hold SCL_CONFIG as writer, or must be the sync thread 3111185029Spjd * (which holds SCL_CONFIG as reader). There's only one sync thread, 3112168404Spjd * so this is sufficient to ensure mutual exclusion. 
3113168404Spjd */ 3114185029Spjd ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3115185029Spjd (dsl_pool_sync_context(spa_get_dsl(spa)) && 3116185029Spjd spa_config_held(spa, SCL_CONFIG, RW_READER))); 3117168404Spjd 3118168404Spjd if (vd == rvd) { 3119168404Spjd for (c = 0; c < rvd->vdev_children; c++) 3120168404Spjd vdev_config_dirty(rvd->vdev_child[c]); 3121168404Spjd } else { 3122168404Spjd ASSERT(vd == vd->vdev_top); 3123168404Spjd 3124219089Spjd if (!list_link_active(&vd->vdev_config_dirty_node) && 3125219089Spjd !vd->vdev_ishole) 3126185029Spjd list_insert_head(&spa->spa_config_dirty_list, vd); 3127168404Spjd } 3128168404Spjd} 3129168404Spjd 3130168404Spjdvoid 3131168404Spjdvdev_config_clean(vdev_t *vd) 3132168404Spjd{ 3133168404Spjd spa_t *spa = vd->vdev_spa; 3134168404Spjd 3135185029Spjd ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3136185029Spjd (dsl_pool_sync_context(spa_get_dsl(spa)) && 3137185029Spjd spa_config_held(spa, SCL_CONFIG, RW_READER))); 3138168404Spjd 3139185029Spjd ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 3140185029Spjd list_remove(&spa->spa_config_dirty_list, vd); 3141168404Spjd} 3142168404Spjd 3143185029Spjd/* 3144185029Spjd * Mark a top-level vdev's state as dirty, so that the next pass of 3145185029Spjd * spa_sync() can convert this into vdev_config_dirty(). We distinguish 3146185029Spjd * the state changes from larger config changes because they require 3147185029Spjd * much less locking, and are often needed for administrative actions. 3148185029Spjd */ 3149168404Spjdvoid 3150185029Spjdvdev_state_dirty(vdev_t *vd) 3151185029Spjd{ 3152185029Spjd spa_t *spa = vd->vdev_spa; 3153185029Spjd 3154219089Spjd ASSERT(spa_writeable(spa)); 3155185029Spjd ASSERT(vd == vd->vdev_top); 3156185029Spjd 3157185029Spjd /* 3158185029Spjd * The state list is protected by the SCL_STATE lock. 
The caller 3159185029Spjd * must either hold SCL_STATE as writer, or must be the sync thread 3160185029Spjd * (which holds SCL_STATE as reader). There's only one sync thread, 3161185029Spjd * so this is sufficient to ensure mutual exclusion. 3162185029Spjd */ 3163185029Spjd ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3164185029Spjd (dsl_pool_sync_context(spa_get_dsl(spa)) && 3165185029Spjd spa_config_held(spa, SCL_STATE, RW_READER))); 3166185029Spjd 3167219089Spjd if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) 3168185029Spjd list_insert_head(&spa->spa_state_dirty_list, vd); 3169185029Spjd} 3170185029Spjd 3171185029Spjdvoid 3172185029Spjdvdev_state_clean(vdev_t *vd) 3173185029Spjd{ 3174185029Spjd spa_t *spa = vd->vdev_spa; 3175185029Spjd 3176185029Spjd ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3177185029Spjd (dsl_pool_sync_context(spa_get_dsl(spa)) && 3178185029Spjd spa_config_held(spa, SCL_STATE, RW_READER))); 3179185029Spjd 3180185029Spjd ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 3181185029Spjd list_remove(&spa->spa_state_dirty_list, vd); 3182185029Spjd} 3183185029Spjd 3184185029Spjd/* 3185185029Spjd * Propagate vdev state up from children to parent. 3186185029Spjd */ 3187185029Spjdvoid 3188168404Spjdvdev_propagate_state(vdev_t *vd) 3189168404Spjd{ 3190209962Smm spa_t *spa = vd->vdev_spa; 3191209962Smm vdev_t *rvd = spa->spa_root_vdev; 3192168404Spjd int degraded = 0, faulted = 0; 3193168404Spjd int corrupted = 0; 3194168404Spjd vdev_t *child; 3195168404Spjd 3196185029Spjd if (vd->vdev_children > 0) { 3197219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 3198185029Spjd child = vd->vdev_child[c]; 3199168404Spjd 3200219089Spjd /* 3201219089Spjd * Don't factor holes into the decision. 
3202219089Spjd */ 3203219089Spjd if (child->vdev_ishole) 3204219089Spjd continue; 3205219089Spjd 3206185029Spjd if (!vdev_readable(child) || 3207209962Smm (!vdev_writeable(child) && spa_writeable(spa))) { 3208185029Spjd /* 3209185029Spjd * Root special: if there is a top-level log 3210185029Spjd * device, treat the root vdev as if it were 3211185029Spjd * degraded. 3212185029Spjd */ 3213185029Spjd if (child->vdev_islog && vd == rvd) 3214185029Spjd degraded++; 3215185029Spjd else 3216185029Spjd faulted++; 3217185029Spjd } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 3218185029Spjd degraded++; 3219185029Spjd } 3220185029Spjd 3221185029Spjd if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 3222185029Spjd corrupted++; 3223185029Spjd } 3224185029Spjd 3225185029Spjd vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 3226185029Spjd 3227185029Spjd /* 3228185029Spjd * Root special: if there is a top-level vdev that cannot be 3229185029Spjd * opened due to corrupted metadata, then propagate the root 3230185029Spjd * vdev's aux state as 'corrupt' rather than 'insufficient 3231185029Spjd * replicas'. 3232185029Spjd */ 3233185029Spjd if (corrupted && vd == rvd && 3234185029Spjd rvd->vdev_state == VDEV_STATE_CANT_OPEN) 3235185029Spjd vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 3236185029Spjd VDEV_AUX_CORRUPT_DATA); 3237168404Spjd } 3238168404Spjd 3239185029Spjd if (vd->vdev_parent) 3240185029Spjd vdev_propagate_state(vd->vdev_parent); 3241168404Spjd} 3242168404Spjd 3243168404Spjd/* 3244168404Spjd * Set a vdev's state. If this is during an open, we don't update the parent 3245168404Spjd * state, because we're in the process of opening children depth-first. 3246168404Spjd * Otherwise, we propagate the change to the parent. 3247168404Spjd * 3248168404Spjd * If this routine places a device in a faulted state, an appropriate ereport is 3249168404Spjd * generated. 
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	/*
	 * Setting the same state again only updates the aux reason;
	 * none of the transition side effects below apply.
	 */
	if (state == vd->vdev_state) {
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	/* Remember the pre-transition state for the ereport below. */
	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device unless the device has requested
	 * a delayed close (i.e. we're about to remove or fault the device).
	 * Otherwise, we keep accessible but invalid devices open forever.
	 * We don't call vdev_close() itself, because that implies some extra
	 * checks (offline, etc) that we don't want here.  This is limited to
	 * leaf devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
	    vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	/*
	 * If we have brought this vdev back into service, we need
	 * to notify fmd so that it can gracefully repair any outstanding
	 * cases due to a missing device.  We do this in all cases, even those
	 * that probably don't correlate to a repaired fault.  This is sure to
	 * catch all cases, and we let the zfs-retire agent sort it out.  If
	 * this is a transient state it's OK, as the retire agent will
	 * double-check the state of the vdev before repairing it.
	 */
	if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
	    vd->vdev_prevstate != state)
		zfs_post_state_change(spa, vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We also let this be if
		 * it is one of our special test online cases, which is only
		 * attempting to online the device and shouldn't generate an FMA
		 * fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import or recovery, we
		 * mark it as "not available", which signifies that it was
		 * never there to begin with.  Failure to open such a device
		 * is not considered an error.
		 */
		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event.  If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {
			const char *class;

			/* Map the aux failure reason to an FMA ereport class. */
			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	/*
	 * During an open, children are opened depth-first, so the parent's
	 * state is recomputed by the caller instead of being propagated here.
	 */
	if (!isopen && vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Check the vdev configuration to ensure that it's capable of supporting
 * a root pool.
 *
 * On Solaris, we do not support RAID-Z or partial configuration.  In
 * addition, only a single top-level vdev is allowed and none of the
 * leaves can be wholedisks.
 *
 * For FreeBSD, we can boot from any configuration.  There is a
 * limitation that the boot filesystem must be either uncompressed or
 * compressed with lzjb compression but I'm not sure how to enforce
 * that here.
 */
boolean_t
vdev_is_bootable(vdev_t *vd)
{
#ifdef sun
	if (!vd->vdev_ops->vdev_op_leaf) {
		char *vdev_type = vd->vdev_ops->vdev_op_type;

		if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
		    vd->vdev_children > 1) {
			return (B_FALSE);
		} else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
		    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
			return (B_FALSE);
		}
	} else if (vd->vdev_wholedisk == 1) {
		return (B_FALSE);
	}

	/* All children must be bootable as well. */
	for (int c = 0; c < vd->vdev_children; c++) {
		if (!vdev_is_bootable(vd->vdev_child[c]))
			return (B_FALSE);
	}
#endif	/* sun */
	return (B_TRUE);
}

/*
 * Load the state from the original vdev tree (ovd) which
 * we've retrieved from the MOS config object.
If the original 3420219089Spjd * vdev was offline or faulted then we transfer that state to the 3421219089Spjd * device in the current vdev tree (nvd). 3422219089Spjd */ 3423213197Smmvoid 3424219089Spjdvdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3425213197Smm{ 3426219089Spjd spa_t *spa = nvd->vdev_spa; 3427213197Smm 3428219089Spjd ASSERT(nvd->vdev_top->vdev_islog); 3429219089Spjd ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3430219089Spjd ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3431213197Smm 3432219089Spjd for (int c = 0; c < nvd->vdev_children; c++) 3433219089Spjd vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3434213197Smm 3435219089Spjd if (nvd->vdev_ops->vdev_op_leaf) { 3436213197Smm /* 3437219089Spjd * Restore the persistent vdev state 3438213197Smm */ 3439219089Spjd nvd->vdev_offline = ovd->vdev_offline; 3440219089Spjd nvd->vdev_faulted = ovd->vdev_faulted; 3441219089Spjd nvd->vdev_degraded = ovd->vdev_degraded; 3442219089Spjd nvd->vdev_removed = ovd->vdev_removed; 3443213197Smm } 3444213197Smm} 3445219089Spjd 3446219089Spjd/* 3447219089Spjd * Determine if a log device has valid content. If the vdev was 3448219089Spjd * removed or faulted in the MOS config then we know that 3449219089Spjd * the content on the log device has already been written to the pool. 3450219089Spjd */ 3451219089Spjdboolean_t 3452219089Spjdvdev_log_state_valid(vdev_t *vd) 3453219089Spjd{ 3454219089Spjd if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3455219089Spjd !vd->vdev_removed) 3456219089Spjd return (B_TRUE); 3457219089Spjd 3458219089Spjd for (int c = 0; c < vd->vdev_children; c++) 3459219089Spjd if (vdev_log_state_valid(vd->vdev_child[c])) 3460219089Spjd return (B_TRUE); 3461219089Spjd 3462219089Spjd return (B_FALSE); 3463219089Spjd} 3464219089Spjd 3465219089Spjd/* 3466219089Spjd * Expand a vdev if possible. 
3467219089Spjd */ 3468219089Spjdvoid 3469219089Spjdvdev_expand(vdev_t *vd, uint64_t txg) 3470219089Spjd{ 3471219089Spjd ASSERT(vd->vdev_top == vd); 3472219089Spjd ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3473219089Spjd 3474219089Spjd if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { 3475219089Spjd VERIFY(vdev_metaslab_init(vd, txg) == 0); 3476219089Spjd vdev_config_dirty(vd); 3477219089Spjd } 3478219089Spjd} 3479219089Spjd 3480219089Spjd/* 3481219089Spjd * Split a vdev. 3482219089Spjd */ 3483219089Spjdvoid 3484219089Spjdvdev_split(vdev_t *vd) 3485219089Spjd{ 3486219089Spjd vdev_t *cvd, *pvd = vd->vdev_parent; 3487219089Spjd 3488219089Spjd vdev_remove_child(pvd, vd); 3489219089Spjd vdev_compact_children(pvd); 3490219089Spjd 3491219089Spjd cvd = pvd->vdev_child[0]; 3492219089Spjd if (pvd->vdev_children == 1) { 3493219089Spjd vdev_remove_parent(cvd); 3494219089Spjd cvd->vdev_splitting = B_TRUE; 3495219089Spjd } 3496219089Spjd vdev_propagate_state(cvd); 3497219089Spjd} 3498247265Smm 3499247265Smmvoid 3500247265Smmvdev_deadman(vdev_t *vd) 3501247265Smm{ 3502247265Smm for (int c = 0; c < vd->vdev_children; c++) { 3503247265Smm vdev_t *cvd = vd->vdev_child[c]; 3504247265Smm 3505247265Smm vdev_deadman(cvd); 3506247265Smm } 3507247265Smm 3508247265Smm if (vd->vdev_ops->vdev_op_leaf) { 3509247265Smm vdev_queue_t *vq = &vd->vdev_queue; 3510247265Smm 3511247265Smm mutex_enter(&vq->vq_lock); 3512260763Savg if (avl_numnodes(&vq->vq_active_tree) > 0) { 3513247265Smm spa_t *spa = vd->vdev_spa; 3514247265Smm zio_t *fio; 3515247265Smm uint64_t delta; 3516247265Smm 3517247265Smm /* 3518247265Smm * Look at the head of all the pending queues, 3519247265Smm * if any I/O has been outstanding for longer than 3520247265Smm * the spa_deadman_synctime we panic the system. 
3521247265Smm */ 3522260763Savg fio = avl_first(&vq->vq_active_tree); 3523249206Smm delta = gethrtime() - fio->io_timestamp; 3524249206Smm if (delta > spa_deadman_synctime(spa)) { 3525249206Smm zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " 3526249206Smm "delta %lluns, last io %lluns", 3527247265Smm fio->io_timestamp, delta, 3528247265Smm vq->vq_io_complete_ts); 3529247265Smm fm_panic("I/O to pool '%s' appears to be " 3530247348Smm "hung on vdev guid %llu at '%s'.", 3531247348Smm spa_name(spa), 3532247348Smm (long long unsigned int) vd->vdev_guid, 3533247348Smm vd->vdev_path); 3534247265Smm } 3535247265Smm } 3536247265Smm mutex_exit(&vq->vq_lock); 3537247265Smm } 3538247265Smm} 3539