1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24285001Savg * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 25285001Savg * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 26247348Smm * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 
27297112Smav * Copyright (c) 2014 Integros [integros.com] 28168404Spjd */ 29168404Spjd 30168404Spjd#include <sys/zfs_context.h> 31168404Spjd#include <sys/fm/fs/zfs.h> 32168404Spjd#include <sys/spa.h> 33168404Spjd#include <sys/spa_impl.h> 34168404Spjd#include <sys/dmu.h> 35168404Spjd#include <sys/dmu_tx.h> 36168404Spjd#include <sys/vdev_impl.h> 37168404Spjd#include <sys/uberblock_impl.h> 38168404Spjd#include <sys/metaslab.h> 39168404Spjd#include <sys/metaslab_impl.h> 40168404Spjd#include <sys/space_map.h> 41262093Savg#include <sys/space_reftree.h> 42168404Spjd#include <sys/zio.h> 43168404Spjd#include <sys/zap.h> 44168404Spjd#include <sys/fs/zfs.h> 45185029Spjd#include <sys/arc.h> 46213197Smm#include <sys/zil.h> 47219089Spjd#include <sys/dsl_scan.h> 48240868Spjd#include <sys/trim_map.h> 49168404Spjd 50168404SpjdSYSCTL_DECL(_vfs_zfs); 51168404SpjdSYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); 52168404Spjd 53168404Spjd/* 54168404Spjd * Virtual device management. 55168404Spjd */ 56168404Spjd 57266122Ssmh/* 58254591Sgibbs * The limit for ZFS to automatically increase a top-level vdev's ashift 59254591Sgibbs * from logical ashift to physical ashift. 60254591Sgibbs * 61254591Sgibbs * Example: one or more 512B emulation child vdevs 62254591Sgibbs * child->vdev_ashift = 9 (512 bytes) 63254591Sgibbs * child->vdev_physical_ashift = 12 (4096 bytes) 64254591Sgibbs * zfs_max_auto_ashift = 11 (2048 bytes) 65266122Ssmh * zfs_min_auto_ashift = 9 (512 bytes) 66254591Sgibbs * 67266122Ssmh * On pool creation or the addition of a new top-level vdev, ZFS will 68266122Ssmh * increase the ashift of the top-level vdev to 2048 as limited by 69266122Ssmh * zfs_max_auto_ashift. 
70254591Sgibbs * 71254591Sgibbs * Example: one or more 512B emulation child vdevs 72254591Sgibbs * child->vdev_ashift = 9 (512 bytes) 73254591Sgibbs * child->vdev_physical_ashift = 12 (4096 bytes) 74254591Sgibbs * zfs_max_auto_ashift = 13 (8192 bytes) 75266122Ssmh * zfs_min_auto_ashift = 9 (512 bytes) 76254591Sgibbs * 77266122Ssmh * On pool creation or the addition of a new top-level vdev, ZFS will 78266122Ssmh * increase the ashift of the top-level vdev to 4096 to match the 79266122Ssmh * max vdev_physical_ashift. 80266122Ssmh * 81266122Ssmh * Example: one or more 512B emulation child vdevs 82266122Ssmh * child->vdev_ashift = 9 (512 bytes) 83266122Ssmh * child->vdev_physical_ashift = 9 (512 bytes) 84266122Ssmh * zfs_max_auto_ashift = 13 (8192 bytes) 85266122Ssmh * zfs_min_auto_ashift = 12 (4096 bytes) 86266122Ssmh * 87266122Ssmh * On pool creation or the addition of a new top-level vdev, ZFS will 88266122Ssmh * increase the ashift of the top-level vdev to 4096 to match the 89266122Ssmh * zfs_min_auto_ashift. 
90254591Sgibbs */ 91254591Sgibbsstatic uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; 92266122Ssmhstatic uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; 93254591Sgibbs 94254591Sgibbsstatic int 95254591Sgibbssysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) 96254591Sgibbs{ 97254591Sgibbs uint64_t val; 98254591Sgibbs int err; 99254591Sgibbs 100254591Sgibbs val = zfs_max_auto_ashift; 101254591Sgibbs err = sysctl_handle_64(oidp, &val, 0, req); 102254591Sgibbs if (err != 0 || req->newptr == NULL) 103254591Sgibbs return (err); 104254591Sgibbs 105266122Ssmh if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift) 106266122Ssmh return (EINVAL); 107254591Sgibbs 108254591Sgibbs zfs_max_auto_ashift = val; 109254591Sgibbs 110254591Sgibbs return (0); 111254591Sgibbs} 112254591SgibbsSYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, 113254591Sgibbs CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 114254591Sgibbs sysctl_vfs_zfs_max_auto_ashift, "QU", 115266122Ssmh "Max ashift used when optimising for logical -> physical sectors size on " 116266122Ssmh "new top-level vdevs."); 117254591Sgibbs 118266122Ssmhstatic int 119266122Ssmhsysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) 120266122Ssmh{ 121266122Ssmh uint64_t val; 122266122Ssmh int err; 123266122Ssmh 124266122Ssmh val = zfs_min_auto_ashift; 125266122Ssmh err = sysctl_handle_64(oidp, &val, 0, req); 126266122Ssmh if (err != 0 || req->newptr == NULL) 127266122Ssmh return (err); 128266122Ssmh 129266122Ssmh if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift) 130266122Ssmh return (EINVAL); 131266122Ssmh 132266122Ssmh zfs_min_auto_ashift = val; 133266122Ssmh 134266122Ssmh return (0); 135266122Ssmh} 136266122SsmhSYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, 137266122Ssmh CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 138266122Ssmh sysctl_vfs_zfs_min_auto_ashift, "QU", 139266122Ssmh "Min ashift used when creating new top-level vdevs."); 140266122Ssmh 141168404Spjdstatic vdev_ops_t *vdev_ops_table[] = 
{ 142168404Spjd &vdev_root_ops, 143168404Spjd &vdev_raidz_ops, 144168404Spjd &vdev_mirror_ops, 145168404Spjd &vdev_replacing_ops, 146168404Spjd &vdev_spare_ops, 147168404Spjd#ifdef _KERNEL 148168404Spjd &vdev_geom_ops, 149168404Spjd#else 150168404Spjd &vdev_disk_ops, 151185029Spjd#endif 152168404Spjd &vdev_file_ops, 153168404Spjd &vdev_missing_ops, 154219089Spjd &vdev_hole_ops, 155168404Spjd NULL 156168404Spjd}; 157168404Spjd 158168404Spjd 159168404Spjd/* 160273343Sdelphij * When a vdev is added, it will be divided into approximately (but no 161273343Sdelphij * more than) this number of metaslabs. 162273343Sdelphij */ 163273343Sdelphijint metaslabs_per_vdev = 200; 164273343SdelphijSYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN, 165273343Sdelphij &metaslabs_per_vdev, 0, 166273343Sdelphij "When a vdev is added, how many metaslabs the vdev should be divided into"); 167273343Sdelphij 168273343Sdelphij/* 169168404Spjd * Given a vdev type, return the appropriate ops vector. 170168404Spjd */ 171168404Spjdstatic vdev_ops_t * 172168404Spjdvdev_getops(const char *type) 173168404Spjd{ 174168404Spjd vdev_ops_t *ops, **opspp; 175168404Spjd 176168404Spjd for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 177168404Spjd if (strcmp(ops->vdev_op_type, type) == 0) 178168404Spjd break; 179168404Spjd 180168404Spjd return (ops); 181168404Spjd} 182168404Spjd 183168404Spjd/* 184168404Spjd * Default asize function: return the MAX of psize with the asize of 185168404Spjd * all children. This is what's used by anything other than RAID-Z. 
186168404Spjd */ 187168404Spjduint64_t 188168404Spjdvdev_default_asize(vdev_t *vd, uint64_t psize) 189168404Spjd{ 190168404Spjd uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 191168404Spjd uint64_t csize; 192168404Spjd 193219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 194168404Spjd csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 195168404Spjd asize = MAX(asize, csize); 196168404Spjd } 197168404Spjd 198168404Spjd return (asize); 199168404Spjd} 200168404Spjd 201168404Spjd/* 202219089Spjd * Get the minimum allocatable size. We define the allocatable size as 203219089Spjd * the vdev's asize rounded to the nearest metaslab. This allows us to 204219089Spjd * replace or attach devices which don't have the same physical size but 205219089Spjd * can still satisfy the same number of allocations. 206168404Spjd */ 207168404Spjduint64_t 208219089Spjdvdev_get_min_asize(vdev_t *vd) 209168404Spjd{ 210219089Spjd vdev_t *pvd = vd->vdev_parent; 211168404Spjd 212219089Spjd /* 213236155Smm * If our parent is NULL (inactive spare or cache) or is the root, 214219089Spjd * just return our own asize. 215219089Spjd */ 216219089Spjd if (pvd == NULL) 217219089Spjd return (vd->vdev_asize); 218168404Spjd 219168404Spjd /* 220219089Spjd * The top-level vdev just returns the allocatable size rounded 221219089Spjd * to the nearest metaslab. 222168404Spjd */ 223219089Spjd if (vd == vd->vdev_top) 224219089Spjd return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); 225168404Spjd 226219089Spjd /* 227219089Spjd * The allocatable space for a raidz vdev is N * sizeof(smallest child), 228219089Spjd * so each child must provide at least 1/Nth of its asize. 
229219089Spjd */ 230219089Spjd if (pvd->vdev_ops == &vdev_raidz_ops) 231219089Spjd return (pvd->vdev_min_asize / pvd->vdev_children); 232168404Spjd 233219089Spjd return (pvd->vdev_min_asize); 234219089Spjd} 235168404Spjd 236219089Spjdvoid 237219089Spjdvdev_set_min_asize(vdev_t *vd) 238219089Spjd{ 239219089Spjd vd->vdev_min_asize = vdev_get_min_asize(vd); 240219089Spjd 241219089Spjd for (int c = 0; c < vd->vdev_children; c++) 242219089Spjd vdev_set_min_asize(vd->vdev_child[c]); 243168404Spjd} 244168404Spjd 245168404Spjdvdev_t * 246168404Spjdvdev_lookup_top(spa_t *spa, uint64_t vdev) 247168404Spjd{ 248168404Spjd vdev_t *rvd = spa->spa_root_vdev; 249168404Spjd 250185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 251185029Spjd 252185029Spjd if (vdev < rvd->vdev_children) { 253185029Spjd ASSERT(rvd->vdev_child[vdev] != NULL); 254168404Spjd return (rvd->vdev_child[vdev]); 255185029Spjd } 256168404Spjd 257168404Spjd return (NULL); 258168404Spjd} 259168404Spjd 260168404Spjdvdev_t * 261168404Spjdvdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 262168404Spjd{ 263168404Spjd vdev_t *mvd; 264168404Spjd 265168404Spjd if (vd->vdev_guid == guid) 266168404Spjd return (vd); 267168404Spjd 268219089Spjd for (int c = 0; c < vd->vdev_children; c++) 269168404Spjd if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 270168404Spjd NULL) 271168404Spjd return (mvd); 272168404Spjd 273168404Spjd return (NULL); 274168404Spjd} 275168404Spjd 276288569Smavstatic int 277288569Smavvdev_count_leaves_impl(vdev_t *vd) 278288569Smav{ 279288569Smav int n = 0; 280288569Smav 281288569Smav if (vd->vdev_ops->vdev_op_leaf) 282288569Smav return (1); 283288569Smav 284288569Smav for (int c = 0; c < vd->vdev_children; c++) 285288569Smav n += vdev_count_leaves_impl(vd->vdev_child[c]); 286288569Smav 287288569Smav return (n); 288288569Smav} 289288569Smav 290288569Smavint 291288569Smavvdev_count_leaves(spa_t *spa) 292288569Smav{ 293288569Smav return (vdev_count_leaves_impl(spa->spa_root_vdev)); 
294288569Smav} 295288569Smav 296168404Spjdvoid 297168404Spjdvdev_add_child(vdev_t *pvd, vdev_t *cvd) 298168404Spjd{ 299168404Spjd size_t oldsize, newsize; 300168404Spjd uint64_t id = cvd->vdev_id; 301168404Spjd vdev_t **newchild; 302285001Savg spa_t *spa = cvd->vdev_spa; 303168404Spjd 304285001Savg ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 305168404Spjd ASSERT(cvd->vdev_parent == NULL); 306168404Spjd 307168404Spjd cvd->vdev_parent = pvd; 308168404Spjd 309168404Spjd if (pvd == NULL) 310168404Spjd return; 311168404Spjd 312168404Spjd ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 313168404Spjd 314168404Spjd oldsize = pvd->vdev_children * sizeof (vdev_t *); 315168404Spjd pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 316168404Spjd newsize = pvd->vdev_children * sizeof (vdev_t *); 317168404Spjd 318168404Spjd newchild = kmem_zalloc(newsize, KM_SLEEP); 319168404Spjd if (pvd->vdev_child != NULL) { 320168404Spjd bcopy(pvd->vdev_child, newchild, oldsize); 321168404Spjd kmem_free(pvd->vdev_child, oldsize); 322168404Spjd } 323168404Spjd 324168404Spjd pvd->vdev_child = newchild; 325168404Spjd pvd->vdev_child[id] = cvd; 326168404Spjd 327168404Spjd cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 328168404Spjd ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 329168404Spjd 330168404Spjd /* 331168404Spjd * Walk up all ancestors to update guid sum. 
332168404Spjd */ 333168404Spjd for (; pvd != NULL; pvd = pvd->vdev_parent) 334168404Spjd pvd->vdev_guid_sum += cvd->vdev_guid_sum; 335168404Spjd} 336168404Spjd 337168404Spjdvoid 338168404Spjdvdev_remove_child(vdev_t *pvd, vdev_t *cvd) 339168404Spjd{ 340168404Spjd int c; 341168404Spjd uint_t id = cvd->vdev_id; 342168404Spjd 343168404Spjd ASSERT(cvd->vdev_parent == pvd); 344168404Spjd 345168404Spjd if (pvd == NULL) 346168404Spjd return; 347168404Spjd 348168404Spjd ASSERT(id < pvd->vdev_children); 349168404Spjd ASSERT(pvd->vdev_child[id] == cvd); 350168404Spjd 351168404Spjd pvd->vdev_child[id] = NULL; 352168404Spjd cvd->vdev_parent = NULL; 353168404Spjd 354168404Spjd for (c = 0; c < pvd->vdev_children; c++) 355168404Spjd if (pvd->vdev_child[c]) 356168404Spjd break; 357168404Spjd 358168404Spjd if (c == pvd->vdev_children) { 359168404Spjd kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 360168404Spjd pvd->vdev_child = NULL; 361168404Spjd pvd->vdev_children = 0; 362168404Spjd } 363168404Spjd 364168404Spjd /* 365168404Spjd * Walk up all ancestors to update guid sum. 366168404Spjd */ 367168404Spjd for (; pvd != NULL; pvd = pvd->vdev_parent) 368168404Spjd pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 369168404Spjd} 370168404Spjd 371168404Spjd/* 372168404Spjd * Remove any holes in the child array. 
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc;

	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/* First pass: count the surviving (non-NULL) children. */
	for (int c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	/* Second pass: repack survivors, renumbering vdev_id densely. */
	for (int c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	/*
	 * The first vdev allocated for a pool becomes the root vdev; it also
	 * gets a fresh load guid.  This must happen before guid generation
	 * below, which distinguishes root from non-root via spa_root_vdev.
	 */
	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_guid(NULL);
	}

	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;	/* no children yet, so sum == guid */
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
	/* One dirty-time-log range tree per DTL type, all under dtl_lock. */
	for (int t = 0; t < DTL_TYPES; t++) {
		vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
		    &vd->vdev_dtl_lock);
	}
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
/*
 * Returns 0 on success; SET_ERROR(EINVAL) for malformed or mismatched
 * config nvlists, SET_ERROR(ENOTSUP) when the pool version is too old
 * for the requested feature (slogs, holes, raidz2/3).
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (SET_ERROR(EINVAL));

	if ((ops = vdev_getops(type)) == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		/* The label's recorded id must agree with the caller's. */
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (SET_ERROR(EINVAL));

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (SET_ERROR(ENOTSUP));

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
		return (SET_ERROR(ENOTSUP));

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
				return (SET_ERROR(EINVAL));
			/*
			 * Previous versions could only support 1 or 2 parity
			 * device.
			 */
			if (nparity > 1 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ2)
				return (SET_ERROR(ENOTSUP));
			if (nparity > 2 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ3)
				return (SET_ERROR(ENOTSUP));
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
				return (SET_ERROR(EINVAL));
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	/* Duplicate the config's strings so the vdev owns its copies. */
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
		vd->vdev_fru = spa_strdup(vd->vdev_fru);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * Retrieve the vdev creation time.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	    &vd->vdev_crtxg);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
		    &vd->vdev_removing);
	}

	/* Top-level vdevs (except hot-spare attach) get a metaslab group. */
	if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
		    alloctype == VDEV_ALLOC_ADD ||
		    alloctype == VDEV_ALLOC_SPLIT ||
		    alloctype == VDEV_ALLOC_ROOTPOOL);
		vd->vdev_mg = metaslab_group_create(islog ?
		    spa_log_class(spa) : spa_normal_class(spa), vd);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}

		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			uint64_t spare = 0;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
			    &spare) == 0 && spare)
				spa_spare_add(vd);
		}

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
		    &vd->vdev_resilver_txg);

		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context.  Local vdevs will
		 * remain in the faulted state.
		 */
		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);

			if (vd->vdev_faulted || vd->vdev_degraded) {
				char *aux;

				vd->vdev_label_aux =
				    VDEV_AUX_ERR_EXCEEDED;
				if (nvlist_lookup_string(nv,
				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
				    strcmp(aux, "external") == 0)
					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
			}
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}

void
vdev_free(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	 * Free all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	/* vdev_remove_child() in the recursion above emptied the array. */
	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
	}

	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);
	if (vd->vdev_fru)
		spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	/* Tear down the DTL space map and range trees under their lock. */
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_close(vd->vdev_dtl_sm);
	for (int t = 0; t < DTL_TYPES; t++) {
		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
		range_tree_destroy(vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	mutex_destroy(&vd->vdev_queue_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	/* Move metaslab accounting; zero the source so it owns nothing. */
	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	if (tvd->vdev_mg)
		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	/* Re-home any per-txg dirty entries from svd's lists onto tvd's. */
	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if
 (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;
}

/*
 * Point vd and every descendant at the top-level vdev tvd.
 */
static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/* The new interior vdev takes over cvd's slot in pvd. */
	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	/* Inherit the child's geometry and state. */
	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_max_asize = cvd->vdev_max_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	/* cvd becomes the last child of the new interior vdev. */
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;
	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);

	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
912185029Spjd */ 913209962Smm if (mvd->vdev_top == mvd) { 914209962Smm uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; 915219089Spjd cvd->vdev_orig_guid = cvd->vdev_guid; 916209962Smm cvd->vdev_guid += guid_delta; 917209962Smm cvd->vdev_guid_sum += guid_delta; 918209962Smm } 919168404Spjd cvd->vdev_id = mvd->vdev_id; 920168404Spjd vdev_add_child(pvd, cvd); 921168404Spjd vdev_top_update(cvd->vdev_top, cvd->vdev_top); 922168404Spjd 923168404Spjd if (cvd == cvd->vdev_top) 924168404Spjd vdev_top_transfer(mvd, cvd); 925168404Spjd 926168404Spjd ASSERT(mvd->vdev_children == 0); 927168404Spjd vdev_free(mvd); 928168404Spjd} 929168404Spjd 930168404Spjdint 931168404Spjdvdev_metaslab_init(vdev_t *vd, uint64_t txg) 932168404Spjd{ 933168404Spjd spa_t *spa = vd->vdev_spa; 934168404Spjd objset_t *mos = spa->spa_meta_objset; 935168404Spjd uint64_t m; 936168404Spjd uint64_t oldc = vd->vdev_ms_count; 937168404Spjd uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 938168404Spjd metaslab_t **mspp; 939168404Spjd int error; 940168404Spjd 941219089Spjd ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 942219089Spjd 943219089Spjd /* 944219089Spjd * This vdev is not being allocated from yet or is a hole. 945219089Spjd */ 946219089Spjd if (vd->vdev_ms_shift == 0) 947168404Spjd return (0); 948168404Spjd 949219089Spjd ASSERT(!vd->vdev_ishole); 950219089Spjd 951213197Smm /* 952213197Smm * Compute the raidz-deflation ratio. Note, we hard-code 953276081Sdelphij * in 128k (1 << 17) because it is the "typical" blocksize. 954276081Sdelphij * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, 955276081Sdelphij * otherwise it would inconsistently account for existing bp's. 
956213197Smm */ 957213197Smm vd->vdev_deflate_ratio = (1 << 17) / 958213197Smm (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); 959213197Smm 960168404Spjd ASSERT(oldc <= newc); 961168404Spjd 962168404Spjd mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 963168404Spjd 964168404Spjd if (oldc != 0) { 965168404Spjd bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); 966168404Spjd kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 967168404Spjd } 968168404Spjd 969168404Spjd vd->vdev_ms = mspp; 970168404Spjd vd->vdev_ms_count = newc; 971168404Spjd 972168404Spjd for (m = oldc; m < newc; m++) { 973262093Savg uint64_t object = 0; 974262093Savg 975168404Spjd if (txg == 0) { 976168404Spjd error = dmu_read(mos, vd->vdev_ms_array, 977209962Smm m * sizeof (uint64_t), sizeof (uint64_t), &object, 978209962Smm DMU_READ_PREFETCH); 979168404Spjd if (error) 980168404Spjd return (error); 981168404Spjd } 982277553Sdelphij 983277553Sdelphij error = metaslab_init(vd->vdev_mg, m, object, txg, 984277553Sdelphij &(vd->vdev_ms[m])); 985277553Sdelphij if (error) 986277553Sdelphij return (error); 987168404Spjd } 988168404Spjd 989219089Spjd if (txg == 0) 990219089Spjd spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); 991219089Spjd 992219089Spjd /* 993219089Spjd * If the vdev is being removed we don't activate 994219089Spjd * the metaslabs since we want to ensure that no new 995219089Spjd * allocations are performed on this device. 
996219089Spjd */ 997219089Spjd if (oldc == 0 && !vd->vdev_removing) 998219089Spjd metaslab_group_activate(vd->vdev_mg); 999219089Spjd 1000219089Spjd if (txg == 0) 1001219089Spjd spa_config_exit(spa, SCL_ALLOC, FTAG); 1002219089Spjd 1003168404Spjd return (0); 1004168404Spjd} 1005168404Spjd 1006168404Spjdvoid 1007168404Spjdvdev_metaslab_fini(vdev_t *vd) 1008168404Spjd{ 1009168404Spjd uint64_t m; 1010168404Spjd uint64_t count = vd->vdev_ms_count; 1011168404Spjd 1012168404Spjd if (vd->vdev_ms != NULL) { 1013219089Spjd metaslab_group_passivate(vd->vdev_mg); 1014262093Savg for (m = 0; m < count; m++) { 1015262093Savg metaslab_t *msp = vd->vdev_ms[m]; 1016262093Savg 1017262093Savg if (msp != NULL) 1018262093Savg metaslab_fini(msp); 1019262093Savg } 1020168404Spjd kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 1021168404Spjd vd->vdev_ms = NULL; 1022168404Spjd } 1023168404Spjd} 1024168404Spjd 1025185029Spjdtypedef struct vdev_probe_stats { 1026185029Spjd boolean_t vps_readable; 1027185029Spjd boolean_t vps_writeable; 1028185029Spjd int vps_flags; 1029185029Spjd} vdev_probe_stats_t; 1030185029Spjd 1031185029Spjdstatic void 1032185029Spjdvdev_probe_done(zio_t *zio) 1033185029Spjd{ 1034209962Smm spa_t *spa = zio->io_spa; 1035209962Smm vdev_t *vd = zio->io_vd; 1036185029Spjd vdev_probe_stats_t *vps = zio->io_private; 1037185029Spjd 1038209962Smm ASSERT(vd->vdev_probe_zio != NULL); 1039209962Smm 1040185029Spjd if (zio->io_type == ZIO_TYPE_READ) { 1041185029Spjd if (zio->io_error == 0) 1042185029Spjd vps->vps_readable = 1; 1043209962Smm if (zio->io_error == 0 && spa_writeable(spa)) { 1044209962Smm zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, 1045185029Spjd zio->io_offset, zio->io_size, zio->io_data, 1046185029Spjd ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1047185029Spjd ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); 1048185029Spjd } else { 1049185029Spjd zio_buf_free(zio->io_data, zio->io_size); 1050185029Spjd } 1051185029Spjd } else if (zio->io_type == 
ZIO_TYPE_WRITE) { 1052185029Spjd if (zio->io_error == 0) 1053185029Spjd vps->vps_writeable = 1; 1054185029Spjd zio_buf_free(zio->io_data, zio->io_size); 1055185029Spjd } else if (zio->io_type == ZIO_TYPE_NULL) { 1056209962Smm zio_t *pio; 1057185029Spjd 1058185029Spjd vd->vdev_cant_read |= !vps->vps_readable; 1059185029Spjd vd->vdev_cant_write |= !vps->vps_writeable; 1060185029Spjd 1061185029Spjd if (vdev_readable(vd) && 1062209962Smm (vdev_writeable(vd) || !spa_writeable(spa))) { 1063185029Spjd zio->io_error = 0; 1064185029Spjd } else { 1065185029Spjd ASSERT(zio->io_error != 0); 1066185029Spjd zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 1067209962Smm spa, vd, NULL, 0, 0); 1068249195Smm zio->io_error = SET_ERROR(ENXIO); 1069185029Spjd } 1070209962Smm 1071209962Smm mutex_enter(&vd->vdev_probe_lock); 1072209962Smm ASSERT(vd->vdev_probe_zio == zio); 1073209962Smm vd->vdev_probe_zio = NULL; 1074209962Smm mutex_exit(&vd->vdev_probe_lock); 1075209962Smm 1076307279Smav zio_link_t *zl = NULL; 1077307279Smav while ((pio = zio_walk_parents(zio, &zl)) != NULL) 1078209962Smm if (!vdev_accessible(vd, pio)) 1079249195Smm pio->io_error = SET_ERROR(ENXIO); 1080209962Smm 1081185029Spjd kmem_free(vps, sizeof (*vps)); 1082185029Spjd } 1083185029Spjd} 1084185029Spjd 1085168404Spjd/* 1086251631Sdelphij * Determine whether this device is accessible. 1087251631Sdelphij * 1088251631Sdelphij * Read and write to several known locations: the pad regions of each 1089251631Sdelphij * vdev label but the first, which we leave alone in case it contains 1090251631Sdelphij * a VTOC. 1091185029Spjd */ 1092185029Spjdzio_t * 1093209962Smmvdev_probe(vdev_t *vd, zio_t *zio) 1094185029Spjd{ 1095185029Spjd spa_t *spa = vd->vdev_spa; 1096209962Smm vdev_probe_stats_t *vps = NULL; 1097209962Smm zio_t *pio; 1098185029Spjd 1099209962Smm ASSERT(vd->vdev_ops->vdev_op_leaf); 1100185029Spjd 1101209962Smm /* 1102209962Smm * Don't probe the probe. 
1103209962Smm */ 1104209962Smm if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) 1105209962Smm return (NULL); 1106185029Spjd 1107209962Smm /* 1108209962Smm * To prevent 'probe storms' when a device fails, we create 1109209962Smm * just one probe i/o at a time. All zios that want to probe 1110209962Smm * this vdev will become parents of the probe io. 1111209962Smm */ 1112209962Smm mutex_enter(&vd->vdev_probe_lock); 1113209962Smm 1114209962Smm if ((pio = vd->vdev_probe_zio) == NULL) { 1115209962Smm vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); 1116209962Smm 1117209962Smm vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | 1118209962Smm ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | 1119213198Smm ZIO_FLAG_TRYHARD; 1120209962Smm 1121209962Smm if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { 1122209962Smm /* 1123209962Smm * vdev_cant_read and vdev_cant_write can only 1124209962Smm * transition from TRUE to FALSE when we have the 1125209962Smm * SCL_ZIO lock as writer; otherwise they can only 1126209962Smm * transition from FALSE to TRUE. This ensures that 1127209962Smm * any zio looking at these values can assume that 1128209962Smm * failures persist for the life of the I/O. That's 1129209962Smm * important because when a device has intermittent 1130209962Smm * connectivity problems, we want to ensure that 1131209962Smm * they're ascribed to the device (ENXIO) and not 1132209962Smm * the zio (EIO). 1133209962Smm * 1134209962Smm * Since we hold SCL_ZIO as writer here, clear both 1135209962Smm * values so the probe can reevaluate from first 1136209962Smm * principles. 
1137209962Smm */ 1138209962Smm vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; 1139209962Smm vd->vdev_cant_read = B_FALSE; 1140209962Smm vd->vdev_cant_write = B_FALSE; 1141209962Smm } 1142209962Smm 1143209962Smm vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, 1144209962Smm vdev_probe_done, vps, 1145209962Smm vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); 1146209962Smm 1147219089Spjd /* 1148219089Spjd * We can't change the vdev state in this context, so we 1149219089Spjd * kick off an async task to do it on our behalf. 1150219089Spjd */ 1151209962Smm if (zio != NULL) { 1152209962Smm vd->vdev_probe_wanted = B_TRUE; 1153209962Smm spa_async_request(spa, SPA_ASYNC_PROBE); 1154209962Smm } 1155185029Spjd } 1156185029Spjd 1157209962Smm if (zio != NULL) 1158209962Smm zio_add_child(zio, pio); 1159185029Spjd 1160209962Smm mutex_exit(&vd->vdev_probe_lock); 1161185029Spjd 1162209962Smm if (vps == NULL) { 1163209962Smm ASSERT(zio != NULL); 1164209962Smm return (NULL); 1165209962Smm } 1166185029Spjd 1167185029Spjd for (int l = 1; l < VDEV_LABELS; l++) { 1168209962Smm zio_nowait(zio_read_phys(pio, vd, 1169185029Spjd vdev_label_offset(vd->vdev_psize, l, 1170209962Smm offsetof(vdev_label_t, vl_pad2)), 1171209962Smm VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), 1172185029Spjd ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1173185029Spjd ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); 1174185029Spjd } 1175185029Spjd 1176209962Smm if (zio == NULL) 1177209962Smm return (pio); 1178209962Smm 1179209962Smm zio_nowait(pio); 1180209962Smm return (NULL); 1181185029Spjd} 1182185029Spjd 1183219089Spjdstatic void 1184219089Spjdvdev_open_child(void *arg) 1185219089Spjd{ 1186219089Spjd vdev_t *vd = arg; 1187219089Spjd 1188219089Spjd vd->vdev_open_thread = curthread; 1189219089Spjd vd->vdev_open_error = vdev_open(vd); 1190219089Spjd vd->vdev_open_thread = NULL; 1191219089Spjd} 1192219089Spjd 1193219089Spjdboolean_t 1194219089Spjdvdev_uses_zvols(vdev_t *vd) 1195219089Spjd{ 1196219089Spjd if (vd->vdev_path && 
strncmp(vd->vdev_path, ZVOL_DIR, 1197219089Spjd strlen(ZVOL_DIR)) == 0) 1198219089Spjd return (B_TRUE); 1199219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1200219089Spjd if (vdev_uses_zvols(vd->vdev_child[c])) 1201219089Spjd return (B_TRUE); 1202219089Spjd return (B_FALSE); 1203219089Spjd} 1204219089Spjd 1205219089Spjdvoid 1206219089Spjdvdev_open_children(vdev_t *vd) 1207219089Spjd{ 1208219089Spjd taskq_t *tq; 1209219089Spjd int children = vd->vdev_children; 1210219089Spjd 1211219089Spjd /* 1212219089Spjd * in order to handle pools on top of zvols, do the opens 1213219089Spjd * in a single thread so that the same thread holds the 1214219089Spjd * spa_namespace_lock 1215219089Spjd */ 1216219089Spjd if (B_TRUE || vdev_uses_zvols(vd)) { 1217219089Spjd for (int c = 0; c < children; c++) 1218219089Spjd vd->vdev_child[c]->vdev_open_error = 1219219089Spjd vdev_open(vd->vdev_child[c]); 1220219089Spjd return; 1221219089Spjd } 1222219089Spjd tq = taskq_create("vdev_open", children, minclsyspri, 1223219089Spjd children, children, TASKQ_PREPOPULATE); 1224219089Spjd 1225219089Spjd for (int c = 0; c < children; c++) 1226219089Spjd VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], 1227219089Spjd TQ_SLEEP) != 0); 1228219089Spjd 1229219089Spjd taskq_destroy(tq); 1230219089Spjd} 1231219089Spjd 1232185029Spjd/* 1233168404Spjd * Prepare a virtual device for access. 
1234168404Spjd */ 1235168404Spjdint 1236168404Spjdvdev_open(vdev_t *vd) 1237168404Spjd{ 1238209962Smm spa_t *spa = vd->vdev_spa; 1239168404Spjd int error; 1240168404Spjd uint64_t osize = 0; 1241236155Smm uint64_t max_osize = 0; 1242236155Smm uint64_t asize, max_asize, psize; 1243254591Sgibbs uint64_t logical_ashift = 0; 1244254591Sgibbs uint64_t physical_ashift = 0; 1245168404Spjd 1246219089Spjd ASSERT(vd->vdev_open_thread == curthread || 1247219089Spjd spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1248168404Spjd ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 1249168404Spjd vd->vdev_state == VDEV_STATE_CANT_OPEN || 1250168404Spjd vd->vdev_state == VDEV_STATE_OFFLINE); 1251168404Spjd 1252168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1253213197Smm vd->vdev_cant_read = B_FALSE; 1254213197Smm vd->vdev_cant_write = B_FALSE; 1255274800Ssmh vd->vdev_notrim = B_FALSE; 1256219089Spjd vd->vdev_min_asize = vdev_get_min_asize(vd); 1257168404Spjd 1258219089Spjd /* 1259219089Spjd * If this vdev is not removed, check its fault status. If it's 1260219089Spjd * faulted, bail out of the open. 
1261219089Spjd */ 1262185029Spjd if (!vd->vdev_removed && vd->vdev_faulted) { 1263168404Spjd ASSERT(vd->vdev_children == 0); 1264219089Spjd ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1265219089Spjd vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1266185029Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1267219089Spjd vd->vdev_label_aux); 1268249195Smm return (SET_ERROR(ENXIO)); 1269185029Spjd } else if (vd->vdev_offline) { 1270185029Spjd ASSERT(vd->vdev_children == 0); 1271168404Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 1272249195Smm return (SET_ERROR(ENXIO)); 1273168404Spjd } 1274168404Spjd 1275254591Sgibbs error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, 1276254591Sgibbs &logical_ashift, &physical_ashift); 1277168404Spjd 1278219089Spjd /* 1279219089Spjd * Reset the vdev_reopening flag so that we actually close 1280219089Spjd * the vdev on error. 1281219089Spjd */ 1282219089Spjd vd->vdev_reopening = B_FALSE; 1283168404Spjd if (zio_injection_enabled && error == 0) 1284213198Smm error = zio_handle_device_injection(vd, NULL, ENXIO); 1285168404Spjd 1286185029Spjd if (error) { 1287185029Spjd if (vd->vdev_removed && 1288185029Spjd vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 1289185029Spjd vd->vdev_removed = B_FALSE; 1290168404Spjd 1291168404Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1292168404Spjd vd->vdev_stat.vs_aux); 1293168404Spjd return (error); 1294168404Spjd } 1295168404Spjd 1296185029Spjd vd->vdev_removed = B_FALSE; 1297168404Spjd 1298219089Spjd /* 1299219089Spjd * Recheck the faulted flag now that we have confirmed that 1300219089Spjd * the vdev is accessible. If we're faulted, bail. 
1301219089Spjd */ 1302219089Spjd if (vd->vdev_faulted) { 1303219089Spjd ASSERT(vd->vdev_children == 0); 1304219089Spjd ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1305219089Spjd vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1306219089Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1307219089Spjd vd->vdev_label_aux); 1308249195Smm return (SET_ERROR(ENXIO)); 1309219089Spjd } 1310219089Spjd 1311185029Spjd if (vd->vdev_degraded) { 1312185029Spjd ASSERT(vd->vdev_children == 0); 1313185029Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1314185029Spjd VDEV_AUX_ERR_EXCEEDED); 1315185029Spjd } else { 1316219089Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); 1317185029Spjd } 1318185029Spjd 1319219089Spjd /* 1320219089Spjd * For hole or missing vdevs we just return success. 1321219089Spjd */ 1322219089Spjd if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) 1323219089Spjd return (0); 1324219089Spjd 1325274800Ssmh if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf) 1326240868Spjd trim_map_create(vd); 1327240868Spjd 1328219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 1329168404Spjd if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 1330168404Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1331168404Spjd VDEV_AUX_NONE); 1332168404Spjd break; 1333168404Spjd } 1334219089Spjd } 1335168404Spjd 1336168404Spjd osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 1337236155Smm max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); 1338168404Spjd 1339168404Spjd if (vd->vdev_children == 0) { 1340168404Spjd if (osize < SPA_MINDEVSIZE) { 1341168404Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1342168404Spjd VDEV_AUX_TOO_SMALL); 1343249195Smm return (SET_ERROR(EOVERFLOW)); 1344168404Spjd } 1345168404Spjd psize = osize; 1346168404Spjd asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 1347236155Smm max_asize = max_osize - (VDEV_LABEL_START_SIZE + 1348236155Smm VDEV_LABEL_END_SIZE); 
1349168404Spjd } else { 1350168404Spjd if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 1351168404Spjd (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 1352168404Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1353168404Spjd VDEV_AUX_TOO_SMALL); 1354249195Smm return (SET_ERROR(EOVERFLOW)); 1355168404Spjd } 1356168404Spjd psize = 0; 1357168404Spjd asize = osize; 1358236155Smm max_asize = max_osize; 1359168404Spjd } 1360168404Spjd 1361168404Spjd vd->vdev_psize = psize; 1362168404Spjd 1363219089Spjd /* 1364219089Spjd * Make sure the allocatable size hasn't shrunk. 1365219089Spjd */ 1366219089Spjd if (asize < vd->vdev_min_asize) { 1367219089Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1368219089Spjd VDEV_AUX_BAD_LABEL); 1369249195Smm return (SET_ERROR(EINVAL)); 1370219089Spjd } 1371219089Spjd 1372254591Sgibbs vd->vdev_physical_ashift = 1373254591Sgibbs MAX(physical_ashift, vd->vdev_physical_ashift); 1374254591Sgibbs vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); 1375254591Sgibbs vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); 1376254591Sgibbs 1377254591Sgibbs if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { 1378254591Sgibbs vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1379254591Sgibbs VDEV_AUX_ASHIFT_TOO_BIG); 1380254591Sgibbs return (EINVAL); 1381254591Sgibbs } 1382254591Sgibbs 1383168404Spjd if (vd->vdev_asize == 0) { 1384168404Spjd /* 1385168404Spjd * This is the first-ever open, so use the computed values. 1386168404Spjd * For testing purposes, a higher ashift can be requested. 1387168404Spjd */ 1388168404Spjd vd->vdev_asize = asize; 1389236155Smm vd->vdev_max_asize = max_asize; 1390168404Spjd } else { 1391168404Spjd /* 1392254591Sgibbs * Make sure the alignment requirement hasn't increased. 
1393168404Spjd */ 1394254591Sgibbs if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && 1395253441Sdelphij vd->vdev_ops->vdev_op_leaf) { 1396254591Sgibbs vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1397254591Sgibbs VDEV_AUX_BAD_LABEL); 1398254591Sgibbs return (EINVAL); 1399168404Spjd } 1400236155Smm vd->vdev_max_asize = max_asize; 1401219089Spjd } 1402168404Spjd 1403219089Spjd /* 1404219089Spjd * If all children are healthy and the asize has increased, 1405219089Spjd * then we've experienced dynamic LUN growth. If automatic 1406219089Spjd * expansion is enabled then use the additional space. 1407219089Spjd */ 1408219089Spjd if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && 1409219089Spjd (vd->vdev_expanding || spa->spa_autoexpand)) 1410219089Spjd vd->vdev_asize = asize; 1411168404Spjd 1412219089Spjd vdev_set_min_asize(vd); 1413168404Spjd 1414168404Spjd /* 1415185029Spjd * Ensure we can issue some IO before declaring the 1416185029Spjd * vdev open for business. 1417185029Spjd */ 1418185029Spjd if (vd->vdev_ops->vdev_op_leaf && 1419185029Spjd (error = zio_wait(vdev_probe(vd, NULL))) != 0) { 1420219089Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1421219089Spjd VDEV_AUX_ERR_EXCEEDED); 1422185029Spjd return (error); 1423185029Spjd } 1424185029Spjd 1425185029Spjd /* 1426285001Savg * Track the min and max ashift values for normal data devices. 1427285001Savg */ 1428285001Savg if (vd->vdev_top == vd && vd->vdev_ashift != 0 && 1429285001Savg !vd->vdev_islog && vd->vdev_aux == NULL) { 1430285001Savg if (vd->vdev_ashift > spa->spa_max_ashift) 1431285001Savg spa->spa_max_ashift = vd->vdev_ashift; 1432285001Savg if (vd->vdev_ashift < spa->spa_min_ashift) 1433285001Savg spa->spa_min_ashift = vd->vdev_ashift; 1434285001Savg } 1435285001Savg 1436285001Savg /* 1437185029Spjd * If a leaf vdev has a DTL, and seems healthy, then kick off a 1438209962Smm * resilver. 
But don't do this if we are doing a reopen for a scrub, 1439209962Smm * since this would just restart the scrub we are already doing. 1440168404Spjd */ 1441209962Smm if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && 1442209962Smm vdev_resilver_needed(vd, NULL, NULL)) 1443209962Smm spa_async_request(spa, SPA_ASYNC_RESILVER); 1444168404Spjd 1445168404Spjd return (0); 1446168404Spjd} 1447168404Spjd 1448168404Spjd/* 1449168404Spjd * Called once the vdevs are all opened, this routine validates the label 1450168404Spjd * contents. This needs to be done before vdev_load() so that we don't 1451185029Spjd * inadvertently do repair I/Os to the wrong device. 1452168404Spjd * 1453230514Smm * If 'strict' is false ignore the spa guid check. This is necessary because 1454230514Smm * if the machine crashed during a re-guid the new guid might have been written 1455230514Smm * to all of the vdev labels, but not the cached config. The strict check 1456230514Smm * will be performed when the pool is opened again using the mos config. 1457230514Smm * 1458168404Spjd * This function will only return failure if one of the vdevs indicates that it 1459168404Spjd * has since been destroyed or exported. This is only possible if 1460168404Spjd * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 1461168404Spjd * will be updated but the function will return 0. 1462168404Spjd */ 1463168404Spjdint 1464230514Smmvdev_validate(vdev_t *vd, boolean_t strict) 1465168404Spjd{ 1466168404Spjd spa_t *spa = vd->vdev_spa; 1467168404Spjd nvlist_t *label; 1468219089Spjd uint64_t guid = 0, top_guid; 1469168404Spjd uint64_t state; 1470168404Spjd 1471219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1472230514Smm if (vdev_validate(vd->vdev_child[c], strict) != 0) 1473249195Smm return (SET_ERROR(EBADF)); 1474168404Spjd 1475168404Spjd /* 1476168404Spjd * If the device has already failed, or was marked offline, don't do 1477168404Spjd * any further validation. 
Otherwise, label I/O will fail and we will 1478168404Spjd * overwrite the previous state. 1479168404Spjd */ 1480185029Spjd if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { 1481219089Spjd uint64_t aux_guid = 0; 1482219089Spjd nvlist_t *nvl; 1483246631Smm uint64_t txg = spa_last_synced_txg(spa) != 0 ? 1484246631Smm spa_last_synced_txg(spa) : -1ULL; 1485168404Spjd 1486239620Smm if ((label = vdev_label_read_config(vd, txg)) == NULL) { 1487168404Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1488168404Spjd VDEV_AUX_BAD_LABEL); 1489168404Spjd return (0); 1490168404Spjd } 1491168404Spjd 1492219089Spjd /* 1493219089Spjd * Determine if this vdev has been split off into another 1494219089Spjd * pool. If so, then refuse to open it. 1495219089Spjd */ 1496219089Spjd if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, 1497219089Spjd &aux_guid) == 0 && aux_guid == spa_guid(spa)) { 1498219089Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1499219089Spjd VDEV_AUX_SPLIT_POOL); 1500219089Spjd nvlist_free(label); 1501219089Spjd return (0); 1502219089Spjd } 1503219089Spjd 1504230514Smm if (strict && (nvlist_lookup_uint64(label, 1505230514Smm ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || 1506230514Smm guid != spa_guid(spa))) { 1507168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1508168404Spjd VDEV_AUX_CORRUPT_DATA); 1509168404Spjd nvlist_free(label); 1510168404Spjd return (0); 1511168404Spjd } 1512168404Spjd 1513219089Spjd if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) 1514219089Spjd != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, 1515219089Spjd &aux_guid) != 0) 1516219089Spjd aux_guid = 0; 1517219089Spjd 1518185029Spjd /* 1519185029Spjd * If this vdev just became a top-level vdev because its 1520185029Spjd * sibling was detached, it will have adopted the parent's 1521185029Spjd * vdev guid -- but the label may or may not be on disk yet. 
1522185029Spjd * Fortunately, either version of the label will have the 1523185029Spjd * same top guid, so if we're a top-level vdev, we can 1524185029Spjd * safely compare to that instead. 1525219089Spjd * 1526219089Spjd * If we split this vdev off instead, then we also check the 1527219089Spjd * original pool's guid. We don't want to consider the vdev 1528219089Spjd * corrupt if it is partway through a split operation. 1529185029Spjd */ 1530168404Spjd if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, 1531185029Spjd &guid) != 0 || 1532185029Spjd nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, 1533185029Spjd &top_guid) != 0 || 1534219089Spjd ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && 1535185029Spjd (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { 1536168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1537168404Spjd VDEV_AUX_CORRUPT_DATA); 1538168404Spjd nvlist_free(label); 1539168404Spjd return (0); 1540168404Spjd } 1541168404Spjd 1542168404Spjd if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 1543168404Spjd &state) != 0) { 1544168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1545168404Spjd VDEV_AUX_CORRUPT_DATA); 1546168404Spjd nvlist_free(label); 1547168404Spjd return (0); 1548168404Spjd } 1549168404Spjd 1550168404Spjd nvlist_free(label); 1551168404Spjd 1552209962Smm /* 1553219089Spjd * If this is a verbatim import, no need to check the 1554209962Smm * state of the pool. 1555209962Smm */ 1556219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && 1557219089Spjd spa_load_state(spa) == SPA_LOAD_OPEN && 1558168404Spjd state != POOL_STATE_ACTIVE) 1559249195Smm return (SET_ERROR(EBADF)); 1560185029Spjd 1561185029Spjd /* 1562185029Spjd * If we were able to open and validate a vdev that was 1563185029Spjd * previously marked permanently unavailable, clear that state 1564185029Spjd * now. 
1565185029Spjd */ 1566185029Spjd if (vd->vdev_not_present) 1567185029Spjd vd->vdev_not_present = 0; 1568168404Spjd } 1569168404Spjd 1570168404Spjd return (0); 1571168404Spjd} 1572168404Spjd 1573168404Spjd/* 1574168404Spjd * Close a virtual device. 1575168404Spjd */ 1576168404Spjdvoid 1577168404Spjdvdev_close(vdev_t *vd) 1578168404Spjd{ 1579209962Smm spa_t *spa = vd->vdev_spa; 1580219089Spjd vdev_t *pvd = vd->vdev_parent; 1581209962Smm 1582209962Smm ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1583209962Smm 1584219089Spjd /* 1585219089Spjd * If our parent is reopening, then we are as well, unless we are 1586219089Spjd * going offline. 1587219089Spjd */ 1588219089Spjd if (pvd != NULL && pvd->vdev_reopening) 1589219089Spjd vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); 1590219089Spjd 1591168404Spjd vd->vdev_ops->vdev_op_close(vd); 1592168404Spjd 1593185029Spjd vdev_cache_purge(vd); 1594168404Spjd 1595240868Spjd if (vd->vdev_ops->vdev_op_leaf) 1596240868Spjd trim_map_destroy(vd); 1597240868Spjd 1598168404Spjd /* 1599219089Spjd * We record the previous state before we close it, so that if we are 1600168404Spjd * doing a reopen(), we don't generate FMA ereports if we notice that 1601168404Spjd * it's still faulted. 
1602168404Spjd */ 1603168404Spjd vd->vdev_prevstate = vd->vdev_state; 1604168404Spjd 1605168404Spjd if (vd->vdev_offline) 1606168404Spjd vd->vdev_state = VDEV_STATE_OFFLINE; 1607168404Spjd else 1608168404Spjd vd->vdev_state = VDEV_STATE_CLOSED; 1609168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1610168404Spjd} 1611168404Spjd 1612168404Spjdvoid 1613219089Spjdvdev_hold(vdev_t *vd) 1614219089Spjd{ 1615219089Spjd spa_t *spa = vd->vdev_spa; 1616219089Spjd 1617219089Spjd ASSERT(spa_is_root(spa)); 1618219089Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) 1619219089Spjd return; 1620219089Spjd 1621219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1622219089Spjd vdev_hold(vd->vdev_child[c]); 1623219089Spjd 1624219089Spjd if (vd->vdev_ops->vdev_op_leaf) 1625219089Spjd vd->vdev_ops->vdev_op_hold(vd); 1626219089Spjd} 1627219089Spjd 1628219089Spjdvoid 1629219089Spjdvdev_rele(vdev_t *vd) 1630219089Spjd{ 1631219089Spjd spa_t *spa = vd->vdev_spa; 1632219089Spjd 1633219089Spjd ASSERT(spa_is_root(spa)); 1634219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1635219089Spjd vdev_rele(vd->vdev_child[c]); 1636219089Spjd 1637219089Spjd if (vd->vdev_ops->vdev_op_leaf) 1638219089Spjd vd->vdev_ops->vdev_op_rele(vd); 1639219089Spjd} 1640219089Spjd 1641219089Spjd/* 1642219089Spjd * Reopen all interior vdevs and any unopened leaves. We don't actually 1643219089Spjd * reopen leaf vdevs which had previously been opened as they might deadlock 1644219089Spjd * on the spa_config_lock. Instead we only obtain the leaf's physical size. 1645219089Spjd * If the leaf has never been opened then open it, as usual. 
 */
void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/* set the reopening flag unless we're taking the vdev offline */
	vd->vdev_reopening = !vd->vdev_offline;
	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		(void) vdev_validate_aux(vd);
		/*
		 * An aux vdev that validates, is fully usable, and belongs
		 * to the L2ARC list but is not yet known to the L2ARC gets
		 * (re-)registered here.
		 */
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    vd->vdev_aux == &spa->spa_l2cache &&
		    !l2arc_vdev_present(vd))
			l2arc_add_vdev(spa, vd);
	} else {
		(void) vdev_validate(vd, B_TRUE);
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

/*
 * Open a newly-created vdev and initialize its on-disk state (DTLs and
 * labels).  Unlike a normal open, any component that fails to open causes
 * the whole create to fail.  Returns 0 on success or an errno on failure;
 * the vdev is left closed on failure.
 */
int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively load DTLs and initialize all labels.
	 */
	if ((error = vdev_dtl_load(vd)) != 0 ||
	    (error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * Pick the metaslab shift for this vdev from its usable size.
 */
void
vdev_metaslab_set_size(vdev_t *vd)
{
	/*
	 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
	/* never make metaslabs smaller than one maximum-sized block */
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
}

/*
 * Maximize performance by inflating the configured ashift for top level
 * vdevs to be as close to the physical ashift as possible while maintaining
 * administrator defined limits and ensuring it doesn't go below the
 * logical ashift.
 */
void
vdev_ashift_optimize(vdev_t *vd)
{
	if (vd == vd->vdev_top) {
		if (vd->vdev_ashift < vd->vdev_physical_ashift) {
			vd->vdev_ashift = MIN(
			    MAX(zfs_max_auto_ashift, vd->vdev_ashift),
			    MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
		} else {
			/*
			 * Unusual case where logical ashift > physical ashift
			 * so we can't cap the calculated ashift based on max
			 * ashift as that would cause failures.
			 * We still check if we need to increase it to match
			 * the min ashift.
			 */
			vd->vdev_ashift = MAX(zfs_min_auto_ashift,
			    vd->vdev_ashift);
		}
	}
}

/*
 * Queue the given top-level vdev (and optionally one of its metaslabs or
 * DTL-dirty children, passed via 'arg') on the per-txg dirty lists so it
 * is written out during vdev_sync() for 'txg'.
 */
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(!vd->vdev_ishole);
	ASSERT(ISP2(flags));
	ASSERT(spa_writeable(vd->vdev_spa));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

/*
 * Recursively dirty every leaf under 'vd' (each leaf is dirtied against
 * its own top-level vdev).
 */
void
vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);

	if (vd->vdev_ops->vdev_op_leaf)
		vdev_dirty(vd->vdev_top, flags, vd, txg);
}

/*
 * DTLs.
 *
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication.  There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in more than 'maxfaults' children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
 */
/*
 * Add the range [txg, txg + size) to DTL 't' of this leaf vdev,
 * unless it is already fully contained.
 */
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	range_tree_t *rt = vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
	ASSERT(spa_writeable(vd->vdev_spa));

	mutex_enter(rt->rt_lock);
	if (!range_tree_contains(rt, txg, size))
		range_tree_add(rt, txg, size);
	mutex_exit(rt->rt_lock);
}

/*
 * Returns B_TRUE if DTL 't' is non-empty and fully contains
 * the range [txg, txg + size).
 */
boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	mutex_enter(rt->rt_lock);
	if (range_tree_space(rt) != 0)
		dirty = range_tree_contains(rt, txg, size);
	mutex_exit(rt->rt_lock);

	return (dirty);
}

/*
 * Returns B_TRUE if DTL 't' has no entries.
 */
boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(rt->rt_lock);
	empty = (range_tree_space(rt) == 0);
	mutex_exit(rt->rt_lock);

	return (empty);
}

/*
 * Returns the lowest txg in the DTL range.
 */
static uint64_t
vdev_dtl_min(vdev_t *vd)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	/*
	 * NOTE(review): the -1 suggests DTL segments are stored offset by
	 * one from the txg they describe — confirm against the callers of
	 * vdev_dtl_dirty() before relying on the exact boundary.
	 */
	return (rs->rs_start - 1);
}

/*
 * Returns the highest txg in the DTL.
 */
static uint64_t
vdev_dtl_max(vdev_t *vd)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	return (rs->rs_end);
}

/*
 * Determine if a resilvering vdev should remove any DTL entries from
 * its range. If the vdev was resilvering for the entire duration of the
 * scan then it should excise that range from its DTLs. Otherwise, this
 * vdev is considered partially resilvered and should leave its DTL
 * entries intact. The comment in vdev_dtl_reassess() describes how we
 * excise the DTLs.
 */
static boolean_t
vdev_dtl_should_excise(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

	ASSERT0(scn->scn_phys.scn_errors);
	ASSERT0(vd->vdev_children);

	/* a vdev that isn't at least DEGRADED was not fully scanned */
	if (vd->vdev_state < VDEV_STATE_DEGRADED)
		return (B_FALSE);

	if (vd->vdev_resilver_txg == 0 ||
	    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
		return (B_TRUE);

	/*
	 * When a resilver is initiated the scan will assign the scn_max_txg
	 * value to the highest txg value that exists in all DTLs. If this
	 * device's max DTL is not part of this scan (i.e. it is not in
	 * the range (scn_min_txg, scn_max_txg] then it is not eligible
	 * for excision.
	 */
	if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
		ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
		ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
		ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	/* depth-first: children are reassessed before their parent */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dtl_reassess(vd->vdev_child[c], txg,
		    scrub_txg, scrub_done);

	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

		mutex_enter(&vd->vdev_dtl_lock);

		/*
		 * If we've completed a scan cleanly then determine
		 * if this vdev should remove any DTLs. We only want to
		 * excise regions on vdevs that were available during
		 * the entire duration of this scan.
		 */
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started ||
		    (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
		    vdev_dtl_should_excise(vd)) {
			/*
			 * We completed a scrub up to scrub_txg.  If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl.  Otherwise, leave the
			 * dtl as-is if there was an error.
			 *
			 * There's a little trick here: to excise the
			 * beginning of the DTL_MISSING map, we put it into
			 * a reference tree and then add a segment with
			 * refcnt -1 that covers the range [0, scrub_txg).
			 * This means that each txg in that range has
			 * refcnt -1 or 0.
			 * We then add DTL_SCRUB with a refcnt of 2, so that
			 * entries in the range [0, scrub_txg) will have a
			 * positive refcnt -- either 1 or 2.  We then convert
			 * the reference tree into the new DTL_MISSING map.
			 */
			space_reftree_create(&reftree);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_SCRUB], 2);
			space_reftree_generate_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_destroy(&reftree);
		}
		/* DTL_PARTIAL of a leaf is identical to its DTL_MISSING */
		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
		if (scrub_done)
			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		/*
		 * Recompute DTL_OUTAGE: an unreadable leaf is out for all
		 * txgs, otherwise it mirrors DTL_MISSING.
		 */
		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
		if (!vdev_readable(vd))
			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
		else
			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);

		/*
		 * If the vdev was resilvering and no longer has any
		 * DTLs then reset its resilvering flag and dirty
		 * the top level so that we persist the change.
		 */
		if (vd->vdev_resilver_txg != 0 &&
		    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
		    range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
			vd->vdev_resilver_txg = 0;
			vdev_config_dirty(vd->vdev_top);
		}

		mutex_exit(&vd->vdev_dtl_lock);

		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	/*
	 * Interior vdev: rebuild each DTL from the children's maps using a
	 * reference tree, keeping only txgs referenced by >= minref children.
	 */
	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		/* account for child's outage in parent's missing map */
		int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
		if (t == DTL_SCRUB)
			continue;			/* leaf vdevs only */
		if (t == DTL_PARTIAL)
			minref = 1;			/* i.e. non-zero */
		else if (vd->vdev_nparity != 0)
			minref = vd->vdev_nparity + 1;	/* RAID-Z */
		else
			minref = vd->vdev_children;	/* any kind of mirror */
		space_reftree_create(&reftree);
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			mutex_enter(&cvd->vdev_dtl_lock);
			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
			mutex_exit(&cvd->vdev_dtl_lock);
		}
		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
		space_reftree_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
}

/*
 * Load the DTL_MISSING maps for every leaf under 'vd' from their on-disk
 * space maps.  Returns 0 on success or the first error encountered.
 */
int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	int error = 0;

	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
		ASSERT(!vd->vdev_ishole);

		error = space_map_open(&vd->vdev_dtl_sm, mos,
		    vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
		if (error)
			return (error);
		ASSERT(vd->vdev_dtl_sm != NULL);

		mutex_enter(&vd->vdev_dtl_lock);

		/*
		 * Now that we've opened the space_map we need to update
		 * the in-core DTL.
		 */
		space_map_update(vd->vdev_dtl_sm);

		error = space_map_load(vd->vdev_dtl_sm,
		    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
		mutex_exit(&vd->vdev_dtl_lock);

		return (error);
	}

	/* interior vdev: recurse, stopping at the first child error */
	for (int c = 0; c < vd->vdev_children; c++) {
		error = vdev_dtl_load(vd->vdev_child[c]);
		if (error != 0)
			break;
	}

	return (error);
}

/*
 * Write this leaf vdev's in-core DTL_MISSING map to its on-disk space map
 * for 'txg', allocating the space map on first use and freeing it when the
 * vdev is being detached or its top level removed.
 */
void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
	objset_t *mos = spa->spa_meta_objset;
	range_tree_t *rtsync;
	kmutex_t rtlock;
	dmu_tx_t *tx;
	uint64_t object = space_map_object(vd->vdev_dtl_sm);

	ASSERT(!vd->vdev_ishole);
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	/* going away: free the on-disk DTL instead of writing it */
	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_free(vd->vdev_dtl_sm, tx);
		space_map_close(vd->vdev_dtl_sm);
		vd->vdev_dtl_sm = NULL;
		mutex_exit(&vd->vdev_dtl_lock);

		dmu_tx_commit(tx);
		return;
	}

	if (vd->vdev_dtl_sm == NULL) {
		uint64_t new_object;

		new_object = space_map_alloc(mos, tx);
		VERIFY3U(new_object, !=, 0);

		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
		    0, -1ULL, 0, &vd->vdev_dtl_lock));
		ASSERT(vd->vdev_dtl_sm != NULL);
	}

	bzero(&rtlock, sizeof(rtlock));
	mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Snapshot the DTL into a private tree so the write happens
	 * without holding vdev_dtl_lock.
	 */
	rtsync = range_tree_create(NULL, NULL, &rtlock);

	mutex_enter(&rtlock);

	mutex_enter(&vd->vdev_dtl_lock);
	range_tree_walk(rt, range_tree_add, rtsync);
	mutex_exit(&vd->vdev_dtl_lock);

	/* rewrite the space map from scratch with the snapshot */
	space_map_truncate(vd->vdev_dtl_sm, tx);
	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
	range_tree_vacate(rtsync, NULL, NULL);

	range_tree_destroy(rtsync);

	mutex_exit(&rtlock);
	mutex_destroy(&rtlock);

	/*
	 * If the object for the space map has changed then dirty
	 * the top level so that we update the config.
	 */
	if (object != space_map_object(vd->vdev_dtl_sm)) {
		zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
		    "new object %llu", txg, spa_name(spa), object,
		    space_map_object(vd->vdev_dtl_sm));
		vdev_config_dirty(vd->vdev_top);
	}

	dmu_tx_commit(tx);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_update(vd->vdev_dtl_sm);
	mutex_exit(&vd->vdev_dtl_lock);
}

/*
 * Determine whether the specified vdev can be offlined/detached/removed
 * without losing data.
 */
boolean_t
vdev_dtl_required(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *tvd = vd->vdev_top;
	uint8_t cant_read = vd->vdev_cant_read;
	boolean_t required;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == spa->spa_root_vdev || vd == tvd)
		return (B_TRUE);

	/*
	 * Temporarily mark the device as unreadable, and then determine
	 * whether this results in any DTL outages in the top-level vdev.
	 * If not, we can safely offline/detach/remove the device.
	 */
	vd->vdev_cant_read = B_TRUE;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
	/* restore the original flag and recompute the DTLs */
	vd->vdev_cant_read = cant_read;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);

	if (!required && zio_injection_enabled)
		required = !!zio_handle_device_injection(vd, NULL, ECHILD);

	return (required);
}

/*
 * Determine if resilver is needed, and if so the txg range.
 */
boolean_t
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
{
	boolean_t needed = B_FALSE;
	uint64_t thismin = UINT64_MAX;
	uint64_t thismax = 0;

	if (vd->vdev_children == 0) {
		/* leaf: resilver iff it is writeable and missing txgs */
		mutex_enter(&vd->vdev_dtl_lock);
		if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
		    vdev_writeable(vd)) {

			thismin = vdev_dtl_min(vd);
			thismax = vdev_dtl_max(vd);
			needed = B_TRUE;
		}
		mutex_exit(&vd->vdev_dtl_lock);
	} else {
		/* interior: union of all children's resilver ranges */
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			uint64_t cmin, cmax;

			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
				thismin = MIN(thismin, cmin);
				thismax = MAX(thismax, cmax);
				needed = B_TRUE;
			}
		}
	}

	if (needed && minp) {
		*minp = thismin;
		*maxp = thismax;
	}
	return (needed);
}

/*
 * Recursively load per-vdev state: metaslabs for top-level vdevs and
 * DTLs for leaves.  Failures mark the vdev CANT_OPEN/CORRUPT_DATA rather
 * than returning an error.
 */
void
vdev_load(vdev_t *vd)
{
	/*
	 * Recursively load all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_load(vd->vdev_child[c]);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top && !vd->vdev_ishole &&
	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	    vdev_metaslab_init(vd, 0) != 0))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
}

/*
 * The special vdev case is used for hot spares and l2cache devices.  Its
 * sole purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
2268168404Spjd */ 2269168404Spjdint 2270185029Spjdvdev_validate_aux(vdev_t *vd) 2271168404Spjd{ 2272168404Spjd nvlist_t *label; 2273168404Spjd uint64_t guid, version; 2274168404Spjd uint64_t state; 2275168404Spjd 2276185029Spjd if (!vdev_readable(vd)) 2277185029Spjd return (0); 2278185029Spjd 2279239620Smm if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { 2280168404Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2281168404Spjd VDEV_AUX_CORRUPT_DATA); 2282168404Spjd return (-1); 2283168404Spjd } 2284168404Spjd 2285168404Spjd if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 2286236884Smm !SPA_VERSION_IS_SUPPORTED(version) || 2287168404Spjd nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 2288168404Spjd guid != vd->vdev_guid || 2289168404Spjd nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 2290168404Spjd vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2291168404Spjd VDEV_AUX_CORRUPT_DATA); 2292168404Spjd nvlist_free(label); 2293168404Spjd return (-1); 2294168404Spjd } 2295168404Spjd 2296168404Spjd /* 2297168404Spjd * We don't actually check the pool state here. If it's in fact in 2298168404Spjd * use by another pool, we update this fact on the fly when requested. 
2299168404Spjd */ 2300168404Spjd nvlist_free(label); 2301168404Spjd return (0); 2302168404Spjd} 2303168404Spjd 2304168404Spjdvoid 2305219089Spjdvdev_remove(vdev_t *vd, uint64_t txg) 2306219089Spjd{ 2307219089Spjd spa_t *spa = vd->vdev_spa; 2308219089Spjd objset_t *mos = spa->spa_meta_objset; 2309219089Spjd dmu_tx_t *tx; 2310219089Spjd 2311219089Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2312219089Spjd 2313219089Spjd if (vd->vdev_ms != NULL) { 2314269773Sdelphij metaslab_group_t *mg = vd->vdev_mg; 2315269773Sdelphij 2316269773Sdelphij metaslab_group_histogram_verify(mg); 2317269773Sdelphij metaslab_class_histogram_verify(mg->mg_class); 2318269773Sdelphij 2319219089Spjd for (int m = 0; m < vd->vdev_ms_count; m++) { 2320219089Spjd metaslab_t *msp = vd->vdev_ms[m]; 2321219089Spjd 2322262093Savg if (msp == NULL || msp->ms_sm == NULL) 2323219089Spjd continue; 2324219089Spjd 2325262093Savg mutex_enter(&msp->ms_lock); 2326269773Sdelphij /* 2327269773Sdelphij * If the metaslab was not loaded when the vdev 2328269773Sdelphij * was removed then the histogram accounting may 2329269773Sdelphij * not be accurate. Update the histogram information 2330269773Sdelphij * here so that we ensure that the metaslab group 2331269773Sdelphij * and metaslab class are up-to-date. 
2332269773Sdelphij */ 2333269773Sdelphij metaslab_group_histogram_remove(mg, msp); 2334269773Sdelphij 2335262093Savg VERIFY0(space_map_allocated(msp->ms_sm)); 2336262093Savg space_map_free(msp->ms_sm, tx); 2337262093Savg space_map_close(msp->ms_sm); 2338262093Savg msp->ms_sm = NULL; 2339262093Savg mutex_exit(&msp->ms_lock); 2340219089Spjd } 2341269773Sdelphij 2342269773Sdelphij metaslab_group_histogram_verify(mg); 2343269773Sdelphij metaslab_class_histogram_verify(mg->mg_class); 2344269773Sdelphij for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 2345269773Sdelphij ASSERT0(mg->mg_histogram[i]); 2346269773Sdelphij 2347219089Spjd } 2348219089Spjd 2349219089Spjd if (vd->vdev_ms_array) { 2350219089Spjd (void) dmu_object_free(mos, vd->vdev_ms_array, tx); 2351219089Spjd vd->vdev_ms_array = 0; 2352219089Spjd } 2353219089Spjd dmu_tx_commit(tx); 2354219089Spjd} 2355219089Spjd 2356219089Spjdvoid 2357168404Spjdvdev_sync_done(vdev_t *vd, uint64_t txg) 2358168404Spjd{ 2359168404Spjd metaslab_t *msp; 2360211931Smm boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 2361168404Spjd 2362219089Spjd ASSERT(!vd->vdev_ishole); 2363219089Spjd 2364168404Spjd while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 2365168404Spjd metaslab_sync_done(msp, txg); 2366211931Smm 2367211931Smm if (reassess) 2368211931Smm metaslab_sync_reassess(vd->vdev_mg); 2369168404Spjd} 2370168404Spjd 2371168404Spjdvoid 2372168404Spjdvdev_sync(vdev_t *vd, uint64_t txg) 2373168404Spjd{ 2374168404Spjd spa_t *spa = vd->vdev_spa; 2375168404Spjd vdev_t *lvd; 2376168404Spjd metaslab_t *msp; 2377168404Spjd dmu_tx_t *tx; 2378168404Spjd 2379219089Spjd ASSERT(!vd->vdev_ishole); 2380219089Spjd 2381168404Spjd if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { 2382168404Spjd ASSERT(vd == vd->vdev_top); 2383168404Spjd tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2384168404Spjd vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 2385168404Spjd DMU_OT_OBJECT_ARRAY, 0, 
DMU_OT_NONE, 0, tx); 2386168404Spjd ASSERT(vd->vdev_ms_array != 0); 2387168404Spjd vdev_config_dirty(vd); 2388168404Spjd dmu_tx_commit(tx); 2389168404Spjd } 2390168404Spjd 2391219089Spjd /* 2392219089Spjd * Remove the metadata associated with this vdev once it's empty. 2393219089Spjd */ 2394219089Spjd if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) 2395219089Spjd vdev_remove(vd, txg); 2396219089Spjd 2397168404Spjd while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 2398168404Spjd metaslab_sync(msp, txg); 2399168404Spjd (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 2400168404Spjd } 2401168404Spjd 2402168404Spjd while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 2403168404Spjd vdev_dtl_sync(lvd, txg); 2404168404Spjd 2405168404Spjd (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 2406168404Spjd} 2407168404Spjd 2408168404Spjduint64_t 2409168404Spjdvdev_psize_to_asize(vdev_t *vd, uint64_t psize) 2410168404Spjd{ 2411168404Spjd return (vd->vdev_ops->vdev_op_asize(vd, psize)); 2412168404Spjd} 2413168404Spjd 2414185029Spjd/* 2415185029Spjd * Mark the given vdev faulted. A faulted vdev behaves as if the device could 2416185029Spjd * not be opened, and no I/O is attempted. 
 */
int
vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
	vdev_t *vd, *tvd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	/* faulting only makes sense for leaf (physical) vdevs */
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;

	/*
	 * We don't directly use the aux state here, but if we do a
	 * vdev_reopen(), we need this value to be present to remember why we
	 * were faulted.
	 */
	vd->vdev_label_aux = aux;

	/*
	 * Faulted state takes precedence over degraded.
	 */
	vd->vdev_delayed_close = B_FALSE;
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);

	/*
	 * If this device has the only valid copy of the data, then
	 * back off and simply mark the vdev as degraded instead.
	 */
	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
		vd->vdev_degraded = 1ULL;
		vd->vdev_faulted = 0ULL;

		/*
		 * If we reopen the device and it's not dead, only then do we
		 * mark it degraded.
		 */
		vdev_reopen(tvd);

		if (vdev_readable(vd))
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
	}

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
 * user that something is wrong.  The vdev continues to operate as normal as far
 * as I/O is concerned.
 */
int
vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
	vdev_t *vd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	/*
	 * If the vdev is already faulted, then don't do anything.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded)
		return (spa_vdev_state_exit(spa, NULL, 0));

	vd->vdev_degraded = 1ULL;
	if (!vdev_is_dead(vd))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
		    aux);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Online the given vdev.
 *
 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
 * spare device should be detached when the device finishes resilvering.
 * Second, the online should be treated like a 'test' online case, so no FMA
 * events are generated if the device fails to open.
2508185029Spjd */ 2509168404Spjdint 2510185029Spjdvdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) 2511168404Spjd{ 2512219089Spjd vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; 2513290745Smav boolean_t postevent = B_FALSE; 2514168404Spjd 2515219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 2516168404Spjd 2517185029Spjd if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2518185029Spjd return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2519168404Spjd 2520168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 2521185029Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2522168404Spjd 2523290745Smav postevent = 2524290745Smav (vd->vdev_offline == B_TRUE || vd->vdev_tmpoffline == B_TRUE) ? 2525290745Smav B_TRUE : B_FALSE; 2526290745Smav 2527219089Spjd tvd = vd->vdev_top; 2528168404Spjd vd->vdev_offline = B_FALSE; 2529168404Spjd vd->vdev_tmpoffline = B_FALSE; 2530185029Spjd vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 2531185029Spjd vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 2532219089Spjd 2533219089Spjd /* XXX - L2ARC 1.0 does not support expansion */ 2534219089Spjd if (!vd->vdev_aux) { 2535219089Spjd for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2536219089Spjd pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); 2537219089Spjd } 2538219089Spjd 2539219089Spjd vdev_reopen(tvd); 2540185029Spjd vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 2541168404Spjd 2542219089Spjd if (!vd->vdev_aux) { 2543219089Spjd for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2544219089Spjd pvd->vdev_expanding = B_FALSE; 2545219089Spjd } 2546219089Spjd 2547185029Spjd if (newstate) 2548185029Spjd *newstate = vd->vdev_state; 2549185029Spjd if ((flags & ZFS_ONLINE_UNSPARE) && 2550185029Spjd !vdev_is_dead(vd) && vd->vdev_parent && 2551185029Spjd vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2552185029Spjd vd->vdev_parent->vdev_child[0] == vd) 2553185029Spjd vd->vdev_unspare = B_TRUE; 2554168404Spjd 2555219089Spjd if ((flags & 
ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { 2556219089Spjd 2557219089Spjd /* XXX - L2ARC 1.0 does not support expansion */ 2558219089Spjd if (vd->vdev_aux) 2559219089Spjd return (spa_vdev_state_exit(spa, vd, ENOTSUP)); 2560219089Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2561219089Spjd } 2562290745Smav 2563290745Smav if (postevent) 2564290745Smav spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE); 2565290745Smav 2566209962Smm return (spa_vdev_state_exit(spa, vd, 0)); 2567168404Spjd} 2568168404Spjd 2569219089Spjdstatic int 2570219089Spjdvdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) 2571168404Spjd{ 2572213197Smm vdev_t *vd, *tvd; 2573219089Spjd int error = 0; 2574219089Spjd uint64_t generation; 2575219089Spjd metaslab_group_t *mg; 2576168404Spjd 2577219089Spjdtop: 2578219089Spjd spa_vdev_state_enter(spa, SCL_ALLOC); 2579168404Spjd 2580185029Spjd if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2581185029Spjd return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2582168404Spjd 2583168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 2584185029Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2585168404Spjd 2586213197Smm tvd = vd->vdev_top; 2587219089Spjd mg = tvd->vdev_mg; 2588219089Spjd generation = spa->spa_config_generation + 1; 2589213197Smm 2590168404Spjd /* 2591168404Spjd * If the device isn't already offline, try to offline it. 2592168404Spjd */ 2593168404Spjd if (!vd->vdev_offline) { 2594168404Spjd /* 2595209962Smm * If this device has the only valid copy of some data, 2596213197Smm * don't allow it to be offlined. Log devices are always 2597213197Smm * expendable. 2598168404Spjd */ 2599213197Smm if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2600213197Smm vdev_dtl_required(vd)) 2601185029Spjd return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2602168404Spjd 2603168404Spjd /* 2604219089Spjd * If the top-level is a slog and it has had allocations 2605219089Spjd * then proceed. 
We check that the vdev's metaslab group 2606219089Spjd * is not NULL since it's possible that we may have just 2607219089Spjd * added this vdev but not yet initialized its metaslabs. 2608219089Spjd */ 2609219089Spjd if (tvd->vdev_islog && mg != NULL) { 2610219089Spjd /* 2611219089Spjd * Prevent any future allocations. 2612219089Spjd */ 2613219089Spjd metaslab_group_passivate(mg); 2614219089Spjd (void) spa_vdev_state_exit(spa, vd, 0); 2615219089Spjd 2616219089Spjd error = spa_offline_log(spa); 2617219089Spjd 2618219089Spjd spa_vdev_state_enter(spa, SCL_ALLOC); 2619219089Spjd 2620219089Spjd /* 2621219089Spjd * Check to see if the config has changed. 2622219089Spjd */ 2623219089Spjd if (error || generation != spa->spa_config_generation) { 2624219089Spjd metaslab_group_activate(mg); 2625219089Spjd if (error) 2626219089Spjd return (spa_vdev_state_exit(spa, 2627219089Spjd vd, error)); 2628219089Spjd (void) spa_vdev_state_exit(spa, vd, 0); 2629219089Spjd goto top; 2630219089Spjd } 2631240415Smm ASSERT0(tvd->vdev_stat.vs_alloc); 2632219089Spjd } 2633219089Spjd 2634219089Spjd /* 2635168404Spjd * Offline this device and reopen its top-level vdev. 2636213197Smm * If the top-level vdev is a log device then just offline 2637213197Smm * it. Otherwise, if this action results in the top-level 2638213197Smm * vdev becoming unusable, undo it and fail the request. 2639168404Spjd */ 2640168404Spjd vd->vdev_offline = B_TRUE; 2641213197Smm vdev_reopen(tvd); 2642213197Smm 2643213197Smm if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2644213197Smm vdev_is_dead(tvd)) { 2645168404Spjd vd->vdev_offline = B_FALSE; 2646213197Smm vdev_reopen(tvd); 2647185029Spjd return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2648168404Spjd } 2649219089Spjd 2650219089Spjd /* 2651219089Spjd * Add the device back into the metaslab rotor so that 2652219089Spjd * once we online the device it's open for business. 
2653219089Spjd */ 2654219089Spjd if (tvd->vdev_islog && mg != NULL) 2655219089Spjd metaslab_group_activate(mg); 2656168404Spjd } 2657168404Spjd 2658185029Spjd vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 2659168404Spjd 2660219089Spjd return (spa_vdev_state_exit(spa, vd, 0)); 2661219089Spjd} 2662213197Smm 2663219089Spjdint 2664219089Spjdvdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 2665219089Spjd{ 2666219089Spjd int error; 2667213197Smm 2668219089Spjd mutex_enter(&spa->spa_vdev_top_lock); 2669219089Spjd error = vdev_offline_locked(spa, guid, flags); 2670219089Spjd mutex_exit(&spa->spa_vdev_top_lock); 2671219089Spjd 2672219089Spjd return (error); 2673168404Spjd} 2674168404Spjd 2675168404Spjd/* 2676168404Spjd * Clear the error counts associated with this vdev. Unlike vdev_online() and 2677168404Spjd * vdev_offline(), we assume the spa config is locked. We also clear all 2678168404Spjd * children. If 'vd' is NULL, then the user wants to clear all vdevs. 2679168404Spjd */ 2680168404Spjdvoid 2681168404Spjdvdev_clear(spa_t *spa, vdev_t *vd) 2682168404Spjd{ 2683185029Spjd vdev_t *rvd = spa->spa_root_vdev; 2684168404Spjd 2685185029Spjd ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2686185029Spjd 2687168404Spjd if (vd == NULL) 2688185029Spjd vd = rvd; 2689168404Spjd 2690168404Spjd vd->vdev_stat.vs_read_errors = 0; 2691168404Spjd vd->vdev_stat.vs_write_errors = 0; 2692168404Spjd vd->vdev_stat.vs_checksum_errors = 0; 2693168404Spjd 2694185029Spjd for (int c = 0; c < vd->vdev_children; c++) 2695168404Spjd vdev_clear(spa, vd->vdev_child[c]); 2696185029Spjd 2697253991Smav if (vd == rvd) { 2698253991Smav for (int c = 0; c < spa->spa_l2cache.sav_count; c++) 2699253991Smav vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); 2700253991Smav 2701253991Smav for (int c = 0; c < spa->spa_spares.sav_count; c++) 2702253991Smav vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); 2703253991Smav } 2704253991Smav 2705185029Spjd /* 2706185029Spjd * If 
we're in the FAULTED state or have experienced failed I/O, then 2707185029Spjd * clear the persistent state and attempt to reopen the device. We 2708185029Spjd * also mark the vdev config dirty, so that the new faulted state is 2709185029Spjd * written out to disk. 2710185029Spjd */ 2711185029Spjd if (vd->vdev_faulted || vd->vdev_degraded || 2712185029Spjd !vdev_readable(vd) || !vdev_writeable(vd)) { 2713185029Spjd 2714219089Spjd /* 2715219089Spjd * When reopening in reponse to a clear event, it may be due to 2716219089Spjd * a fmadm repair request. In this case, if the device is 2717219089Spjd * still broken, we want to still post the ereport again. 2718219089Spjd */ 2719219089Spjd vd->vdev_forcefault = B_TRUE; 2720219089Spjd 2721219089Spjd vd->vdev_faulted = vd->vdev_degraded = 0ULL; 2722185029Spjd vd->vdev_cant_read = B_FALSE; 2723185029Spjd vd->vdev_cant_write = B_FALSE; 2724185029Spjd 2725219089Spjd vdev_reopen(vd == rvd ? rvd : vd->vdev_top); 2726185029Spjd 2727219089Spjd vd->vdev_forcefault = B_FALSE; 2728219089Spjd 2729219089Spjd if (vd != rvd && vdev_writeable(vd->vdev_top)) 2730185029Spjd vdev_state_dirty(vd->vdev_top); 2731185029Spjd 2732185029Spjd if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) 2733185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 2734185029Spjd 2735185029Spjd spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); 2736185029Spjd } 2737219089Spjd 2738219089Spjd /* 2739219089Spjd * When clearing a FMA-diagnosed fault, we always want to 2740219089Spjd * unspare the device, as we assume that the original spare was 2741219089Spjd * done in response to the FMA fault. 
2742219089Spjd */ 2743219089Spjd if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && 2744219089Spjd vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2745219089Spjd vd->vdev_parent->vdev_child[0] == vd) 2746219089Spjd vd->vdev_unspare = B_TRUE; 2747168404Spjd} 2748168404Spjd 2749185029Spjdboolean_t 2750168404Spjdvdev_is_dead(vdev_t *vd) 2751168404Spjd{ 2752219089Spjd /* 2753219089Spjd * Holes and missing devices are always considered "dead". 2754219089Spjd * This simplifies the code since we don't have to check for 2755219089Spjd * these types of devices in the various code paths. 2756219089Spjd * Instead we rely on the fact that we skip over dead devices 2757219089Spjd * before issuing I/O to them. 2758219089Spjd */ 2759219089Spjd return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || 2760219089Spjd vd->vdev_ops == &vdev_missing_ops); 2761168404Spjd} 2762168404Spjd 2763185029Spjdboolean_t 2764185029Spjdvdev_readable(vdev_t *vd) 2765168404Spjd{ 2766185029Spjd return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 2767185029Spjd} 2768168404Spjd 2769185029Spjdboolean_t 2770185029Spjdvdev_writeable(vdev_t *vd) 2771185029Spjd{ 2772185029Spjd return (!vdev_is_dead(vd) && !vd->vdev_cant_write); 2773185029Spjd} 2774168404Spjd 2775185029Spjdboolean_t 2776208370Smmvdev_allocatable(vdev_t *vd) 2777208370Smm{ 2778209962Smm uint64_t state = vd->vdev_state; 2779209962Smm 2780208370Smm /* 2781209962Smm * We currently allow allocations from vdevs which may be in the 2782208370Smm * process of reopening (i.e. VDEV_STATE_CLOSED). If the device 2783208370Smm * fails to reopen then we'll catch it later when we're holding 2784209962Smm * the proper locks. Note that we have to get the vdev state 2785209962Smm * in a local variable because although it changes atomically, 2786209962Smm * we're asking two separate questions about it. 
2787208370Smm */ 2788209962Smm return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 2789307279Smav !vd->vdev_cant_write && !vd->vdev_ishole && 2790307279Smav vd->vdev_mg->mg_initialized); 2791208370Smm} 2792208370Smm 2793208370Smmboolean_t 2794185029Spjdvdev_accessible(vdev_t *vd, zio_t *zio) 2795185029Spjd{ 2796185029Spjd ASSERT(zio->io_vd == vd); 2797168404Spjd 2798185029Spjd if (vdev_is_dead(vd) || vd->vdev_remove_wanted) 2799185029Spjd return (B_FALSE); 2800168404Spjd 2801185029Spjd if (zio->io_type == ZIO_TYPE_READ) 2802185029Spjd return (!vd->vdev_cant_read); 2803168404Spjd 2804185029Spjd if (zio->io_type == ZIO_TYPE_WRITE) 2805185029Spjd return (!vd->vdev_cant_write); 2806168404Spjd 2807185029Spjd return (B_TRUE); 2808168404Spjd} 2809168404Spjd 2810168404Spjd/* 2811168404Spjd * Get statistics for the given vdev. 2812168404Spjd */ 2813168404Spjdvoid 2814168404Spjdvdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 2815168404Spjd{ 2816269773Sdelphij spa_t *spa = vd->vdev_spa; 2817269773Sdelphij vdev_t *rvd = spa->spa_root_vdev; 2818307268Smav vdev_t *tvd = vd->vdev_top; 2819168404Spjd 2820269773Sdelphij ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 2821269773Sdelphij 2822168404Spjd mutex_enter(&vd->vdev_stat_lock); 2823168404Spjd bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 2824168404Spjd vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 2825168404Spjd vs->vs_state = vd->vdev_state; 2826219089Spjd vs->vs_rsize = vdev_get_min_asize(vd); 2827219089Spjd if (vd->vdev_ops->vdev_op_leaf) 2828219089Spjd vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 2829307268Smav /* 2830307268Smav * Report expandable space on top-level, non-auxillary devices only. 2831307268Smav * The expandable space is reported in terms of metaslab sized units 2832307268Smav * since that determines how much space the pool can expand. 
2833307268Smav */ 2834307268Smav if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) { 2835307268Smav vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize, 2836307268Smav 1ULL << tvd->vdev_ms_shift); 2837307268Smav } 2838254591Sgibbs vs->vs_configured_ashift = vd->vdev_top != NULL 2839254591Sgibbs ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; 2840254591Sgibbs vs->vs_logical_ashift = vd->vdev_logical_ashift; 2841254591Sgibbs vs->vs_physical_ashift = vd->vdev_physical_ashift; 2842270128Sdelphij if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { 2843269773Sdelphij vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; 2844270128Sdelphij } 2845168404Spjd 2846168404Spjd /* 2847168404Spjd * If we're getting stats on the root vdev, aggregate the I/O counts 2848168404Spjd * over all top-level vdevs (i.e. the direct children of the root). 2849168404Spjd */ 2850168404Spjd if (vd == rvd) { 2851185029Spjd for (int c = 0; c < rvd->vdev_children; c++) { 2852168404Spjd vdev_t *cvd = rvd->vdev_child[c]; 2853168404Spjd vdev_stat_t *cvs = &cvd->vdev_stat; 2854168404Spjd 2855185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 2856168404Spjd vs->vs_ops[t] += cvs->vs_ops[t]; 2857168404Spjd vs->vs_bytes[t] += cvs->vs_bytes[t]; 2858168404Spjd } 2859219089Spjd cvs->vs_scan_removing = cvd->vdev_removing; 2860168404Spjd } 2861168404Spjd } 2862269773Sdelphij mutex_exit(&vd->vdev_stat_lock); 2863168404Spjd} 2864168404Spjd 2865168404Spjdvoid 2866185029Spjdvdev_clear_stats(vdev_t *vd) 2867168404Spjd{ 2868185029Spjd mutex_enter(&vd->vdev_stat_lock); 2869185029Spjd vd->vdev_stat.vs_space = 0; 2870185029Spjd vd->vdev_stat.vs_dspace = 0; 2871185029Spjd vd->vdev_stat.vs_alloc = 0; 2872185029Spjd mutex_exit(&vd->vdev_stat_lock); 2873185029Spjd} 2874185029Spjd 2875185029Spjdvoid 2876219089Spjdvdev_scan_stat_init(vdev_t *vd) 2877219089Spjd{ 2878219089Spjd vdev_stat_t *vs = &vd->vdev_stat; 2879219089Spjd 2880219089Spjd for (int c = 0; c < vd->vdev_children; c++) 
2881219089Spjd vdev_scan_stat_init(vd->vdev_child[c]); 2882219089Spjd 2883219089Spjd mutex_enter(&vd->vdev_stat_lock); 2884219089Spjd vs->vs_scan_processed = 0; 2885219089Spjd mutex_exit(&vd->vdev_stat_lock); 2886219089Spjd} 2887219089Spjd 2888219089Spjdvoid 2889185029Spjdvdev_stat_update(zio_t *zio, uint64_t psize) 2890185029Spjd{ 2891209962Smm spa_t *spa = zio->io_spa; 2892209962Smm vdev_t *rvd = spa->spa_root_vdev; 2893185029Spjd vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; 2894168404Spjd vdev_t *pvd; 2895168404Spjd uint64_t txg = zio->io_txg; 2896168404Spjd vdev_stat_t *vs = &vd->vdev_stat; 2897168404Spjd zio_type_t type = zio->io_type; 2898168404Spjd int flags = zio->io_flags; 2899168404Spjd 2900185029Spjd /* 2901185029Spjd * If this i/o is a gang leader, it didn't do any actual work. 2902185029Spjd */ 2903185029Spjd if (zio->io_gang_tree) 2904185029Spjd return; 2905185029Spjd 2906168404Spjd if (zio->io_error == 0) { 2907185029Spjd /* 2908185029Spjd * If this is a root i/o, don't count it -- we've already 2909185029Spjd * counted the top-level vdevs, and vdev_get_stats() will 2910185029Spjd * aggregate them when asked. This reduces contention on 2911185029Spjd * the root vdev_stat_lock and implicitly handles blocks 2912185029Spjd * that compress away to holes, for which there is no i/o. 2913185029Spjd * (Holes never create vdev children, so all the counters 2914185029Spjd * remain zero, which is what we want.) 2915185029Spjd * 2916185029Spjd * Note: this only applies to successful i/o (io_error == 0) 2917185029Spjd * because unlike i/o counts, errors are not additive. 2918185029Spjd * When reading a ditto block, for example, failure of 2919185029Spjd * one top-level vdev does not imply a root-level error. 
2920185029Spjd */ 2921185029Spjd if (vd == rvd) 2922185029Spjd return; 2923185029Spjd 2924185029Spjd ASSERT(vd == zio->io_vd); 2925209962Smm 2926209962Smm if (flags & ZIO_FLAG_IO_BYPASS) 2927209962Smm return; 2928209962Smm 2929209962Smm mutex_enter(&vd->vdev_stat_lock); 2930209962Smm 2931185029Spjd if (flags & ZIO_FLAG_IO_REPAIR) { 2932219089Spjd if (flags & ZIO_FLAG_SCAN_THREAD) { 2933219089Spjd dsl_scan_phys_t *scn_phys = 2934219089Spjd &spa->spa_dsl_pool->dp_scan->scn_phys; 2935219089Spjd uint64_t *processed = &scn_phys->scn_processed; 2936219089Spjd 2937219089Spjd /* XXX cleanup? */ 2938219089Spjd if (vd->vdev_ops->vdev_op_leaf) 2939219089Spjd atomic_add_64(processed, psize); 2940219089Spjd vs->vs_scan_processed += psize; 2941219089Spjd } 2942219089Spjd 2943209962Smm if (flags & ZIO_FLAG_SELF_HEAL) 2944185029Spjd vs->vs_self_healed += psize; 2945168404Spjd } 2946209962Smm 2947209962Smm vs->vs_ops[type]++; 2948209962Smm vs->vs_bytes[type] += psize; 2949209962Smm 2950209962Smm mutex_exit(&vd->vdev_stat_lock); 2951168404Spjd return; 2952168404Spjd } 2953168404Spjd 2954168404Spjd if (flags & ZIO_FLAG_SPECULATIVE) 2955168404Spjd return; 2956168404Spjd 2957213198Smm /* 2958213198Smm * If this is an I/O error that is going to be retried, then ignore the 2959213198Smm * error. Otherwise, the user may interpret B_FAILFAST I/O errors as 2960213198Smm * hard errors, when in reality they can happen for any number of 2961213198Smm * innocuous reasons (bus resets, MPxIO link failure, etc). 2962213198Smm */ 2963213198Smm if (zio->io_error == EIO && 2964213198Smm !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 2965213198Smm return; 2966213198Smm 2967219089Spjd /* 2968219089Spjd * Intent logs writes won't propagate their error to the root 2969219089Spjd * I/O so don't mark these types of failures as pool-level 2970219089Spjd * errors. 
2971219089Spjd */ 2972219089Spjd if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 2973219089Spjd return; 2974219089Spjd 2975185029Spjd mutex_enter(&vd->vdev_stat_lock); 2976209962Smm if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { 2977185029Spjd if (zio->io_error == ECKSUM) 2978185029Spjd vs->vs_checksum_errors++; 2979185029Spjd else 2980185029Spjd vs->vs_read_errors++; 2981168404Spjd } 2982209962Smm if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) 2983185029Spjd vs->vs_write_errors++; 2984185029Spjd mutex_exit(&vd->vdev_stat_lock); 2985168404Spjd 2986209962Smm if (type == ZIO_TYPE_WRITE && txg != 0 && 2987209962Smm (!(flags & ZIO_FLAG_IO_REPAIR) || 2988219089Spjd (flags & ZIO_FLAG_SCAN_THREAD) || 2989219089Spjd spa->spa_claiming)) { 2990209962Smm /* 2991219089Spjd * This is either a normal write (not a repair), or it's 2992219089Spjd * a repair induced by the scrub thread, or it's a repair 2993219089Spjd * made by zil_claim() during spa_load() in the first txg. 2994219089Spjd * In the normal case, we commit the DTL change in the same 2995219089Spjd * txg as the block was born. In the scrub-induced repair 2996219089Spjd * case, we know that scrubs run in first-pass syncing context, 2997219089Spjd * so we commit the DTL change in spa_syncing_txg(spa). 2998219089Spjd * In the zil_claim() case, we commit in spa_first_txg(spa). 2999209962Smm * 3000209962Smm * We currently do not make DTL entries for failed spontaneous 3001209962Smm * self-healing writes triggered by normal (non-scrubbing) 3002209962Smm * reads, because we have no transactional context in which to 3003209962Smm * do so -- and it's not clear that it'd be desirable anyway. 
3004209962Smm */ 3005209962Smm if (vd->vdev_ops->vdev_op_leaf) { 3006209962Smm uint64_t commit_txg = txg; 3007219089Spjd if (flags & ZIO_FLAG_SCAN_THREAD) { 3008209962Smm ASSERT(flags & ZIO_FLAG_IO_REPAIR); 3009209962Smm ASSERT(spa_sync_pass(spa) == 1); 3010209962Smm vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); 3011219089Spjd commit_txg = spa_syncing_txg(spa); 3012219089Spjd } else if (spa->spa_claiming) { 3013219089Spjd ASSERT(flags & ZIO_FLAG_IO_REPAIR); 3014219089Spjd commit_txg = spa_first_txg(spa); 3015209962Smm } 3016219089Spjd ASSERT(commit_txg >= spa_syncing_txg(spa)); 3017209962Smm if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) 3018168404Spjd return; 3019209962Smm for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 3020209962Smm vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); 3021209962Smm vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); 3022168404Spjd } 3023209962Smm if (vd != rvd) 3024209962Smm vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); 3025168404Spjd } 3026168404Spjd} 3027168404Spjd 3028168404Spjd/* 3029219089Spjd * Update the in-core space usage stats for this vdev, its metaslab class, 3030219089Spjd * and the root vdev. 3031168404Spjd */ 3032168404Spjdvoid 3033219089Spjdvdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, 3034219089Spjd int64_t space_delta) 3035168404Spjd{ 3036168404Spjd int64_t dspace_delta = space_delta; 3037185029Spjd spa_t *spa = vd->vdev_spa; 3038185029Spjd vdev_t *rvd = spa->spa_root_vdev; 3039219089Spjd metaslab_group_t *mg = vd->vdev_mg; 3040219089Spjd metaslab_class_t *mc = mg ? mg->mg_class : NULL; 3041168404Spjd 3042185029Spjd ASSERT(vd == vd->vdev_top); 3043168404Spjd 3044185029Spjd /* 3045185029Spjd * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 3046185029Spjd * factor. We must calculate this here and not at the root vdev 3047185029Spjd * because the root vdev's psize-to-asize is simply the max of its 3048185029Spjd * childrens', thus not accurate enough for us. 
3049185029Spjd */ 3050185029Spjd ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 3051213197Smm ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); 3052185029Spjd dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 3053185029Spjd vd->vdev_deflate_ratio; 3054185029Spjd 3055185029Spjd mutex_enter(&vd->vdev_stat_lock); 3056219089Spjd vd->vdev_stat.vs_alloc += alloc_delta; 3057185029Spjd vd->vdev_stat.vs_space += space_delta; 3058185029Spjd vd->vdev_stat.vs_dspace += dspace_delta; 3059185029Spjd mutex_exit(&vd->vdev_stat_lock); 3060185029Spjd 3061219089Spjd if (mc == spa_normal_class(spa)) { 3062185029Spjd mutex_enter(&rvd->vdev_stat_lock); 3063219089Spjd rvd->vdev_stat.vs_alloc += alloc_delta; 3064185029Spjd rvd->vdev_stat.vs_space += space_delta; 3065185029Spjd rvd->vdev_stat.vs_dspace += dspace_delta; 3066185029Spjd mutex_exit(&rvd->vdev_stat_lock); 3067185029Spjd } 3068219089Spjd 3069219089Spjd if (mc != NULL) { 3070219089Spjd ASSERT(rvd == vd->vdev_parent); 3071219089Spjd ASSERT(vd->vdev_ms_count != 0); 3072219089Spjd 3073219089Spjd metaslab_class_space_update(mc, 3074219089Spjd alloc_delta, defer_delta, space_delta, dspace_delta); 3075219089Spjd } 3076168404Spjd} 3077168404Spjd 3078168404Spjd/* 3079168404Spjd * Mark a top-level vdev's config as dirty, placing it on the dirty list 3080168404Spjd * so that it will be written out next time the vdev configuration is synced. 3081168404Spjd * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 3082168404Spjd */ 3083168404Spjdvoid 3084168404Spjdvdev_config_dirty(vdev_t *vd) 3085168404Spjd{ 3086168404Spjd spa_t *spa = vd->vdev_spa; 3087168404Spjd vdev_t *rvd = spa->spa_root_vdev; 3088168404Spjd int c; 3089168404Spjd 3090219089Spjd ASSERT(spa_writeable(spa)); 3091219089Spjd 3092168404Spjd /* 3093209962Smm * If this is an aux vdev (as with l2cache and spare devices), then we 3094209962Smm * update the vdev config manually and set the sync flag. 
3095185029Spjd */ 3096185029Spjd if (vd->vdev_aux != NULL) { 3097185029Spjd spa_aux_vdev_t *sav = vd->vdev_aux; 3098185029Spjd nvlist_t **aux; 3099185029Spjd uint_t naux; 3100185029Spjd 3101185029Spjd for (c = 0; c < sav->sav_count; c++) { 3102185029Spjd if (sav->sav_vdevs[c] == vd) 3103185029Spjd break; 3104185029Spjd } 3105185029Spjd 3106185029Spjd if (c == sav->sav_count) { 3107185029Spjd /* 3108185029Spjd * We're being removed. There's nothing more to do. 3109185029Spjd */ 3110185029Spjd ASSERT(sav->sav_sync == B_TRUE); 3111185029Spjd return; 3112185029Spjd } 3113185029Spjd 3114185029Spjd sav->sav_sync = B_TRUE; 3115185029Spjd 3116209962Smm if (nvlist_lookup_nvlist_array(sav->sav_config, 3117209962Smm ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { 3118209962Smm VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 3119209962Smm ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); 3120209962Smm } 3121185029Spjd 3122185029Spjd ASSERT(c < naux); 3123185029Spjd 3124185029Spjd /* 3125185029Spjd * Setting the nvlist in the middle if the array is a little 3126185029Spjd * sketchy, but it will work. 3127185029Spjd */ 3128185029Spjd nvlist_free(aux[c]); 3129219089Spjd aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); 3130185029Spjd 3131185029Spjd return; 3132185029Spjd } 3133185029Spjd 3134185029Spjd /* 3135185029Spjd * The dirty list is protected by the SCL_CONFIG lock. The caller 3136185029Spjd * must either hold SCL_CONFIG as writer, or must be the sync thread 3137185029Spjd * (which holds SCL_CONFIG as reader). There's only one sync thread, 3138168404Spjd * so this is sufficient to ensure mutual exclusion. 
3139168404Spjd */ 3140185029Spjd ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3141185029Spjd (dsl_pool_sync_context(spa_get_dsl(spa)) && 3142185029Spjd spa_config_held(spa, SCL_CONFIG, RW_READER))); 3143168404Spjd 3144168404Spjd if (vd == rvd) { 3145168404Spjd for (c = 0; c < rvd->vdev_children; c++) 3146168404Spjd vdev_config_dirty(rvd->vdev_child[c]); 3147168404Spjd } else { 3148168404Spjd ASSERT(vd == vd->vdev_top); 3149168404Spjd 3150219089Spjd if (!list_link_active(&vd->vdev_config_dirty_node) && 3151219089Spjd !vd->vdev_ishole) 3152185029Spjd list_insert_head(&spa->spa_config_dirty_list, vd); 3153168404Spjd } 3154168404Spjd} 3155168404Spjd 3156168404Spjdvoid 3157168404Spjdvdev_config_clean(vdev_t *vd) 3158168404Spjd{ 3159168404Spjd spa_t *spa = vd->vdev_spa; 3160168404Spjd 3161185029Spjd ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3162185029Spjd (dsl_pool_sync_context(spa_get_dsl(spa)) && 3163185029Spjd spa_config_held(spa, SCL_CONFIG, RW_READER))); 3164168404Spjd 3165185029Spjd ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 3166185029Spjd list_remove(&spa->spa_config_dirty_list, vd); 3167168404Spjd} 3168168404Spjd 3169185029Spjd/* 3170185029Spjd * Mark a top-level vdev's state as dirty, so that the next pass of 3171185029Spjd * spa_sync() can convert this into vdev_config_dirty(). We distinguish 3172185029Spjd * the state changes from larger config changes because they require 3173185029Spjd * much less locking, and are often needed for administrative actions. 3174185029Spjd */ 3175168404Spjdvoid 3176185029Spjdvdev_state_dirty(vdev_t *vd) 3177185029Spjd{ 3178185029Spjd spa_t *spa = vd->vdev_spa; 3179185029Spjd 3180219089Spjd ASSERT(spa_writeable(spa)); 3181185029Spjd ASSERT(vd == vd->vdev_top); 3182185029Spjd 3183185029Spjd /* 3184185029Spjd * The state list is protected by the SCL_STATE lock. 
The caller 3185185029Spjd * must either hold SCL_STATE as writer, or must be the sync thread 3186185029Spjd * (which holds SCL_STATE as reader). There's only one sync thread, 3187185029Spjd * so this is sufficient to ensure mutual exclusion. 3188185029Spjd */ 3189185029Spjd ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3190185029Spjd (dsl_pool_sync_context(spa_get_dsl(spa)) && 3191185029Spjd spa_config_held(spa, SCL_STATE, RW_READER))); 3192185029Spjd 3193219089Spjd if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) 3194185029Spjd list_insert_head(&spa->spa_state_dirty_list, vd); 3195185029Spjd} 3196185029Spjd 3197185029Spjdvoid 3198185029Spjdvdev_state_clean(vdev_t *vd) 3199185029Spjd{ 3200185029Spjd spa_t *spa = vd->vdev_spa; 3201185029Spjd 3202185029Spjd ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3203185029Spjd (dsl_pool_sync_context(spa_get_dsl(spa)) && 3204185029Spjd spa_config_held(spa, SCL_STATE, RW_READER))); 3205185029Spjd 3206185029Spjd ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 3207185029Spjd list_remove(&spa->spa_state_dirty_list, vd); 3208185029Spjd} 3209185029Spjd 3210185029Spjd/* 3211185029Spjd * Propagate vdev state up from children to parent. 3212185029Spjd */ 3213185029Spjdvoid 3214168404Spjdvdev_propagate_state(vdev_t *vd) 3215168404Spjd{ 3216209962Smm spa_t *spa = vd->vdev_spa; 3217209962Smm vdev_t *rvd = spa->spa_root_vdev; 3218168404Spjd int degraded = 0, faulted = 0; 3219168404Spjd int corrupted = 0; 3220168404Spjd vdev_t *child; 3221168404Spjd 3222185029Spjd if (vd->vdev_children > 0) { 3223219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 3224185029Spjd child = vd->vdev_child[c]; 3225168404Spjd 3226219089Spjd /* 3227219089Spjd * Don't factor holes into the decision. 
3228219089Spjd */ 3229219089Spjd if (child->vdev_ishole) 3230219089Spjd continue; 3231219089Spjd 3232185029Spjd if (!vdev_readable(child) || 3233209962Smm (!vdev_writeable(child) && spa_writeable(spa))) { 3234185029Spjd /* 3235185029Spjd * Root special: if there is a top-level log 3236185029Spjd * device, treat the root vdev as if it were 3237185029Spjd * degraded. 3238185029Spjd */ 3239185029Spjd if (child->vdev_islog && vd == rvd) 3240185029Spjd degraded++; 3241185029Spjd else 3242185029Spjd faulted++; 3243185029Spjd } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 3244185029Spjd degraded++; 3245185029Spjd } 3246185029Spjd 3247185029Spjd if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 3248185029Spjd corrupted++; 3249185029Spjd } 3250185029Spjd 3251185029Spjd vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 3252185029Spjd 3253185029Spjd /* 3254185029Spjd * Root special: if there is a top-level vdev that cannot be 3255185029Spjd * opened due to corrupted metadata, then propagate the root 3256185029Spjd * vdev's aux state as 'corrupt' rather than 'insufficient 3257185029Spjd * replicas'. 3258185029Spjd */ 3259185029Spjd if (corrupted && vd == rvd && 3260185029Spjd rvd->vdev_state == VDEV_STATE_CANT_OPEN) 3261185029Spjd vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 3262185029Spjd VDEV_AUX_CORRUPT_DATA); 3263168404Spjd } 3264168404Spjd 3265185029Spjd if (vd->vdev_parent) 3266185029Spjd vdev_propagate_state(vd->vdev_parent); 3267168404Spjd} 3268168404Spjd 3269168404Spjd/* 3270168404Spjd * Set a vdev's state. If this is during an open, we don't update the parent 3271168404Spjd * state, because we're in the process of opening children depth-first. 3272168404Spjd * Otherwise, we propagate the change to the parent. 3273168404Spjd * 3274168404Spjd * If this routine places a device in a faulted state, an appropriate ereport is 3275168404Spjd * generated. 
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	/*
	 * Same state: just refresh the aux (reason) code and return;
	 * none of the transition side effects below apply.
	 */
	if (state == vd->vdev_state) {
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	/* Remember the old state so the ereport below can report it. */
	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device unless the device has requested
	 * a delayed close (i.e. we're about to remove or fault the device).
	 * Otherwise, we keep accessible but invalid devices open forever.
	 * We don't call vdev_close() itself, because that implies some extra
	 * checks (offline, etc) that we don't want here.  This is limited to
	 * leaf devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
	    vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	/*
	 * If we have brought this vdev back into service, we need
	 * to notify fmd so that it can gracefully repair any outstanding
	 * cases due to a missing device.  We do this in all cases, even those
	 * that probably don't correlate to a repaired fault.  This is sure to
	 * catch all cases, and we let the zfs-retire agent sort it out.  If
	 * this is a transient state it's OK, as the retire agent will
	 * double-check the state of the vdev before repairing it.
	 */
	if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
	    vd->vdev_prevstate != state)
		zfs_post_state_change(spa, vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We also let this be if
		 * it is one of our special test online cases, which is only
		 * attempting to online the device and shouldn't generate an FMA
		 * fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import or recovery, we
		 * mark it as "not available", which signifies that it was
		 * never there to begin with.  Failure to open such a device
		 * is not considered an error.
		 */
		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event.  If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {
			const char *class;

			/* Map the aux failure reason to an FMA ereport class. */
			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	/*
	 * During an open we skip propagation: children are being opened
	 * depth-first and the parent will be updated when the open completes.
	 */
	if (!isopen && vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Check the vdev configuration to ensure that it's capable of supporting
 * a root pool.
 *
 * On Solaris, we do not support RAID-Z or partial configuration.  In
 * addition, only a single top-level vdev is allowed and none of the
 * leaves can be wholedisks.
 *
 * For FreeBSD, we can boot from any configuration.  There is a
 * limitation that the boot filesystem must be either uncompressed or
 * compressed with lzjb compression but I'm not sure how to enforce
 * that here.
 */
boolean_t
vdev_is_bootable(vdev_t *vd)
{
#ifdef illumos
	if (!vd->vdev_ops->vdev_op_leaf) {
		char *vdev_type = vd->vdev_ops->vdev_op_type;

		/* Reject multiple top-level vdevs and RAID-Z/missing vdevs. */
		if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
		    vd->vdev_children > 1) {
			return (B_FALSE);
		} else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
		    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
			return (B_FALSE);
		}
	}

	/* The whole subtree must be bootable. */
	for (int c = 0; c < vd->vdev_children; c++) {
		if (!vdev_is_bootable(vd->vdev_child[c]))
			return (B_FALSE);
	}
#endif	/* illumos */
	return (B_TRUE);
}
If the original 3444219089Spjd * vdev was offline or faulted then we transfer that state to the 3445219089Spjd * device in the current vdev tree (nvd). 3446219089Spjd */ 3447213197Smmvoid 3448219089Spjdvdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3449213197Smm{ 3450219089Spjd spa_t *spa = nvd->vdev_spa; 3451213197Smm 3452219089Spjd ASSERT(nvd->vdev_top->vdev_islog); 3453219089Spjd ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3454219089Spjd ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3455213197Smm 3456219089Spjd for (int c = 0; c < nvd->vdev_children; c++) 3457219089Spjd vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3458213197Smm 3459219089Spjd if (nvd->vdev_ops->vdev_op_leaf) { 3460213197Smm /* 3461219089Spjd * Restore the persistent vdev state 3462213197Smm */ 3463219089Spjd nvd->vdev_offline = ovd->vdev_offline; 3464219089Spjd nvd->vdev_faulted = ovd->vdev_faulted; 3465219089Spjd nvd->vdev_degraded = ovd->vdev_degraded; 3466219089Spjd nvd->vdev_removed = ovd->vdev_removed; 3467213197Smm } 3468213197Smm} 3469219089Spjd 3470219089Spjd/* 3471219089Spjd * Determine if a log device has valid content. If the vdev was 3472219089Spjd * removed or faulted in the MOS config then we know that 3473219089Spjd * the content on the log device has already been written to the pool. 3474219089Spjd */ 3475219089Spjdboolean_t 3476219089Spjdvdev_log_state_valid(vdev_t *vd) 3477219089Spjd{ 3478219089Spjd if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3479219089Spjd !vd->vdev_removed) 3480219089Spjd return (B_TRUE); 3481219089Spjd 3482219089Spjd for (int c = 0; c < vd->vdev_children; c++) 3483219089Spjd if (vdev_log_state_valid(vd->vdev_child[c])) 3484219089Spjd return (B_TRUE); 3485219089Spjd 3486219089Spjd return (B_FALSE); 3487219089Spjd} 3488219089Spjd 3489219089Spjd/* 3490219089Spjd * Expand a vdev if possible. 
/*
 * Expand a vdev if possible.
 */
void
vdev_expand(vdev_t *vd, uint64_t txg)
{
	ASSERT(vd->vdev_top == vd);
	ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * If the device grew enough to hold more metaslabs than currently
	 * allocated, initialize the new metaslabs and persist the change.
	 */
	if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
		VERIFY(vdev_metaslab_init(vd, txg) == 0);
		vdev_config_dirty(vd);
	}
}

/*
 * Split a vdev.  Detach vd from its parent; if that leaves the parent
 * with a single child, collapse the now-redundant interior vdev and
 * mark the surviving child as splitting.
 */
void
vdev_split(vdev_t *vd)
{
	vdev_t *cvd, *pvd = vd->vdev_parent;

	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	cvd = pvd->vdev_child[0];
	if (pvd->vdev_children == 1) {
		vdev_remove_parent(cvd);
		cvd->vdev_splitting = B_TRUE;
	}
	vdev_propagate_state(cvd);
}

/*
 * Walk the vdev tree and panic if any leaf has an active I/O that has
 * been outstanding longer than the pool's deadman threshold
 * (spa_deadman_synctime).  Called periodically by the deadman logic.
 */
void
vdev_deadman(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		vdev_deadman(cvd);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
		vdev_queue_t *vq = &vd->vdev_queue;

		/* The queue lock protects the active tree and its zios. */
		mutex_enter(&vq->vq_lock);
		if (avl_numnodes(&vq->vq_active_tree) > 0) {
			spa_t *spa = vd->vdev_spa;
			zio_t *fio;
			uint64_t delta;

			/*
			 * Look at the head of all the pending queues,
			 * if any I/O has been outstanding for longer than
			 * the spa_deadman_synctime we panic the system.
			 */
			fio = avl_first(&vq->vq_active_tree);
			delta = gethrtime() - fio->io_timestamp;
			if (delta > spa_deadman_synctime(spa)) {
				zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
				    "delta %lluns, last io %lluns",
				    fio->io_timestamp, delta,
				    vq->vq_io_complete_ts);
				fm_panic("I/O to pool '%s' appears to be "
				    "hung on vdev guid %llu at '%s'.",
				    spa_name(spa),
				    (long long unsigned int) vd->vdev_guid,
				    vd->vdev_path);
			}
		}
		mutex_exit(&vq->vq_lock);
	}
}