Cross Reference: /freebsd-11-stable/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c

Deleted Added

sdiff udiff text old ( 332525 ) new ( 332530 )

full compact

vdev.c (332525)	vdev.c (332530)
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 / 21 22/ 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 25 * Copyright 2017 Nexenta Systems, Inc. 26 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27 * Copyright (c) 2014 Integros [integros.com] 28 * Copyright 2016 Toomas Soome <tsoome@me.com> 29 * Copyright 2017 Joyent, Inc. 30 / 31 32#include <sys/zfs_context.h> 33#include <sys/fm/fs/zfs.h> 34#include <sys/spa.h> 35#include <sys/spa_impl.h> 36#include <sys/bpobj.h> 37#include <sys/dmu.h> 38#include <sys/dmu_tx.h> 39#include <sys/dsl_dir.h> 40#include <sys/vdev_impl.h> 41#include <sys/uberblock_impl.h> 42#include <sys/metaslab.h> 43#include <sys/metaslab_impl.h> 44#include <sys/space_map.h> 45#include <sys/space_reftree.h> 46#include <sys/zio.h> 47#include <sys/zap.h> 48#include <sys/fs/zfs.h> 49#include <sys/arc.h> 50#include <sys/zil.h> 51#include <sys/dsl_scan.h> 52#include <sys/abd.h> 53#include <sys/trim_map.h> 54 55SYSCTL_DECL(_vfs_zfs); 56SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); 57 58/ 59 * Virtual device management. 60 / 61 62/ 63 * The limit for ZFS to automatically increase a top-level vdev's ashift 64 * from logical ashift to physical ashift. 65 * 66 * Example: one or more 512B emulation child vdevs 67 * child->vdev_ashift = 9 (512 bytes) 68 * child->vdev_physical_ashift = 12 (4096 bytes) 69 * zfs_max_auto_ashift = 11 (2048 bytes) 70 * zfs_min_auto_ashift = 9 (512 bytes) 71 * 72 * On pool creation or the addition of a new top-level vdev, ZFS will 73 * increase the ashift of the top-level vdev to 2048 as limited by 74 * zfs_max_auto_ashift. 75 * 76 * Example: one or more 512B emulation child vdevs 77 * child->vdev_ashift = 9 (512 bytes) 78 * child->vdev_physical_ashift = 12 (4096 bytes) 79 * zfs_max_auto_ashift = 13 (8192 bytes) 80 * zfs_min_auto_ashift = 9 (512 bytes) 81 * 82 * On pool creation or the addition of a new top-level vdev, ZFS will 83 * increase the ashift of the top-level vdev to 4096 to match the 84 * max vdev_physical_ashift. 85 * 86 * Example: one or more 512B emulation child vdevs 87 * child->vdev_ashift = 9 (512 bytes) 88 * child->vdev_physical_ashift = 9 (512 bytes) 89 * zfs_max_auto_ashift = 13 (8192 bytes) 90 * zfs_min_auto_ashift = 12 (4096 bytes) 91 * 92 * On pool creation or the addition of a new top-level vdev, ZFS will 93 * increase the ashift of the top-level vdev to 4096 to match the 94 * zfs_min_auto_ashift. 95 / 96static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; 97static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; 98 99static int 100sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) 101{ 102* uint64_t val; 103 int err; 104 105 val = zfs_max_auto_ashift; 106 err = sysctl_handle_64(oidp, &val, 0, req); 107 if (err != 0 \|\| req->newptr == NULL) 108 return (err); 109 110 if (val > SPA_MAXASHIFT \|\| val < zfs_min_auto_ashift) 111 return (EINVAL); 112 113 zfs_max_auto_ashift = val; 114 115 return (0); 116} 117SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, 118 CTLTYPE_U64 \| CTLFLAG_MPSAFE \| CTLFLAG_RW, 0, sizeof(uint64_t), 119 sysctl_vfs_zfs_max_auto_ashift, "QU", 120 "Max ashift used when optimising for logical -> physical sectors size on " 121 "new top-level vdevs."); 122 123static int 124sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) 125{ 126 uint64_t val; 127 int err; 128 129 val = zfs_min_auto_ashift; 130 err = sysctl_handle_64(oidp, &val, 0, req); 131 if (err != 0 \|\| req->newptr == NULL) 132 return (err); 133 134 if (val < SPA_MINASHIFT \|\| val > zfs_max_auto_ashift) 135 return (EINVAL); 136 137 zfs_min_auto_ashift = val; 138 139 return (0); 140} 141SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, 142 CTLTYPE_U64 \| CTLFLAG_MPSAFE \| CTLFLAG_RW, 0, sizeof(uint64_t), 143 sysctl_vfs_zfs_min_auto_ashift, "QU", 144 "Min ashift used when creating new top-level vdevs."); 145 146static vdev_ops_t vdev_ops_table[] = { 147* &vdev_root_ops, 148 &vdev_raidz_ops, 149 &vdev_mirror_ops, 150 &vdev_replacing_ops, 151 &vdev_spare_ops, 152#ifdef _KERNEL 153 &vdev_geom_ops, 154#else 155 &vdev_disk_ops, 156#endif 157 &vdev_file_ops, 158 &vdev_missing_ops, 159 &vdev_hole_ops, 160 &vdev_indirect_ops, 161 NULL 162}; 163 164 165/* 166 * When a vdev is added, it will be divided into approximately (but no 167 * more than) this number of metaslabs. 168 / 169int metaslabs_per_vdev = 200; 170SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN, 171* &metaslabs_per_vdev, 0, 172 "When a vdev is added, how many metaslabs the vdev should be divided into"); 173	1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 / 21 22/ 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 25 * Copyright 2017 Nexenta Systems, Inc. 26 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27 * Copyright (c) 2014 Integros [integros.com] 28 * Copyright 2016 Toomas Soome <tsoome@me.com> 29 * Copyright 2017 Joyent, Inc. 30 / 31 32#include <sys/zfs_context.h> 33#include <sys/fm/fs/zfs.h> 34#include <sys/spa.h> 35#include <sys/spa_impl.h> 36#include <sys/bpobj.h> 37#include <sys/dmu.h> 38#include <sys/dmu_tx.h> 39#include <sys/dsl_dir.h> 40#include <sys/vdev_impl.h> 41#include <sys/uberblock_impl.h> 42#include <sys/metaslab.h> 43#include <sys/metaslab_impl.h> 44#include <sys/space_map.h> 45#include <sys/space_reftree.h> 46#include <sys/zio.h> 47#include <sys/zap.h> 48#include <sys/fs/zfs.h> 49#include <sys/arc.h> 50#include <sys/zil.h> 51#include <sys/dsl_scan.h> 52#include <sys/abd.h> 53#include <sys/trim_map.h> 54 55SYSCTL_DECL(_vfs_zfs); 56SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); 57 58/ 59 * Virtual device management. 60 / 61 62/ 63 * The limit for ZFS to automatically increase a top-level vdev's ashift 64 * from logical ashift to physical ashift. 65 * 66 * Example: one or more 512B emulation child vdevs 67 * child->vdev_ashift = 9 (512 bytes) 68 * child->vdev_physical_ashift = 12 (4096 bytes) 69 * zfs_max_auto_ashift = 11 (2048 bytes) 70 * zfs_min_auto_ashift = 9 (512 bytes) 71 * 72 * On pool creation or the addition of a new top-level vdev, ZFS will 73 * increase the ashift of the top-level vdev to 2048 as limited by 74 * zfs_max_auto_ashift. 75 * 76 * Example: one or more 512B emulation child vdevs 77 * child->vdev_ashift = 9 (512 bytes) 78 * child->vdev_physical_ashift = 12 (4096 bytes) 79 * zfs_max_auto_ashift = 13 (8192 bytes) 80 * zfs_min_auto_ashift = 9 (512 bytes) 81 * 82 * On pool creation or the addition of a new top-level vdev, ZFS will 83 * increase the ashift of the top-level vdev to 4096 to match the 84 * max vdev_physical_ashift. 85 * 86 * Example: one or more 512B emulation child vdevs 87 * child->vdev_ashift = 9 (512 bytes) 88 * child->vdev_physical_ashift = 9 (512 bytes) 89 * zfs_max_auto_ashift = 13 (8192 bytes) 90 * zfs_min_auto_ashift = 12 (4096 bytes) 91 * 92 * On pool creation or the addition of a new top-level vdev, ZFS will 93 * increase the ashift of the top-level vdev to 4096 to match the 94 * zfs_min_auto_ashift. 95 / 96static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; 97static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; 98 99static int 100sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) 101{ 102* uint64_t val; 103 int err; 104 105 val = zfs_max_auto_ashift; 106 err = sysctl_handle_64(oidp, &val, 0, req); 107 if (err != 0 \|\| req->newptr == NULL) 108 return (err); 109 110 if (val > SPA_MAXASHIFT \|\| val < zfs_min_auto_ashift) 111 return (EINVAL); 112 113 zfs_max_auto_ashift = val; 114 115 return (0); 116} 117SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, 118 CTLTYPE_U64 \| CTLFLAG_MPSAFE \| CTLFLAG_RW, 0, sizeof(uint64_t), 119 sysctl_vfs_zfs_max_auto_ashift, "QU", 120 "Max ashift used when optimising for logical -> physical sectors size on " 121 "new top-level vdevs."); 122 123static int 124sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) 125{ 126 uint64_t val; 127 int err; 128 129 val = zfs_min_auto_ashift; 130 err = sysctl_handle_64(oidp, &val, 0, req); 131 if (err != 0 \|\| req->newptr == NULL) 132 return (err); 133 134 if (val < SPA_MINASHIFT \|\| val > zfs_max_auto_ashift) 135 return (EINVAL); 136 137 zfs_min_auto_ashift = val; 138 139 return (0); 140} 141SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, 142 CTLTYPE_U64 \| CTLFLAG_MPSAFE \| CTLFLAG_RW, 0, sizeof(uint64_t), 143 sysctl_vfs_zfs_min_auto_ashift, "QU", 144 "Min ashift used when creating new top-level vdevs."); 145 146static vdev_ops_t vdev_ops_table[] = { 147* &vdev_root_ops, 148 &vdev_raidz_ops, 149 &vdev_mirror_ops, 150 &vdev_replacing_ops, 151 &vdev_spare_ops, 152#ifdef _KERNEL 153 &vdev_geom_ops, 154#else 155 &vdev_disk_ops, 156#endif 157 &vdev_file_ops, 158 &vdev_missing_ops, 159 &vdev_hole_ops, 160 &vdev_indirect_ops, 161 NULL 162}; 163 164 165/* 166 * When a vdev is added, it will be divided into approximately (but no 167 * more than) this number of metaslabs. 168 / 169int metaslabs_per_vdev = 200; 170SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN, 171* &metaslabs_per_vdev, 0, 172 "When a vdev is added, how many metaslabs the vdev should be divided into"); 173
	174/PRINTFLIKE2/ 175void 176vdev_dbgmsg(vdev_t vd, const char fmt, ...) 177{ 178 va_list adx; 179 char buf[256]; 180 181 va_start(adx, fmt); 182 (void) vsnprintf(buf, sizeof (buf), fmt, adx); 183 va_end(adx); 184 185 if (vd->vdev_path != NULL) { 186 zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, 187 vd->vdev_path, buf); 188 } else { 189 zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", 190 vd->vdev_ops->vdev_op_type, 191 (u_longlong_t)vd->vdev_id, 192 (u_longlong_t)vd->vdev_guid, buf); 193 } 194} 195
174/* 175 * Given a vdev type, return the appropriate ops vector. 176 / 177static vdev_ops_t 178vdev_getops(const char type) 179{ 180* vdev_ops_t ops, opspp; 181* 182 for (opspp = vdev_ops_table; (ops = opspp) != NULL; opspp++) 183* if (strcmp(ops->vdev_op_type, type) == 0) 184 break; 185 186 return (ops); 187} 188 189/* 190 * Default asize function: return the MAX of psize with the asize of 191 * all children. This is what's used by anything other than RAID-Z. 192 / 193uint64_t 194vdev_default_asize(vdev_t vd, uint64_t psize) 195{ 196 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 197 uint64_t csize; 198 199 for (int c = 0; c < vd->vdev_children; c++) { 200 csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 201 asize = MAX(asize, csize); 202 } 203 204 return (asize); 205} 206 207/* 208 * Get the minimum allocatable size. We define the allocatable size as 209 * the vdev's asize rounded to the nearest metaslab. This allows us to 210 * replace or attach devices which don't have the same physical size but 211 * can still satisfy the same number of allocations. 212 / 213uint64_t 214vdev_get_min_asize(vdev_t vd) 215{ 216 vdev_t pvd = vd->vdev_parent; 217* 218 /* 219 * If our parent is NULL (inactive spare or cache) or is the root, 220 * just return our own asize. 221 / 222* if (pvd == NULL) 223 return (vd->vdev_asize); 224 225 /* 226 * The top-level vdev just returns the allocatable size rounded 227 * to the nearest metaslab. 228 / 229* if (vd == vd->vdev_top) 230 return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); 231 232 /* 233 * The allocatable space for a raidz vdev is N * sizeof(smallest child), 234 * so each child must provide at least 1/Nth of its asize. 235 / 236* if (pvd->vdev_ops == &vdev_raidz_ops) 237 return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / 238 pvd->vdev_children); 239 240 return (pvd->vdev_min_asize); 241} 242 243void 244vdev_set_min_asize(vdev_t vd) 245{ 246* vd->vdev_min_asize = vdev_get_min_asize(vd); 247 248 for (int c = 0; c < vd->vdev_children; c++) 249 vdev_set_min_asize(vd->vdev_child[c]); 250} 251 252vdev_t * 253vdev_lookup_top(spa_t spa, uint64_t vdev) 254{ 255* vdev_t rvd = spa->spa_root_vdev; 256* 257 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 258 259 if (vdev < rvd->vdev_children) { 260 ASSERT(rvd->vdev_child[vdev] != NULL); 261 return (rvd->vdev_child[vdev]); 262 } 263 264 return (NULL); 265} 266 267vdev_t * 268vdev_lookup_by_guid(vdev_t vd, uint64_t guid) 269{ 270* vdev_t mvd; 271* 272 if (vd->vdev_guid == guid) 273 return (vd); 274 275 for (int c = 0; c < vd->vdev_children; c++) 276 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 277 NULL) 278 return (mvd); 279 280 return (NULL); 281} 282 283static int 284vdev_count_leaves_impl(vdev_t vd) 285{ 286* int n = 0; 287 288 if (vd->vdev_ops->vdev_op_leaf) 289 return (1); 290 291 for (int c = 0; c < vd->vdev_children; c++) 292 n += vdev_count_leaves_impl(vd->vdev_child[c]); 293 294 return (n); 295} 296 297int 298vdev_count_leaves(spa_t spa) 299{ 300* return (vdev_count_leaves_impl(spa->spa_root_vdev)); 301} 302 303void 304vdev_add_child(vdev_t pvd, vdev_t cvd) 305{ 306 size_t oldsize, newsize; 307 uint64_t id = cvd->vdev_id; 308 vdev_t *newchild; 309* spa_t spa = cvd->vdev_spa; 310* 311 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 312 ASSERT(cvd->vdev_parent == NULL); 313 314 cvd->vdev_parent = pvd; 315 316 if (pvd == NULL) 317 return; 318 319 ASSERT(id >= pvd->vdev_children \|\| pvd->vdev_child[id] == NULL); 320 321 oldsize = pvd->vdev_children * sizeof (vdev_t ); 322* pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 323 newsize = pvd->vdev_children * sizeof (vdev_t ); 324* 325 newchild = kmem_zalloc(newsize, KM_SLEEP); 326 if (pvd->vdev_child != NULL) { 327 bcopy(pvd->vdev_child, newchild, oldsize); 328 kmem_free(pvd->vdev_child, oldsize); 329 } 330 331 pvd->vdev_child = newchild; 332 pvd->vdev_child[id] = cvd; 333 334 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 335 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 336 337 /* 338 * Walk up all ancestors to update guid sum. 339 / 340* for (; pvd != NULL; pvd = pvd->vdev_parent) 341 pvd->vdev_guid_sum += cvd->vdev_guid_sum; 342} 343 344void 345vdev_remove_child(vdev_t pvd, vdev_t cvd) 346{ 347 int c; 348 uint_t id = cvd->vdev_id; 349 350 ASSERT(cvd->vdev_parent == pvd); 351 352 if (pvd == NULL) 353 return; 354 355 ASSERT(id < pvd->vdev_children); 356 ASSERT(pvd->vdev_child[id] == cvd); 357 358 pvd->vdev_child[id] = NULL; 359 cvd->vdev_parent = NULL; 360 361 for (c = 0; c < pvd->vdev_children; c++) 362 if (pvd->vdev_child[c]) 363 break; 364 365 if (c == pvd->vdev_children) { 366 kmem_free(pvd->vdev_child, c * sizeof (vdev_t )); 367* pvd->vdev_child = NULL; 368 pvd->vdev_children = 0; 369 } 370 371 /* 372 * Walk up all ancestors to update guid sum. 373 / 374* for (; pvd != NULL; pvd = pvd->vdev_parent) 375 pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 376} 377 378/* 379 * Remove any holes in the child array. 380 / 381void 382vdev_compact_children(vdev_t pvd) 383{ 384 vdev_t *newchild, cvd; 385 int oldc = pvd->vdev_children; 386 int newc; 387 388 ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 389 390 for (int c = newc = 0; c < oldc; c++) 391 if (pvd->vdev_child[c]) 392 newc++; 393 394 newchild = kmem_alloc(newc * sizeof (vdev_t ), KM_SLEEP); 395* 396 for (int c = newc = 0; c < oldc; c++) { 397 if ((cvd = pvd->vdev_child[c]) != NULL) { 398 newchild[newc] = cvd; 399 cvd->vdev_id = newc++; 400 } 401 } 402 403 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t )); 404* pvd->vdev_child = newchild; 405 pvd->vdev_children = newc; 406} 407 408/* 409 * Allocate and minimally initialize a vdev_t. 410 / 411vdev_t 412vdev_alloc_common(spa_t spa, uint_t id, uint64_t guid, vdev_ops_t ops) 413{ 414 vdev_t vd; 415* vdev_indirect_config_t vic; 416* 417 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 418 vic = &vd->vdev_indirect_config; 419 420 if (spa->spa_root_vdev == NULL) { 421 ASSERT(ops == &vdev_root_ops); 422 spa->spa_root_vdev = vd; 423 spa->spa_load_guid = spa_generate_guid(NULL); 424 } 425 426 if (guid == 0 && ops != &vdev_hole_ops) { 427 if (spa->spa_root_vdev == vd) { 428 /* 429 * The root vdev's guid will also be the pool guid, 430 * which must be unique among all pools. 431 / 432* guid = spa_generate_guid(NULL); 433 } else { 434 /* 435 * Any other vdev's guid must be unique within the pool. 436 / 437* guid = spa_generate_guid(spa); 438 } 439 ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 440 } 441 442 vd->vdev_spa = spa; 443 vd->vdev_id = id; 444 vd->vdev_guid = guid; 445 vd->vdev_guid_sum = guid; 446 vd->vdev_ops = ops; 447 vd->vdev_state = VDEV_STATE_CLOSED; 448 vd->vdev_ishole = (ops == &vdev_hole_ops); 449 vic->vic_prev_indirect_vdev = UINT64_MAX; 450 451 rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); 452 mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); 453 vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); 454 455 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 456 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 457 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); 458 mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); 459 for (int t = 0; t < DTL_TYPES; t++) { 460 vd->vdev_dtl[t] = range_tree_create(NULL, NULL); 461 } 462 txg_list_create(&vd->vdev_ms_list, spa, 463 offsetof(struct metaslab, ms_txg_node)); 464 txg_list_create(&vd->vdev_dtl_list, spa, 465 offsetof(struct vdev, vdev_dtl_node)); 466 vd->vdev_stat.vs_timestamp = gethrtime(); 467 vdev_queue_init(vd); 468 vdev_cache_init(vd); 469 470 return (vd); 471} 472 473/* 474 * Allocate a new vdev. The 'alloctype' is used to control whether we are 475 * creating a new vdev or loading an existing one - the behavior is slightly 476 * different for each case. 477 / 478int 479vdev_alloc(spa_t spa, vdev_t *vdp, nvlist_t nv, vdev_t parent, uint_t id, 480* int alloctype) 481{ 482 vdev_ops_t ops; 483* char type; 484* uint64_t guid = 0, islog, nparity; 485 vdev_t vd; 486* vdev_indirect_config_t vic; 487* 488 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 489 490 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 491 return (SET_ERROR(EINVAL)); 492 493 if ((ops = vdev_getops(type)) == NULL) 494 return (SET_ERROR(EINVAL)); 495 496 /* 497 * If this is a load, get the vdev guid from the nvlist. 498 * Otherwise, vdev_alloc_common() will generate one for us. 499 / 500* if (alloctype == VDEV_ALLOC_LOAD) { 501 uint64_t label_id; 502 503 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) \|\| 504 label_id != id) 505 return (SET_ERROR(EINVAL)); 506 507 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 508 return (SET_ERROR(EINVAL)); 509 } else if (alloctype == VDEV_ALLOC_SPARE) { 510 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 511 return (SET_ERROR(EINVAL)); 512 } else if (alloctype == VDEV_ALLOC_L2CACHE) { 513 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 514 return (SET_ERROR(EINVAL)); 515 } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { 516 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 517 return (SET_ERROR(EINVAL)); 518 } 519 520 /* 521 * The first allocated vdev must be of type 'root'. 522 / 523* if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 524 return (SET_ERROR(EINVAL)); 525 526 /* 527 * Determine whether we're a log vdev. 528 / 529* islog = 0; 530 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 531 if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 532 return (SET_ERROR(ENOTSUP)); 533 534 if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) 535 return (SET_ERROR(ENOTSUP)); 536 537 /* 538 * Set the nparity property for RAID-Z vdevs. 539 / 540* nparity = -1ULL; 541 if (ops == &vdev_raidz_ops) { 542 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 543 &nparity) == 0) { 544 if (nparity == 0 \|\| nparity > VDEV_RAIDZ_MAXPARITY) 545 return (SET_ERROR(EINVAL)); 546 /* 547 * Previous versions could only support 1 or 2 parity 548 * device. 549 / 550* if (nparity > 1 && 551 spa_version(spa) < SPA_VERSION_RAIDZ2) 552 return (SET_ERROR(ENOTSUP)); 553 if (nparity > 2 && 554 spa_version(spa) < SPA_VERSION_RAIDZ3) 555 return (SET_ERROR(ENOTSUP)); 556 } else { 557 /* 558 * We require the parity to be specified for SPAs that 559 * support multiple parity levels. 560 / 561* if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 562 return (SET_ERROR(EINVAL)); 563 /* 564 * Otherwise, we default to 1 parity device for RAID-Z. 565 / 566* nparity = 1; 567 } 568 } else { 569 nparity = 0; 570 } 571 ASSERT(nparity != -1ULL); 572 573 vd = vdev_alloc_common(spa, id, guid, ops); 574 vic = &vd->vdev_indirect_config; 575 576 vd->vdev_islog = islog; 577 vd->vdev_nparity = nparity; 578 579 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 580 vd->vdev_path = spa_strdup(vd->vdev_path); 581 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 582 vd->vdev_devid = spa_strdup(vd->vdev_devid); 583 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 584 &vd->vdev_physpath) == 0) 585 vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 586 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) 587 vd->vdev_fru = spa_strdup(vd->vdev_fru); 588 589 /* 590 * Set the whole_disk property. If it's not specified, leave the value 591 * as -1. 592 / 593* if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 594 &vd->vdev_wholedisk) != 0) 595 vd->vdev_wholedisk = -1ULL; 596 597 ASSERT0(vic->vic_mapping_object); 598 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, 599 &vic->vic_mapping_object); 600 ASSERT0(vic->vic_births_object); 601 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, 602 &vic->vic_births_object); 603 ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); 604 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, 605 &vic->vic_prev_indirect_vdev); 606 607 /* 608 * Look for the 'not present' flag. This will only be set if the device 609 * was not present at the time of import. 610 / 611* (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 612 &vd->vdev_not_present); 613 614 /* 615 * Get the alignment requirement. 616 / 617* (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 618 619 /* 620 * Retrieve the vdev creation time. 621 / 622* (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 623 &vd->vdev_crtxg); 624 625 /* 626 * If we're a top-level vdev, try to load the allocation parameters. 627 / 628* if (parent && !parent->vdev_parent && 629 (alloctype == VDEV_ALLOC_LOAD \|\| alloctype == VDEV_ALLOC_SPLIT)) { 630 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 631 &vd->vdev_ms_array); 632 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 633 &vd->vdev_ms_shift); 634 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 635 &vd->vdev_asize); 636 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, 637 &vd->vdev_removing); 638 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, 639 &vd->vdev_top_zap); 640 } else { 641 ASSERT0(vd->vdev_top_zap); 642 } 643 644 if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { 645 ASSERT(alloctype == VDEV_ALLOC_LOAD \|\| 646 alloctype == VDEV_ALLOC_ADD \|\| 647 alloctype == VDEV_ALLOC_SPLIT \|\| 648 alloctype == VDEV_ALLOC_ROOTPOOL); 649 vd->vdev_mg = metaslab_group_create(islog ? 650 spa_log_class(spa) : spa_normal_class(spa), vd); 651 } 652 653 if (vd->vdev_ops->vdev_op_leaf && 654 (alloctype == VDEV_ALLOC_LOAD \|\| alloctype == VDEV_ALLOC_SPLIT)) { 655 (void) nvlist_lookup_uint64(nv, 656 ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); 657 } else { 658 ASSERT0(vd->vdev_leaf_zap); 659 } 660 661 /* 662 * If we're a leaf vdev, try to load the DTL object and other state. 663 / 664* 665 if (vd->vdev_ops->vdev_op_leaf && 666 (alloctype == VDEV_ALLOC_LOAD \|\| alloctype == VDEV_ALLOC_L2CACHE \|\| 667 alloctype == VDEV_ALLOC_ROOTPOOL)) { 668 if (alloctype == VDEV_ALLOC_LOAD) { 669 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 670 &vd->vdev_dtl_object); 671 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 672 &vd->vdev_unspare); 673 } 674 675 if (alloctype == VDEV_ALLOC_ROOTPOOL) { 676 uint64_t spare = 0; 677 678 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 679 &spare) == 0 && spare) 680 spa_spare_add(vd); 681 } 682 683 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 684 &vd->vdev_offline); 685 686 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, 687 &vd->vdev_resilver_txg); 688 689 /* 690 * When importing a pool, we want to ignore the persistent fault 691 * state, as the diagnosis made on another system may not be 692 * valid in the current context. Local vdevs will 693 * remain in the faulted state. 694 / 695* if (spa_load_state(spa) == SPA_LOAD_OPEN) { 696 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 697 &vd->vdev_faulted); 698 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 699 &vd->vdev_degraded); 700 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 701 &vd->vdev_removed); 702 703 if (vd->vdev_faulted \|\| vd->vdev_degraded) { 704 char aux; 705* 706 vd->vdev_label_aux = 707 VDEV_AUX_ERR_EXCEEDED; 708 if (nvlist_lookup_string(nv, 709 ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && 710 strcmp(aux, "external") == 0) 711 vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 712 } 713 } 714 } 715 716 /* 717 * Add ourselves to the parent's list of children. 718 / 719* vdev_add_child(parent, vd); 720 721 vdp = vd; 722* 723 return (0); 724} 725 726void 727vdev_free(vdev_t vd) 728{ 729* spa_t spa = vd->vdev_spa; 730* 731 /* 732 * vdev_free() implies closing the vdev first. This is simpler than 733 * trying to ensure complicated semantics for all callers. 734 / 735* vdev_close(vd); 736 737 ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 738 ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 739 740 /* 741 * Free all children. 742 / 743* for (int c = 0; c < vd->vdev_children; c++) 744 vdev_free(vd->vdev_child[c]); 745 746 ASSERT(vd->vdev_child == NULL); 747 ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 748 749 /* 750 * Discard allocation state. 751 / 752* if (vd->vdev_mg != NULL) { 753 vdev_metaslab_fini(vd); 754 metaslab_group_destroy(vd->vdev_mg); 755 } 756 757 ASSERT0(vd->vdev_stat.vs_space); 758 ASSERT0(vd->vdev_stat.vs_dspace); 759 ASSERT0(vd->vdev_stat.vs_alloc); 760 761 /* 762 * Remove this vdev from its parent's child list. 763 / 764* vdev_remove_child(vd->vdev_parent, vd); 765 766 ASSERT(vd->vdev_parent == NULL); 767 768 /* 769 * Clean up vdev structure. 770 / 771* vdev_queue_fini(vd); 772 vdev_cache_fini(vd); 773 774 if (vd->vdev_path) 775 spa_strfree(vd->vdev_path); 776 if (vd->vdev_devid) 777 spa_strfree(vd->vdev_devid); 778 if (vd->vdev_physpath) 779 spa_strfree(vd->vdev_physpath); 780 if (vd->vdev_fru) 781 spa_strfree(vd->vdev_fru); 782 783 if (vd->vdev_isspare) 784 spa_spare_remove(vd); 785 if (vd->vdev_isl2cache) 786 spa_l2cache_remove(vd); 787 788 txg_list_destroy(&vd->vdev_ms_list); 789 txg_list_destroy(&vd->vdev_dtl_list); 790 791 mutex_enter(&vd->vdev_dtl_lock); 792 space_map_close(vd->vdev_dtl_sm); 793 for (int t = 0; t < DTL_TYPES; t++) { 794 range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); 795 range_tree_destroy(vd->vdev_dtl[t]); 796 } 797 mutex_exit(&vd->vdev_dtl_lock); 798 799 EQUIV(vd->vdev_indirect_births != NULL, 800 vd->vdev_indirect_mapping != NULL); 801 if (vd->vdev_indirect_births != NULL) { 802 vdev_indirect_mapping_close(vd->vdev_indirect_mapping); 803 vdev_indirect_births_close(vd->vdev_indirect_births); 804 } 805 806 if (vd->vdev_obsolete_sm != NULL) { 807 ASSERT(vd->vdev_removing \|\| 808 vd->vdev_ops == &vdev_indirect_ops); 809 space_map_close(vd->vdev_obsolete_sm); 810 vd->vdev_obsolete_sm = NULL; 811 } 812 range_tree_destroy(vd->vdev_obsolete_segments); 813 rw_destroy(&vd->vdev_indirect_rwlock); 814 mutex_destroy(&vd->vdev_obsolete_lock); 815 816 mutex_destroy(&vd->vdev_queue_lock); 817 mutex_destroy(&vd->vdev_dtl_lock); 818 mutex_destroy(&vd->vdev_stat_lock); 819 mutex_destroy(&vd->vdev_probe_lock); 820 821 if (vd == spa->spa_root_vdev) 822 spa->spa_root_vdev = NULL; 823 824 kmem_free(vd, sizeof (vdev_t)); 825} 826 827/* 828 * Transfer top-level vdev state from svd to tvd. 829 / 830static void 831vdev_top_transfer(vdev_t svd, vdev_t tvd) 832{ 833* spa_t spa = svd->vdev_spa; 834* metaslab_t msp; 835* vdev_t vd; 836* int t; 837 838 ASSERT(tvd == tvd->vdev_top); 839 840 tvd->vdev_ms_array = svd->vdev_ms_array; 841 tvd->vdev_ms_shift = svd->vdev_ms_shift; 842 tvd->vdev_ms_count = svd->vdev_ms_count; 843 tvd->vdev_top_zap = svd->vdev_top_zap; 844 845 svd->vdev_ms_array = 0; 846 svd->vdev_ms_shift = 0; 847 svd->vdev_ms_count = 0; 848 svd->vdev_top_zap = 0; 849 850 if (tvd->vdev_mg) 851 ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); 852 tvd->vdev_mg = svd->vdev_mg; 853 tvd->vdev_ms = svd->vdev_ms; 854 855 svd->vdev_mg = NULL; 856 svd->vdev_ms = NULL; 857 858 if (tvd->vdev_mg != NULL) 859 tvd->vdev_mg->mg_vd = tvd; 860 861 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 862 tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 863 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 864 865 svd->vdev_stat.vs_alloc = 0; 866 svd->vdev_stat.vs_space = 0; 867 svd->vdev_stat.vs_dspace = 0; 868 869 for (t = 0; t < TXG_SIZE; t++) { 870 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 871 (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 872 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 873 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 874 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 875 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 876 } 877 878 if (list_link_active(&svd->vdev_config_dirty_node)) { 879 vdev_config_clean(svd); 880 vdev_config_dirty(tvd); 881 } 882 883 if (list_link_active(&svd->vdev_state_dirty_node)) { 884 vdev_state_clean(svd); 885 vdev_state_dirty(tvd); 886 } 887 888 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 889 svd->vdev_deflate_ratio = 0; 890 891 tvd->vdev_islog = svd->vdev_islog; 892 svd->vdev_islog = 0; 893} 894 895static void 896vdev_top_update(vdev_t tvd, vdev_t vd) 897{ 898 if (vd == NULL) 899 return; 900 901 vd->vdev_top = tvd; 902 903 for (int c = 0; c < vd->vdev_children; c++) 904 vdev_top_update(tvd, vd->vdev_child[c]); 905} 906 907/* 908 * Add a mirror/replacing vdev above an existing vdev. 909 / 910vdev_t 911vdev_add_parent(vdev_t cvd, vdev_ops_t ops) 912{ 913 spa_t spa = cvd->vdev_spa; 914* vdev_t pvd = cvd->vdev_parent; 915* vdev_t mvd; 916* 917 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 918 919 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 920 921 mvd->vdev_asize = cvd->vdev_asize; 922 mvd->vdev_min_asize = cvd->vdev_min_asize; 923 mvd->vdev_max_asize = cvd->vdev_max_asize; 924 mvd->vdev_psize = cvd->vdev_psize; 925 mvd->vdev_ashift = cvd->vdev_ashift; 926 mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; 927 mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; 928 mvd->vdev_state = cvd->vdev_state; 929 mvd->vdev_crtxg = cvd->vdev_crtxg; 930 931 vdev_remove_child(pvd, cvd); 932 vdev_add_child(pvd, mvd); 933 cvd->vdev_id = mvd->vdev_children; 934 vdev_add_child(mvd, cvd); 935 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 936 937 if (mvd == mvd->vdev_top) 938 vdev_top_transfer(cvd, mvd); 939 940 return (mvd); 941} 942 943/* 944 * Remove a 1-way mirror/replacing vdev from the tree. 945 / 946void 947vdev_remove_parent(vdev_t cvd) 948{ 949 vdev_t mvd = cvd->vdev_parent; 950* vdev_t pvd = mvd->vdev_parent; 951* 952 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 953 954 ASSERT(mvd->vdev_children == 1); 955 ASSERT(mvd->vdev_ops == &vdev_mirror_ops \|\| 956 mvd->vdev_ops == &vdev_replacing_ops \|\| 957 mvd->vdev_ops == &vdev_spare_ops); 958 cvd->vdev_ashift = mvd->vdev_ashift; 959 cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; 960 cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; 961 962 vdev_remove_child(mvd, cvd); 963 vdev_remove_child(pvd, mvd); 964 965 /* 966 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. 967 * Otherwise, we could have detached an offline device, and when we 968 * go to import the pool we'll think we have two top-level vdevs, 969 * instead of a different version of the same top-level vdev. 970 / 971* if (mvd->vdev_top == mvd) { 972 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; 973 cvd->vdev_orig_guid = cvd->vdev_guid; 974 cvd->vdev_guid += guid_delta; 975 cvd->vdev_guid_sum += guid_delta; 976 } 977 cvd->vdev_id = mvd->vdev_id; 978 vdev_add_child(pvd, cvd); 979 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 980 981 if (cvd == cvd->vdev_top) 982 vdev_top_transfer(mvd, cvd); 983 984 ASSERT(mvd->vdev_children == 0); 985 vdev_free(mvd); 986} 987 988int 989vdev_metaslab_init(vdev_t vd, uint64_t txg) 990{ 991* spa_t spa = vd->vdev_spa; 992* objset_t mos = spa->spa_meta_objset; 993* uint64_t m; 994 uint64_t oldc = vd->vdev_ms_count; 995 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 996 metaslab_t *mspp; 997* int error; 998 999 ASSERT(txg == 0 \|\| spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1000 1001 /* 1002 * This vdev is not being allocated from yet or is a hole. 1003 / 1004* if (vd->vdev_ms_shift == 0) 1005 return (0); 1006 1007 ASSERT(!vd->vdev_ishole); 1008 1009 ASSERT(oldc <= newc); 1010 1011 mspp = kmem_zalloc(newc * sizeof (mspp), KM_SLEEP); 1012* 1013 if (oldc != 0) { 1014 bcopy(vd->vdev_ms, mspp, oldc * sizeof (mspp)); 1015* kmem_free(vd->vdev_ms, oldc * sizeof (mspp)); 1016* } 1017 1018 vd->vdev_ms = mspp; 1019 vd->vdev_ms_count = newc; 1020 1021 for (m = oldc; m < newc; m++) { 1022 uint64_t object = 0; 1023 1024 /* 1025 * vdev_ms_array may be 0 if we are creating the "fake" 1026 * metaslabs for an indirect vdev for zdb's leak detection. 1027 * See zdb_leak_init(). 1028 / 1029* if (txg == 0 && vd->vdev_ms_array != 0) { 1030 error = dmu_read(mos, vd->vdev_ms_array, 1031 m * sizeof (uint64_t), sizeof (uint64_t), &object, 1032 DMU_READ_PREFETCH);	196/* 197 * Given a vdev type, return the appropriate ops vector. 198 / 199static vdev_ops_t 200vdev_getops(const char type) 201{ 202* vdev_ops_t ops, opspp; 203* 204 for (opspp = vdev_ops_table; (ops = opspp) != NULL; opspp++) 205* if (strcmp(ops->vdev_op_type, type) == 0) 206 break; 207 208 return (ops); 209} 210 211/* 212 * Default asize function: return the MAX of psize with the asize of 213 * all children. This is what's used by anything other than RAID-Z. 214 / 215uint64_t 216vdev_default_asize(vdev_t vd, uint64_t psize) 217{ 218 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 219 uint64_t csize; 220 221 for (int c = 0; c < vd->vdev_children; c++) { 222 csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 223 asize = MAX(asize, csize); 224 } 225 226 return (asize); 227} 228 229/* 230 * Get the minimum allocatable size. We define the allocatable size as 231 * the vdev's asize rounded to the nearest metaslab. This allows us to 232 * replace or attach devices which don't have the same physical size but 233 * can still satisfy the same number of allocations. 234 / 235uint64_t 236vdev_get_min_asize(vdev_t vd) 237{ 238 vdev_t pvd = vd->vdev_parent; 239* 240 /* 241 * If our parent is NULL (inactive spare or cache) or is the root, 242 * just return our own asize. 243 / 244* if (pvd == NULL) 245 return (vd->vdev_asize); 246 247 /* 248 * The top-level vdev just returns the allocatable size rounded 249 * to the nearest metaslab. 250 / 251* if (vd == vd->vdev_top) 252 return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); 253 254 /* 255 * The allocatable space for a raidz vdev is N * sizeof(smallest child), 256 * so each child must provide at least 1/Nth of its asize. 257 / 258* if (pvd->vdev_ops == &vdev_raidz_ops) 259 return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / 260 pvd->vdev_children); 261 262 return (pvd->vdev_min_asize); 263} 264 265void 266vdev_set_min_asize(vdev_t vd) 267{ 268* vd->vdev_min_asize = vdev_get_min_asize(vd); 269 270 for (int c = 0; c < vd->vdev_children; c++) 271 vdev_set_min_asize(vd->vdev_child[c]); 272} 273 274vdev_t * 275vdev_lookup_top(spa_t spa, uint64_t vdev) 276{ 277* vdev_t rvd = spa->spa_root_vdev; 278* 279 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 280 281 if (vdev < rvd->vdev_children) { 282 ASSERT(rvd->vdev_child[vdev] != NULL); 283 return (rvd->vdev_child[vdev]); 284 } 285 286 return (NULL); 287} 288 289vdev_t * 290vdev_lookup_by_guid(vdev_t vd, uint64_t guid) 291{ 292* vdev_t mvd; 293* 294 if (vd->vdev_guid == guid) 295 return (vd); 296 297 for (int c = 0; c < vd->vdev_children; c++) 298 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 299 NULL) 300 return (mvd); 301 302 return (NULL); 303} 304 305static int 306vdev_count_leaves_impl(vdev_t vd) 307{ 308* int n = 0; 309 310 if (vd->vdev_ops->vdev_op_leaf) 311 return (1); 312 313 for (int c = 0; c < vd->vdev_children; c++) 314 n += vdev_count_leaves_impl(vd->vdev_child[c]); 315 316 return (n); 317} 318 319int 320vdev_count_leaves(spa_t spa) 321{ 322* return (vdev_count_leaves_impl(spa->spa_root_vdev)); 323} 324 325void 326vdev_add_child(vdev_t pvd, vdev_t cvd) 327{ 328 size_t oldsize, newsize; 329 uint64_t id = cvd->vdev_id; 330 vdev_t *newchild; 331* spa_t spa = cvd->vdev_spa; 332* 333 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 334 ASSERT(cvd->vdev_parent == NULL); 335 336 cvd->vdev_parent = pvd; 337 338 if (pvd == NULL) 339 return; 340 341 ASSERT(id >= pvd->vdev_children \|\| pvd->vdev_child[id] == NULL); 342 343 oldsize = pvd->vdev_children * sizeof (vdev_t ); 344* pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 345 newsize = pvd->vdev_children * sizeof (vdev_t ); 346* 347 newchild = kmem_zalloc(newsize, KM_SLEEP); 348 if (pvd->vdev_child != NULL) { 349 bcopy(pvd->vdev_child, newchild, oldsize); 350 kmem_free(pvd->vdev_child, oldsize); 351 } 352 353 pvd->vdev_child = newchild; 354 pvd->vdev_child[id] = cvd; 355 356 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 357 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 358 359 /* 360 * Walk up all ancestors to update guid sum. 361 / 362* for (; pvd != NULL; pvd = pvd->vdev_parent) 363 pvd->vdev_guid_sum += cvd->vdev_guid_sum; 364} 365 366void 367vdev_remove_child(vdev_t pvd, vdev_t cvd) 368{ 369 int c; 370 uint_t id = cvd->vdev_id; 371 372 ASSERT(cvd->vdev_parent == pvd); 373 374 if (pvd == NULL) 375 return; 376 377 ASSERT(id < pvd->vdev_children); 378 ASSERT(pvd->vdev_child[id] == cvd); 379 380 pvd->vdev_child[id] = NULL; 381 cvd->vdev_parent = NULL; 382 383 for (c = 0; c < pvd->vdev_children; c++) 384 if (pvd->vdev_child[c]) 385 break; 386 387 if (c == pvd->vdev_children) { 388 kmem_free(pvd->vdev_child, c * sizeof (vdev_t )); 389* pvd->vdev_child = NULL; 390 pvd->vdev_children = 0; 391 } 392 393 /* 394 * Walk up all ancestors to update guid sum. 395 / 396* for (; pvd != NULL; pvd = pvd->vdev_parent) 397 pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 398} 399 400/* 401 * Remove any holes in the child array. 402 / 403void 404vdev_compact_children(vdev_t pvd) 405{ 406 vdev_t *newchild, cvd; 407 int oldc = pvd->vdev_children; 408 int newc; 409 410 ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 411 412 for (int c = newc = 0; c < oldc; c++) 413 if (pvd->vdev_child[c]) 414 newc++; 415 416 newchild = kmem_alloc(newc * sizeof (vdev_t ), KM_SLEEP); 417* 418 for (int c = newc = 0; c < oldc; c++) { 419 if ((cvd = pvd->vdev_child[c]) != NULL) { 420 newchild[newc] = cvd; 421 cvd->vdev_id = newc++; 422 } 423 } 424 425 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t )); 426* pvd->vdev_child = newchild; 427 pvd->vdev_children = newc; 428} 429 430/* 431 * Allocate and minimally initialize a vdev_t. 432 / 433vdev_t 434vdev_alloc_common(spa_t spa, uint_t id, uint64_t guid, vdev_ops_t ops) 435{ 436 vdev_t vd; 437* vdev_indirect_config_t vic; 438* 439 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 440 vic = &vd->vdev_indirect_config; 441 442 if (spa->spa_root_vdev == NULL) { 443 ASSERT(ops == &vdev_root_ops); 444 spa->spa_root_vdev = vd; 445 spa->spa_load_guid = spa_generate_guid(NULL); 446 } 447 448 if (guid == 0 && ops != &vdev_hole_ops) { 449 if (spa->spa_root_vdev == vd) { 450 /* 451 * The root vdev's guid will also be the pool guid, 452 * which must be unique among all pools. 453 / 454* guid = spa_generate_guid(NULL); 455 } else { 456 /* 457 * Any other vdev's guid must be unique within the pool. 458 / 459* guid = spa_generate_guid(spa); 460 } 461 ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 462 } 463 464 vd->vdev_spa = spa; 465 vd->vdev_id = id; 466 vd->vdev_guid = guid; 467 vd->vdev_guid_sum = guid; 468 vd->vdev_ops = ops; 469 vd->vdev_state = VDEV_STATE_CLOSED; 470 vd->vdev_ishole = (ops == &vdev_hole_ops); 471 vic->vic_prev_indirect_vdev = UINT64_MAX; 472 473 rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); 474 mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); 475 vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); 476 477 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 478 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 479 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); 480 mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); 481 for (int t = 0; t < DTL_TYPES; t++) { 482 vd->vdev_dtl[t] = range_tree_create(NULL, NULL); 483 } 484 txg_list_create(&vd->vdev_ms_list, spa, 485 offsetof(struct metaslab, ms_txg_node)); 486 txg_list_create(&vd->vdev_dtl_list, spa, 487 offsetof(struct vdev, vdev_dtl_node)); 488 vd->vdev_stat.vs_timestamp = gethrtime(); 489 vdev_queue_init(vd); 490 vdev_cache_init(vd); 491 492 return (vd); 493} 494 495/* 496 * Allocate a new vdev. The 'alloctype' is used to control whether we are 497 * creating a new vdev or loading an existing one - the behavior is slightly 498 * different for each case. 499 / 500int 501vdev_alloc(spa_t spa, vdev_t *vdp, nvlist_t nv, vdev_t parent, uint_t id, 502* int alloctype) 503{ 504 vdev_ops_t ops; 505* char type; 506* uint64_t guid = 0, islog, nparity; 507 vdev_t vd; 508* vdev_indirect_config_t vic; 509* 510 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 511 512 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 513 return (SET_ERROR(EINVAL)); 514 515 if ((ops = vdev_getops(type)) == NULL) 516 return (SET_ERROR(EINVAL)); 517 518 /* 519 * If this is a load, get the vdev guid from the nvlist. 520 * Otherwise, vdev_alloc_common() will generate one for us. 521 / 522* if (alloctype == VDEV_ALLOC_LOAD) { 523 uint64_t label_id; 524 525 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) \|\| 526 label_id != id) 527 return (SET_ERROR(EINVAL)); 528 529 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 530 return (SET_ERROR(EINVAL)); 531 } else if (alloctype == VDEV_ALLOC_SPARE) { 532 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 533 return (SET_ERROR(EINVAL)); 534 } else if (alloctype == VDEV_ALLOC_L2CACHE) { 535 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 536 return (SET_ERROR(EINVAL)); 537 } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { 538 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 539 return (SET_ERROR(EINVAL)); 540 } 541 542 /* 543 * The first allocated vdev must be of type 'root'. 544 / 545* if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 546 return (SET_ERROR(EINVAL)); 547 548 /* 549 * Determine whether we're a log vdev. 550 / 551* islog = 0; 552 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 553 if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 554 return (SET_ERROR(ENOTSUP)); 555 556 if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) 557 return (SET_ERROR(ENOTSUP)); 558 559 /* 560 * Set the nparity property for RAID-Z vdevs. 561 / 562* nparity = -1ULL; 563 if (ops == &vdev_raidz_ops) { 564 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 565 &nparity) == 0) { 566 if (nparity == 0 \|\| nparity > VDEV_RAIDZ_MAXPARITY) 567 return (SET_ERROR(EINVAL)); 568 /* 569 * Previous versions could only support 1 or 2 parity 570 * device. 571 / 572* if (nparity > 1 && 573 spa_version(spa) < SPA_VERSION_RAIDZ2) 574 return (SET_ERROR(ENOTSUP)); 575 if (nparity > 2 && 576 spa_version(spa) < SPA_VERSION_RAIDZ3) 577 return (SET_ERROR(ENOTSUP)); 578 } else { 579 /* 580 * We require the parity to be specified for SPAs that 581 * support multiple parity levels. 582 / 583* if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 584 return (SET_ERROR(EINVAL)); 585 /* 586 * Otherwise, we default to 1 parity device for RAID-Z. 587 / 588* nparity = 1; 589 } 590 } else { 591 nparity = 0; 592 } 593 ASSERT(nparity != -1ULL); 594 595 vd = vdev_alloc_common(spa, id, guid, ops); 596 vic = &vd->vdev_indirect_config; 597 598 vd->vdev_islog = islog; 599 vd->vdev_nparity = nparity; 600 601 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 602 vd->vdev_path = spa_strdup(vd->vdev_path); 603 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 604 vd->vdev_devid = spa_strdup(vd->vdev_devid); 605 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 606 &vd->vdev_physpath) == 0) 607 vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 608 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) 609 vd->vdev_fru = spa_strdup(vd->vdev_fru); 610 611 /* 612 * Set the whole_disk property. If it's not specified, leave the value 613 * as -1. 614 / 615* if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 616 &vd->vdev_wholedisk) != 0) 617 vd->vdev_wholedisk = -1ULL; 618 619 ASSERT0(vic->vic_mapping_object); 620 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, 621 &vic->vic_mapping_object); 622 ASSERT0(vic->vic_births_object); 623 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, 624 &vic->vic_births_object); 625 ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); 626 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, 627 &vic->vic_prev_indirect_vdev); 628 629 /* 630 * Look for the 'not present' flag. This will only be set if the device 631 * was not present at the time of import. 632 / 633* (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 634 &vd->vdev_not_present); 635 636 /* 637 * Get the alignment requirement. 638 / 639* (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 640 641 /* 642 * Retrieve the vdev creation time. 643 / 644* (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 645 &vd->vdev_crtxg); 646 647 /* 648 * If we're a top-level vdev, try to load the allocation parameters. 649 / 650* if (parent && !parent->vdev_parent && 651 (alloctype == VDEV_ALLOC_LOAD \|\| alloctype == VDEV_ALLOC_SPLIT)) { 652 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 653 &vd->vdev_ms_array); 654 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 655 &vd->vdev_ms_shift); 656 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 657 &vd->vdev_asize); 658 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, 659 &vd->vdev_removing); 660 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, 661 &vd->vdev_top_zap); 662 } else { 663 ASSERT0(vd->vdev_top_zap); 664 } 665 666 if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { 667 ASSERT(alloctype == VDEV_ALLOC_LOAD \|\| 668 alloctype == VDEV_ALLOC_ADD \|\| 669 alloctype == VDEV_ALLOC_SPLIT \|\| 670 alloctype == VDEV_ALLOC_ROOTPOOL); 671 vd->vdev_mg = metaslab_group_create(islog ? 672 spa_log_class(spa) : spa_normal_class(spa), vd); 673 } 674 675 if (vd->vdev_ops->vdev_op_leaf && 676 (alloctype == VDEV_ALLOC_LOAD \|\| alloctype == VDEV_ALLOC_SPLIT)) { 677 (void) nvlist_lookup_uint64(nv, 678 ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); 679 } else { 680 ASSERT0(vd->vdev_leaf_zap); 681 } 682 683 /* 684 * If we're a leaf vdev, try to load the DTL object and other state. 685 / 686* 687 if (vd->vdev_ops->vdev_op_leaf && 688 (alloctype == VDEV_ALLOC_LOAD \|\| alloctype == VDEV_ALLOC_L2CACHE \|\| 689 alloctype == VDEV_ALLOC_ROOTPOOL)) { 690 if (alloctype == VDEV_ALLOC_LOAD) { 691 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 692 &vd->vdev_dtl_object); 693 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 694 &vd->vdev_unspare); 695 } 696 697 if (alloctype == VDEV_ALLOC_ROOTPOOL) { 698 uint64_t spare = 0; 699 700 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 701 &spare) == 0 && spare) 702 spa_spare_add(vd); 703 } 704 705 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 706 &vd->vdev_offline); 707 708 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, 709 &vd->vdev_resilver_txg); 710 711 /* 712 * When importing a pool, we want to ignore the persistent fault 713 * state, as the diagnosis made on another system may not be 714 * valid in the current context. Local vdevs will 715 * remain in the faulted state. 716 / 717* if (spa_load_state(spa) == SPA_LOAD_OPEN) { 718 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 719 &vd->vdev_faulted); 720 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 721 &vd->vdev_degraded); 722 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 723 &vd->vdev_removed); 724 725 if (vd->vdev_faulted \|\| vd->vdev_degraded) { 726 char aux; 727* 728 vd->vdev_label_aux = 729 VDEV_AUX_ERR_EXCEEDED; 730 if (nvlist_lookup_string(nv, 731 ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && 732 strcmp(aux, "external") == 0) 733 vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 734 } 735 } 736 } 737 738 /* 739 * Add ourselves to the parent's list of children. 740 / 741* vdev_add_child(parent, vd); 742 743 vdp = vd; 744* 745 return (0); 746} 747 748void 749vdev_free(vdev_t vd) 750{ 751* spa_t spa = vd->vdev_spa; 752* 753 /* 754 * vdev_free() implies closing the vdev first. This is simpler than 755 * trying to ensure complicated semantics for all callers. 756 / 757* vdev_close(vd); 758 759 ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 760 ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 761 762 /* 763 * Free all children. 764 / 765* for (int c = 0; c < vd->vdev_children; c++) 766 vdev_free(vd->vdev_child[c]); 767 768 ASSERT(vd->vdev_child == NULL); 769 ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 770 771 /* 772 * Discard allocation state. 773 / 774* if (vd->vdev_mg != NULL) { 775 vdev_metaslab_fini(vd); 776 metaslab_group_destroy(vd->vdev_mg); 777 } 778 779 ASSERT0(vd->vdev_stat.vs_space); 780 ASSERT0(vd->vdev_stat.vs_dspace); 781 ASSERT0(vd->vdev_stat.vs_alloc); 782 783 /* 784 * Remove this vdev from its parent's child list. 785 / 786* vdev_remove_child(vd->vdev_parent, vd); 787 788 ASSERT(vd->vdev_parent == NULL); 789 790 /* 791 * Clean up vdev structure. 792 / 793* vdev_queue_fini(vd); 794 vdev_cache_fini(vd); 795 796 if (vd->vdev_path) 797 spa_strfree(vd->vdev_path); 798 if (vd->vdev_devid) 799 spa_strfree(vd->vdev_devid); 800 if (vd->vdev_physpath) 801 spa_strfree(vd->vdev_physpath); 802 if (vd->vdev_fru) 803 spa_strfree(vd->vdev_fru); 804 805 if (vd->vdev_isspare) 806 spa_spare_remove(vd); 807 if (vd->vdev_isl2cache) 808 spa_l2cache_remove(vd); 809 810 txg_list_destroy(&vd->vdev_ms_list); 811 txg_list_destroy(&vd->vdev_dtl_list); 812 813 mutex_enter(&vd->vdev_dtl_lock); 814 space_map_close(vd->vdev_dtl_sm); 815 for (int t = 0; t < DTL_TYPES; t++) { 816 range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); 817 range_tree_destroy(vd->vdev_dtl[t]); 818 } 819 mutex_exit(&vd->vdev_dtl_lock); 820 821 EQUIV(vd->vdev_indirect_births != NULL, 822 vd->vdev_indirect_mapping != NULL); 823 if (vd->vdev_indirect_births != NULL) { 824 vdev_indirect_mapping_close(vd->vdev_indirect_mapping); 825 vdev_indirect_births_close(vd->vdev_indirect_births); 826 } 827 828 if (vd->vdev_obsolete_sm != NULL) { 829 ASSERT(vd->vdev_removing \|\| 830 vd->vdev_ops == &vdev_indirect_ops); 831 space_map_close(vd->vdev_obsolete_sm); 832 vd->vdev_obsolete_sm = NULL; 833 } 834 range_tree_destroy(vd->vdev_obsolete_segments); 835 rw_destroy(&vd->vdev_indirect_rwlock); 836 mutex_destroy(&vd->vdev_obsolete_lock); 837 838 mutex_destroy(&vd->vdev_queue_lock); 839 mutex_destroy(&vd->vdev_dtl_lock); 840 mutex_destroy(&vd->vdev_stat_lock); 841 mutex_destroy(&vd->vdev_probe_lock); 842 843 if (vd == spa->spa_root_vdev) 844 spa->spa_root_vdev = NULL; 845 846 kmem_free(vd, sizeof (vdev_t)); 847} 848 849/* 850 * Transfer top-level vdev state from svd to tvd. 851 / 852static void 853vdev_top_transfer(vdev_t svd, vdev_t tvd) 854{ 855* spa_t spa = svd->vdev_spa; 856* metaslab_t msp; 857* vdev_t vd; 858* int t; 859 860 ASSERT(tvd == tvd->vdev_top); 861 862 tvd->vdev_ms_array = svd->vdev_ms_array; 863 tvd->vdev_ms_shift = svd->vdev_ms_shift; 864 tvd->vdev_ms_count = svd->vdev_ms_count; 865 tvd->vdev_top_zap = svd->vdev_top_zap; 866 867 svd->vdev_ms_array = 0; 868 svd->vdev_ms_shift = 0; 869 svd->vdev_ms_count = 0; 870 svd->vdev_top_zap = 0; 871 872 if (tvd->vdev_mg) 873 ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); 874 tvd->vdev_mg = svd->vdev_mg; 875 tvd->vdev_ms = svd->vdev_ms; 876 877 svd->vdev_mg = NULL; 878 svd->vdev_ms = NULL; 879 880 if (tvd->vdev_mg != NULL) 881 tvd->vdev_mg->mg_vd = tvd; 882 883 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 884 tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 885 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 886 887 svd->vdev_stat.vs_alloc = 0; 888 svd->vdev_stat.vs_space = 0; 889 svd->vdev_stat.vs_dspace = 0; 890 891 for (t = 0; t < TXG_SIZE; t++) { 892 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 893 (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 894 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 895 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 896 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 897 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 898 } 899 900 if (list_link_active(&svd->vdev_config_dirty_node)) { 901 vdev_config_clean(svd); 902 vdev_config_dirty(tvd); 903 } 904 905 if (list_link_active(&svd->vdev_state_dirty_node)) { 906 vdev_state_clean(svd); 907 vdev_state_dirty(tvd); 908 } 909 910 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 911 svd->vdev_deflate_ratio = 0; 912 913 tvd->vdev_islog = svd->vdev_islog; 914 svd->vdev_islog = 0; 915} 916 917static void 918vdev_top_update(vdev_t tvd, vdev_t vd) 919{ 920 if (vd == NULL) 921 return; 922 923 vd->vdev_top = tvd; 924 925 for (int c = 0; c < vd->vdev_children; c++) 926 vdev_top_update(tvd, vd->vdev_child[c]); 927} 928 929/* 930 * Add a mirror/replacing vdev above an existing vdev. 931 / 932vdev_t 933vdev_add_parent(vdev_t cvd, vdev_ops_t ops) 934{ 935 spa_t spa = cvd->vdev_spa; 936* vdev_t pvd = cvd->vdev_parent; 937* vdev_t mvd; 938* 939 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 940 941 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 942 943 mvd->vdev_asize = cvd->vdev_asize; 944 mvd->vdev_min_asize = cvd->vdev_min_asize; 945 mvd->vdev_max_asize = cvd->vdev_max_asize; 946 mvd->vdev_psize = cvd->vdev_psize; 947 mvd->vdev_ashift = cvd->vdev_ashift; 948 mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; 949 mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; 950 mvd->vdev_state = cvd->vdev_state; 951 mvd->vdev_crtxg = cvd->vdev_crtxg; 952 953 vdev_remove_child(pvd, cvd); 954 vdev_add_child(pvd, mvd); 955 cvd->vdev_id = mvd->vdev_children; 956 vdev_add_child(mvd, cvd); 957 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 958 959 if (mvd == mvd->vdev_top) 960 vdev_top_transfer(cvd, mvd); 961 962 return (mvd); 963} 964 965/* 966 * Remove a 1-way mirror/replacing vdev from the tree. 967 / 968void 969vdev_remove_parent(vdev_t cvd) 970{ 971 vdev_t mvd = cvd->vdev_parent; 972* vdev_t pvd = mvd->vdev_parent; 973* 974 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 975 976 ASSERT(mvd->vdev_children == 1); 977 ASSERT(mvd->vdev_ops == &vdev_mirror_ops \|\| 978 mvd->vdev_ops == &vdev_replacing_ops \|\| 979 mvd->vdev_ops == &vdev_spare_ops); 980 cvd->vdev_ashift = mvd->vdev_ashift; 981 cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; 982 cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; 983 984 vdev_remove_child(mvd, cvd); 985 vdev_remove_child(pvd, mvd); 986 987 /* 988 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. 989 * Otherwise, we could have detached an offline device, and when we 990 * go to import the pool we'll think we have two top-level vdevs, 991 * instead of a different version of the same top-level vdev. 992 / 993* if (mvd->vdev_top == mvd) { 994 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; 995 cvd->vdev_orig_guid = cvd->vdev_guid; 996 cvd->vdev_guid += guid_delta; 997 cvd->vdev_guid_sum += guid_delta; 998 } 999 cvd->vdev_id = mvd->vdev_id; 1000 vdev_add_child(pvd, cvd); 1001 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 1002 1003 if (cvd == cvd->vdev_top) 1004 vdev_top_transfer(mvd, cvd); 1005 1006 ASSERT(mvd->vdev_children == 0); 1007 vdev_free(mvd); 1008} 1009 1010int 1011vdev_metaslab_init(vdev_t vd, uint64_t txg) 1012{ 1013* spa_t spa = vd->vdev_spa; 1014* objset_t mos = spa->spa_meta_objset; 1015* uint64_t m; 1016 uint64_t oldc = vd->vdev_ms_count; 1017 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 1018 metaslab_t *mspp; 1019* int error; 1020 1021 ASSERT(txg == 0 \|\| spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1022 1023 /* 1024 * This vdev is not being allocated from yet or is a hole. 1025 / 1026* if (vd->vdev_ms_shift == 0) 1027 return (0); 1028 1029 ASSERT(!vd->vdev_ishole); 1030 1031 ASSERT(oldc <= newc); 1032 1033 mspp = kmem_zalloc(newc * sizeof (mspp), KM_SLEEP); 1034* 1035 if (oldc != 0) { 1036 bcopy(vd->vdev_ms, mspp, oldc * sizeof (mspp)); 1037* kmem_free(vd->vdev_ms, oldc * sizeof (mspp)); 1038* } 1039 1040 vd->vdev_ms = mspp; 1041 vd->vdev_ms_count = newc; 1042 1043 for (m = oldc; m < newc; m++) { 1044 uint64_t object = 0; 1045 1046 /* 1047 * vdev_ms_array may be 0 if we are creating the "fake" 1048 * metaslabs for an indirect vdev for zdb's leak detection. 1049 * See zdb_leak_init(). 1050 / 1051* if (txg == 0 && vd->vdev_ms_array != 0) { 1052 error = dmu_read(mos, vd->vdev_ms_array, 1053 m * sizeof (uint64_t), sizeof (uint64_t), &object, 1054 DMU_READ_PREFETCH);
1033 if (error)	1055 if (error != 0) { 1056 vdev_dbgmsg(vd, "unable to read the metaslab " 1057 "array [error=%d]", error);
1034 return (error);	1058 return (error);
	1059 }
1035 } 1036 1037 error = metaslab_init(vd->vdev_mg, m, object, txg, 1038 &(vd->vdev_ms[m]));	1060 } 1061 1062 error = metaslab_init(vd->vdev_mg, m, object, txg, 1063 &(vd->vdev_ms[m]));
1039 if (error)	1064 if (error != 0) { 1065 vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", 1066 error);
1040 return (error);	1067 return (error);
	1068 }
1041 } 1042 1043 if (txg == 0) 1044 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); 1045 1046 /* 1047 * If the vdev is being removed we don't activate 1048 * the metaslabs since we want to ensure that no new 1049 * allocations are performed on this device. 1050 / 1051* if (oldc == 0 && !vd->vdev_removing) 1052 metaslab_group_activate(vd->vdev_mg); 1053 1054 if (txg == 0) 1055 spa_config_exit(spa, SCL_ALLOC, FTAG); 1056 1057 return (0); 1058} 1059 1060void 1061vdev_metaslab_fini(vdev_t vd) 1062{ 1063* if (vd->vdev_ms != NULL) { 1064 uint64_t count = vd->vdev_ms_count; 1065 1066 metaslab_group_passivate(vd->vdev_mg); 1067 for (uint64_t m = 0; m < count; m++) { 1068 metaslab_t msp = vd->vdev_ms[m]; 1069* 1070 if (msp != NULL) 1071 metaslab_fini(msp); 1072 } 1073 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t )); 1074* vd->vdev_ms = NULL; 1075 1076 vd->vdev_ms_count = 0; 1077 } 1078 ASSERT0(vd->vdev_ms_count); 1079} 1080 1081typedef struct vdev_probe_stats { 1082 boolean_t vps_readable; 1083 boolean_t vps_writeable; 1084 int vps_flags; 1085} vdev_probe_stats_t; 1086 1087static void 1088vdev_probe_done(zio_t zio) 1089{ 1090* spa_t spa = zio->io_spa; 1091* vdev_t vd = zio->io_vd; 1092* vdev_probe_stats_t vps = zio->io_private; 1093* 1094 ASSERT(vd->vdev_probe_zio != NULL); 1095 1096 if (zio->io_type == ZIO_TYPE_READ) { 1097 if (zio->io_error == 0) 1098 vps->vps_readable = 1; 1099 if (zio->io_error == 0 && spa_writeable(spa)) { 1100 zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, 1101 zio->io_offset, zio->io_size, zio->io_abd, 1102 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1103 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); 1104 } else { 1105 abd_free(zio->io_abd); 1106 } 1107 } else if (zio->io_type == ZIO_TYPE_WRITE) { 1108 if (zio->io_error == 0) 1109 vps->vps_writeable = 1; 1110 abd_free(zio->io_abd); 1111 } else if (zio->io_type == ZIO_TYPE_NULL) { 1112 zio_t pio; 1113* 1114 vd->vdev_cant_read \|= !vps->vps_readable; 1115 vd->vdev_cant_write \|= !vps->vps_writeable; 1116 1117 if (vdev_readable(vd) && 1118 (vdev_writeable(vd) \|\| !spa_writeable(spa))) { 1119 zio->io_error = 0; 1120 } else { 1121 ASSERT(zio->io_error != 0);	1069 } 1070 1071 if (txg == 0) 1072 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); 1073 1074 /* 1075 * If the vdev is being removed we don't activate 1076 * the metaslabs since we want to ensure that no new 1077 * allocations are performed on this device. 1078 / 1079* if (oldc == 0 && !vd->vdev_removing) 1080 metaslab_group_activate(vd->vdev_mg); 1081 1082 if (txg == 0) 1083 spa_config_exit(spa, SCL_ALLOC, FTAG); 1084 1085 return (0); 1086} 1087 1088void 1089vdev_metaslab_fini(vdev_t vd) 1090{ 1091* if (vd->vdev_ms != NULL) { 1092 uint64_t count = vd->vdev_ms_count; 1093 1094 metaslab_group_passivate(vd->vdev_mg); 1095 for (uint64_t m = 0; m < count; m++) { 1096 metaslab_t msp = vd->vdev_ms[m]; 1097* 1098 if (msp != NULL) 1099 metaslab_fini(msp); 1100 } 1101 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t )); 1102* vd->vdev_ms = NULL; 1103 1104 vd->vdev_ms_count = 0; 1105 } 1106 ASSERT0(vd->vdev_ms_count); 1107} 1108 1109typedef struct vdev_probe_stats { 1110 boolean_t vps_readable; 1111 boolean_t vps_writeable; 1112 int vps_flags; 1113} vdev_probe_stats_t; 1114 1115static void 1116vdev_probe_done(zio_t zio) 1117{ 1118* spa_t spa = zio->io_spa; 1119* vdev_t vd = zio->io_vd; 1120* vdev_probe_stats_t vps = zio->io_private; 1121* 1122 ASSERT(vd->vdev_probe_zio != NULL); 1123 1124 if (zio->io_type == ZIO_TYPE_READ) { 1125 if (zio->io_error == 0) 1126 vps->vps_readable = 1; 1127 if (zio->io_error == 0 && spa_writeable(spa)) { 1128 zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, 1129 zio->io_offset, zio->io_size, zio->io_abd, 1130 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1131 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); 1132 } else { 1133 abd_free(zio->io_abd); 1134 } 1135 } else if (zio->io_type == ZIO_TYPE_WRITE) { 1136 if (zio->io_error == 0) 1137 vps->vps_writeable = 1; 1138 abd_free(zio->io_abd); 1139 } else if (zio->io_type == ZIO_TYPE_NULL) { 1140 zio_t pio; 1141* 1142 vd->vdev_cant_read \|= !vps->vps_readable; 1143 vd->vdev_cant_write \|= !vps->vps_writeable; 1144 1145 if (vdev_readable(vd) && 1146 (vdev_writeable(vd) \|\| !spa_writeable(spa))) { 1147 zio->io_error = 0; 1148 } else { 1149 ASSERT(zio->io_error != 0);
1122 zfs_dbgmsg("failed probe on vdev %llu", 1123 (longlong_t)vd->vdev_id);	1150 vdev_dbgmsg(vd, "failed probe");
1124 zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 1125 spa, vd, NULL, 0, 0); 1126 zio->io_error = SET_ERROR(ENXIO); 1127 } 1128 1129 mutex_enter(&vd->vdev_probe_lock); 1130 ASSERT(vd->vdev_probe_zio == zio); 1131 vd->vdev_probe_zio = NULL; 1132 mutex_exit(&vd->vdev_probe_lock); 1133 1134 zio_link_t zl = NULL; 1135* while ((pio = zio_walk_parents(zio, &zl)) != NULL) 1136 if (!vdev_accessible(vd, pio)) 1137 pio->io_error = SET_ERROR(ENXIO); 1138 1139 kmem_free(vps, sizeof (vps)); 1140* } 1141} 1142 1143/* 1144 * Determine whether this device is accessible. 1145 * 1146 * Read and write to several known locations: the pad regions of each 1147 * vdev label but the first, which we leave alone in case it contains 1148 * a VTOC. 1149 / 1150zio_t 1151vdev_probe(vdev_t vd, zio_t zio) 1152{ 1153 spa_t spa = vd->vdev_spa; 1154* vdev_probe_stats_t vps = NULL; 1155* zio_t pio; 1156* 1157 ASSERT(vd->vdev_ops->vdev_op_leaf); 1158 1159 /* 1160 * Don't probe the probe. 1161 / 1162* if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) 1163 return (NULL); 1164 1165 /* 1166 * To prevent 'probe storms' when a device fails, we create 1167 * just one probe i/o at a time. All zios that want to probe 1168 * this vdev will become parents of the probe io. 1169 / 1170* mutex_enter(&vd->vdev_probe_lock); 1171 1172 if ((pio = vd->vdev_probe_zio) == NULL) { 1173 vps = kmem_zalloc(sizeof (vps), KM_SLEEP); 1174* 1175 vps->vps_flags = ZIO_FLAG_CANFAIL \| ZIO_FLAG_PROBE \| 1176 ZIO_FLAG_DONT_CACHE \| ZIO_FLAG_DONT_AGGREGATE \| 1177 ZIO_FLAG_TRYHARD; 1178 1179 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { 1180 /* 1181 * vdev_cant_read and vdev_cant_write can only 1182 * transition from TRUE to FALSE when we have the 1183 * SCL_ZIO lock as writer; otherwise they can only 1184 * transition from FALSE to TRUE. This ensures that 1185 * any zio looking at these values can assume that 1186 * failures persist for the life of the I/O. That's 1187 * important because when a device has intermittent 1188 * connectivity problems, we want to ensure that 1189 * they're ascribed to the device (ENXIO) and not 1190 * the zio (EIO). 1191 * 1192 * Since we hold SCL_ZIO as writer here, clear both 1193 * values so the probe can reevaluate from first 1194 * principles. 1195 / 1196* vps->vps_flags \|= ZIO_FLAG_CONFIG_WRITER; 1197 vd->vdev_cant_read = B_FALSE; 1198 vd->vdev_cant_write = B_FALSE; 1199 } 1200 1201 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, 1202 vdev_probe_done, vps, 1203 vps->vps_flags \| ZIO_FLAG_DONT_PROPAGATE); 1204 1205 /* 1206 * We can't change the vdev state in this context, so we 1207 * kick off an async task to do it on our behalf. 1208 / 1209* if (zio != NULL) { 1210 vd->vdev_probe_wanted = B_TRUE; 1211 spa_async_request(spa, SPA_ASYNC_PROBE); 1212 } 1213 } 1214 1215 if (zio != NULL) 1216 zio_add_child(zio, pio); 1217 1218 mutex_exit(&vd->vdev_probe_lock); 1219 1220 if (vps == NULL) { 1221 ASSERT(zio != NULL); 1222 return (NULL); 1223 } 1224 1225 for (int l = 1; l < VDEV_LABELS; l++) { 1226 zio_nowait(zio_read_phys(pio, vd, 1227 vdev_label_offset(vd->vdev_psize, l, 1228 offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, 1229 abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), 1230 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1231 ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); 1232 } 1233 1234 if (zio == NULL) 1235 return (pio); 1236 1237 zio_nowait(pio); 1238 return (NULL); 1239} 1240 1241static void 1242vdev_open_child(void arg) 1243{ 1244* vdev_t vd = arg; 1245* 1246 vd->vdev_open_thread = curthread; 1247 vd->vdev_open_error = vdev_open(vd); 1248 vd->vdev_open_thread = NULL; 1249} 1250 1251boolean_t 1252vdev_uses_zvols(vdev_t vd) 1253{ 1254* if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, 1255 strlen(ZVOL_DIR)) == 0) 1256 return (B_TRUE); 1257 for (int c = 0; c < vd->vdev_children; c++) 1258 if (vdev_uses_zvols(vd->vdev_child[c])) 1259 return (B_TRUE); 1260 return (B_FALSE); 1261} 1262 1263void 1264vdev_open_children(vdev_t vd) 1265{ 1266* taskq_t tq; 1267* int children = vd->vdev_children; 1268 1269 /* 1270 * in order to handle pools on top of zvols, do the opens 1271 * in a single thread so that the same thread holds the 1272 * spa_namespace_lock 1273 / 1274* if (B_TRUE \|\| vdev_uses_zvols(vd)) { 1275 for (int c = 0; c < children; c++) 1276 vd->vdev_child[c]->vdev_open_error = 1277 vdev_open(vd->vdev_child[c]); 1278 return; 1279 } 1280 tq = taskq_create("vdev_open", children, minclsyspri, 1281 children, children, TASKQ_PREPOPULATE); 1282 1283 for (int c = 0; c < children; c++) 1284 VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], 1285 TQ_SLEEP) != 0); 1286 1287 taskq_destroy(tq); 1288} 1289 1290/* 1291 * Compute the raidz-deflation ratio. Note, we hard-code 1292 * in 128k (1 << 17) because it is the "typical" blocksize. 1293 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, 1294 * otherwise it would inconsistently account for existing bp's. 1295 / 1296static void 1297vdev_set_deflate_ratio(vdev_t vd) 1298{ 1299 if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { 1300 vd->vdev_deflate_ratio = (1 << 17) / 1301 (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); 1302 } 1303} 1304 1305/* 1306 * Prepare a virtual device for access. 1307 / 1308int 1309vdev_open(vdev_t vd) 1310{ 1311 spa_t spa = vd->vdev_spa; 1312* int error; 1313 uint64_t osize = 0; 1314 uint64_t max_osize = 0; 1315 uint64_t asize, max_asize, psize; 1316 uint64_t logical_ashift = 0; 1317 uint64_t physical_ashift = 0; 1318 1319 ASSERT(vd->vdev_open_thread == curthread \|\| 1320 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1321 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED \|\| 1322 vd->vdev_state == VDEV_STATE_CANT_OPEN \|\| 1323 vd->vdev_state == VDEV_STATE_OFFLINE); 1324 1325 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1326 vd->vdev_cant_read = B_FALSE; 1327 vd->vdev_cant_write = B_FALSE; 1328 vd->vdev_notrim = B_FALSE; 1329 vd->vdev_min_asize = vdev_get_min_asize(vd); 1330 1331 /* 1332 * If this vdev is not removed, check its fault status. If it's 1333 * faulted, bail out of the open. 1334 / 1335* if (!vd->vdev_removed && vd->vdev_faulted) { 1336 ASSERT(vd->vdev_children == 0); 1337 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED \|\| 1338 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1339 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1340 vd->vdev_label_aux); 1341 return (SET_ERROR(ENXIO)); 1342 } else if (vd->vdev_offline) { 1343 ASSERT(vd->vdev_children == 0); 1344 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 1345 return (SET_ERROR(ENXIO)); 1346 } 1347 1348 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, 1349 &logical_ashift, &physical_ashift); 1350 1351 /* 1352 * Reset the vdev_reopening flag so that we actually close 1353 * the vdev on error. 1354 / 1355* vd->vdev_reopening = B_FALSE; 1356 if (zio_injection_enabled && error == 0) 1357 error = zio_handle_device_injection(vd, NULL, ENXIO); 1358 1359 if (error) { 1360 if (vd->vdev_removed && 1361 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 1362 vd->vdev_removed = B_FALSE; 1363 1364 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1365 vd->vdev_stat.vs_aux); 1366 return (error); 1367 } 1368 1369 vd->vdev_removed = B_FALSE; 1370 1371 /* 1372 * Recheck the faulted flag now that we have confirmed that 1373 * the vdev is accessible. If we're faulted, bail. 1374 / 1375* if (vd->vdev_faulted) { 1376 ASSERT(vd->vdev_children == 0); 1377 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED \|\| 1378 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1379 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1380 vd->vdev_label_aux); 1381 return (SET_ERROR(ENXIO)); 1382 } 1383 1384 if (vd->vdev_degraded) { 1385 ASSERT(vd->vdev_children == 0); 1386 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1387 VDEV_AUX_ERR_EXCEEDED); 1388 } else { 1389 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); 1390 } 1391 1392 /* 1393 * For hole or missing vdevs we just return success. 1394 / 1395* if (vd->vdev_ishole \|\| vd->vdev_ops == &vdev_missing_ops) 1396 return (0); 1397 1398 if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf) 1399 trim_map_create(vd); 1400 1401 for (int c = 0; c < vd->vdev_children; c++) { 1402 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 1403 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1404 VDEV_AUX_NONE); 1405 break; 1406 } 1407 } 1408 1409 osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 1410 max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); 1411 1412 if (vd->vdev_children == 0) { 1413 if (osize < SPA_MINDEVSIZE) { 1414 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1415 VDEV_AUX_TOO_SMALL); 1416 return (SET_ERROR(EOVERFLOW)); 1417 } 1418 psize = osize; 1419 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 1420 max_asize = max_osize - (VDEV_LABEL_START_SIZE + 1421 VDEV_LABEL_END_SIZE); 1422 } else { 1423 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 1424 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 1425 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1426 VDEV_AUX_TOO_SMALL); 1427 return (SET_ERROR(EOVERFLOW)); 1428 } 1429 psize = 0; 1430 asize = osize; 1431 max_asize = max_osize; 1432 } 1433 1434 vd->vdev_psize = psize; 1435 1436 /* 1437 * Make sure the allocatable size hasn't shrunk too much. 1438 / 1439* if (asize < vd->vdev_min_asize) { 1440 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1441 VDEV_AUX_BAD_LABEL); 1442 return (SET_ERROR(EINVAL)); 1443 } 1444 1445 vd->vdev_physical_ashift = 1446 MAX(physical_ashift, vd->vdev_physical_ashift); 1447 vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); 1448 vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); 1449 1450 if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { 1451 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1452 VDEV_AUX_ASHIFT_TOO_BIG); 1453 return (EINVAL); 1454 } 1455 1456 if (vd->vdev_asize == 0) { 1457 /* 1458 * This is the first-ever open, so use the computed values. 1459 * For testing purposes, a higher ashift can be requested. 1460 / 1461* vd->vdev_asize = asize; 1462 vd->vdev_max_asize = max_asize; 1463 } else { 1464 /* 1465 * Make sure the alignment requirement hasn't increased. 1466 / 1467* if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && 1468 vd->vdev_ops->vdev_op_leaf) { 1469 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1470 VDEV_AUX_BAD_LABEL); 1471 return (EINVAL); 1472 } 1473 vd->vdev_max_asize = max_asize; 1474 } 1475 1476 /* 1477 * If all children are healthy we update asize if either: 1478 * The asize has increased, due to a device expansion caused by dynamic 1479 * LUN growth or vdev replacement, and automatic expansion is enabled; 1480 * making the additional space available. 1481 * 1482 * The asize has decreased, due to a device shrink usually caused by a 1483 * vdev replace with a smaller device. This ensures that calculations 1484 * based of max_asize and asize e.g. esize are always valid. It's safe 1485 * to do this as we've already validated that asize is greater than 1486 * vdev_min_asize. 1487 / 1488* if (vd->vdev_state == VDEV_STATE_HEALTHY && 1489 ((asize > vd->vdev_asize && 1490 (vd->vdev_expanding \|\| spa->spa_autoexpand)) \|\| 1491 (asize < vd->vdev_asize))) 1492 vd->vdev_asize = asize; 1493 1494 vdev_set_min_asize(vd); 1495 1496 /* 1497 * Ensure we can issue some IO before declaring the 1498 * vdev open for business. 1499 / 1500* if (vd->vdev_ops->vdev_op_leaf && 1501 (error = zio_wait(vdev_probe(vd, NULL))) != 0) { 1502 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1503 VDEV_AUX_ERR_EXCEEDED); 1504 return (error); 1505 } 1506 1507 if (vd->vdev_top == vd && vd->vdev_ashift != 0 && 1508 !vd->vdev_isl2cache && !vd->vdev_islog) { 1509 if (vd->vdev_ashift > spa->spa_max_ashift) 1510 spa->spa_max_ashift = vd->vdev_ashift; 1511 if (vd->vdev_ashift < spa->spa_min_ashift) 1512 spa->spa_min_ashift = vd->vdev_ashift; 1513 } 1514 1515 /* 1516 * Track the min and max ashift values for normal data devices. 1517 / 1518* if (vd->vdev_top == vd && vd->vdev_ashift != 0 && 1519 !vd->vdev_islog && vd->vdev_aux == NULL) { 1520 if (vd->vdev_ashift > spa->spa_max_ashift) 1521 spa->spa_max_ashift = vd->vdev_ashift; 1522 if (vd->vdev_ashift < spa->spa_min_ashift) 1523 spa->spa_min_ashift = vd->vdev_ashift; 1524 } 1525 1526 /* 1527 * If a leaf vdev has a DTL, and seems healthy, then kick off a 1528 * resilver. But don't do this if we are doing a reopen for a scrub, 1529 * since this would just restart the scrub we are already doing. 1530 / 1531* if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && 1532 vdev_resilver_needed(vd, NULL, NULL)) 1533 spa_async_request(spa, SPA_ASYNC_RESILVER); 1534 1535 return (0); 1536} 1537 1538/* 1539 * Called once the vdevs are all opened, this routine validates the label 1540 * contents. This needs to be done before vdev_load() so that we don't 1541 * inadvertently do repair I/Os to the wrong device. 1542 * 1543 * If 'strict' is false ignore the spa guid check. This is necessary because 1544 * if the machine crashed during a re-guid the new guid might have been written 1545 * to all of the vdev labels, but not the cached config. The strict check 1546 * will be performed when the pool is opened again using the mos config. 1547 * 1548 * This function will only return failure if one of the vdevs indicates that it 1549 * has since been destroyed or exported. This is only possible if 1550 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 1551 * will be updated but the function will return 0. 1552 / 1553int 1554vdev_validate(vdev_t vd, boolean_t strict) 1555{ 1556 spa_t spa = vd->vdev_spa; 1557* nvlist_t label; 1558* uint64_t guid = 0, top_guid; 1559 uint64_t state; 1560 1561 for (int c = 0; c < vd->vdev_children; c++) 1562 if (vdev_validate(vd->vdev_child[c], strict) != 0) 1563 return (SET_ERROR(EBADF)); 1564 1565 /* 1566 * If the device has already failed, or was marked offline, don't do 1567 * any further validation. Otherwise, label I/O will fail and we will 1568 * overwrite the previous state. 1569 / 1570* if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { 1571 uint64_t aux_guid = 0; 1572 nvlist_t nvl; 1573* uint64_t txg = spa_last_synced_txg(spa) != 0 ? 1574 spa_last_synced_txg(spa) : -1ULL; 1575 1576 if ((label = vdev_label_read_config(vd, txg)) == NULL) { 1577 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1578 VDEV_AUX_BAD_LABEL);	1151 zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 1152 spa, vd, NULL, 0, 0); 1153 zio->io_error = SET_ERROR(ENXIO); 1154 } 1155 1156 mutex_enter(&vd->vdev_probe_lock); 1157 ASSERT(vd->vdev_probe_zio == zio); 1158 vd->vdev_probe_zio = NULL; 1159 mutex_exit(&vd->vdev_probe_lock); 1160 1161 zio_link_t zl = NULL; 1162* while ((pio = zio_walk_parents(zio, &zl)) != NULL) 1163 if (!vdev_accessible(vd, pio)) 1164 pio->io_error = SET_ERROR(ENXIO); 1165 1166 kmem_free(vps, sizeof (vps)); 1167* } 1168} 1169 1170/* 1171 * Determine whether this device is accessible. 1172 * 1173 * Read and write to several known locations: the pad regions of each 1174 * vdev label but the first, which we leave alone in case it contains 1175 * a VTOC. 1176 / 1177zio_t 1178vdev_probe(vdev_t vd, zio_t zio) 1179{ 1180 spa_t spa = vd->vdev_spa; 1181* vdev_probe_stats_t vps = NULL; 1182* zio_t pio; 1183* 1184 ASSERT(vd->vdev_ops->vdev_op_leaf); 1185 1186 /* 1187 * Don't probe the probe. 1188 / 1189* if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) 1190 return (NULL); 1191 1192 /* 1193 * To prevent 'probe storms' when a device fails, we create 1194 * just one probe i/o at a time. All zios that want to probe 1195 * this vdev will become parents of the probe io. 1196 / 1197* mutex_enter(&vd->vdev_probe_lock); 1198 1199 if ((pio = vd->vdev_probe_zio) == NULL) { 1200 vps = kmem_zalloc(sizeof (vps), KM_SLEEP); 1201* 1202 vps->vps_flags = ZIO_FLAG_CANFAIL \| ZIO_FLAG_PROBE \| 1203 ZIO_FLAG_DONT_CACHE \| ZIO_FLAG_DONT_AGGREGATE \| 1204 ZIO_FLAG_TRYHARD; 1205 1206 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { 1207 /* 1208 * vdev_cant_read and vdev_cant_write can only 1209 * transition from TRUE to FALSE when we have the 1210 * SCL_ZIO lock as writer; otherwise they can only 1211 * transition from FALSE to TRUE. This ensures that 1212 * any zio looking at these values can assume that 1213 * failures persist for the life of the I/O. That's 1214 * important because when a device has intermittent 1215 * connectivity problems, we want to ensure that 1216 * they're ascribed to the device (ENXIO) and not 1217 * the zio (EIO). 1218 * 1219 * Since we hold SCL_ZIO as writer here, clear both 1220 * values so the probe can reevaluate from first 1221 * principles. 1222 / 1223* vps->vps_flags \|= ZIO_FLAG_CONFIG_WRITER; 1224 vd->vdev_cant_read = B_FALSE; 1225 vd->vdev_cant_write = B_FALSE; 1226 } 1227 1228 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, 1229 vdev_probe_done, vps, 1230 vps->vps_flags \| ZIO_FLAG_DONT_PROPAGATE); 1231 1232 /* 1233 * We can't change the vdev state in this context, so we 1234 * kick off an async task to do it on our behalf. 1235 / 1236* if (zio != NULL) { 1237 vd->vdev_probe_wanted = B_TRUE; 1238 spa_async_request(spa, SPA_ASYNC_PROBE); 1239 } 1240 } 1241 1242 if (zio != NULL) 1243 zio_add_child(zio, pio); 1244 1245 mutex_exit(&vd->vdev_probe_lock); 1246 1247 if (vps == NULL) { 1248 ASSERT(zio != NULL); 1249 return (NULL); 1250 } 1251 1252 for (int l = 1; l < VDEV_LABELS; l++) { 1253 zio_nowait(zio_read_phys(pio, vd, 1254 vdev_label_offset(vd->vdev_psize, l, 1255 offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, 1256 abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), 1257 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1258 ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); 1259 } 1260 1261 if (zio == NULL) 1262 return (pio); 1263 1264 zio_nowait(pio); 1265 return (NULL); 1266} 1267 1268static void 1269vdev_open_child(void arg) 1270{ 1271* vdev_t vd = arg; 1272* 1273 vd->vdev_open_thread = curthread; 1274 vd->vdev_open_error = vdev_open(vd); 1275 vd->vdev_open_thread = NULL; 1276} 1277 1278boolean_t 1279vdev_uses_zvols(vdev_t vd) 1280{ 1281* if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, 1282 strlen(ZVOL_DIR)) == 0) 1283 return (B_TRUE); 1284 for (int c = 0; c < vd->vdev_children; c++) 1285 if (vdev_uses_zvols(vd->vdev_child[c])) 1286 return (B_TRUE); 1287 return (B_FALSE); 1288} 1289 1290void 1291vdev_open_children(vdev_t vd) 1292{ 1293* taskq_t tq; 1294* int children = vd->vdev_children; 1295 1296 /* 1297 * in order to handle pools on top of zvols, do the opens 1298 * in a single thread so that the same thread holds the 1299 * spa_namespace_lock 1300 / 1301* if (B_TRUE \|\| vdev_uses_zvols(vd)) { 1302 for (int c = 0; c < children; c++) 1303 vd->vdev_child[c]->vdev_open_error = 1304 vdev_open(vd->vdev_child[c]); 1305 return; 1306 } 1307 tq = taskq_create("vdev_open", children, minclsyspri, 1308 children, children, TASKQ_PREPOPULATE); 1309 1310 for (int c = 0; c < children; c++) 1311 VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], 1312 TQ_SLEEP) != 0); 1313 1314 taskq_destroy(tq); 1315} 1316 1317/* 1318 * Compute the raidz-deflation ratio. Note, we hard-code 1319 * in 128k (1 << 17) because it is the "typical" blocksize. 1320 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, 1321 * otherwise it would inconsistently account for existing bp's. 1322 / 1323static void 1324vdev_set_deflate_ratio(vdev_t vd) 1325{ 1326 if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { 1327 vd->vdev_deflate_ratio = (1 << 17) / 1328 (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); 1329 } 1330} 1331 1332/* 1333 * Prepare a virtual device for access. 1334 / 1335int 1336vdev_open(vdev_t vd) 1337{ 1338 spa_t spa = vd->vdev_spa; 1339* int error; 1340 uint64_t osize = 0; 1341 uint64_t max_osize = 0; 1342 uint64_t asize, max_asize, psize; 1343 uint64_t logical_ashift = 0; 1344 uint64_t physical_ashift = 0; 1345 1346 ASSERT(vd->vdev_open_thread == curthread \|\| 1347 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1348 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED \|\| 1349 vd->vdev_state == VDEV_STATE_CANT_OPEN \|\| 1350 vd->vdev_state == VDEV_STATE_OFFLINE); 1351 1352 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1353 vd->vdev_cant_read = B_FALSE; 1354 vd->vdev_cant_write = B_FALSE; 1355 vd->vdev_notrim = B_FALSE; 1356 vd->vdev_min_asize = vdev_get_min_asize(vd); 1357 1358 /* 1359 * If this vdev is not removed, check its fault status. If it's 1360 * faulted, bail out of the open. 1361 / 1362* if (!vd->vdev_removed && vd->vdev_faulted) { 1363 ASSERT(vd->vdev_children == 0); 1364 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED \|\| 1365 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1366 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1367 vd->vdev_label_aux); 1368 return (SET_ERROR(ENXIO)); 1369 } else if (vd->vdev_offline) { 1370 ASSERT(vd->vdev_children == 0); 1371 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 1372 return (SET_ERROR(ENXIO)); 1373 } 1374 1375 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, 1376 &logical_ashift, &physical_ashift); 1377 1378 /* 1379 * Reset the vdev_reopening flag so that we actually close 1380 * the vdev on error. 1381 / 1382* vd->vdev_reopening = B_FALSE; 1383 if (zio_injection_enabled && error == 0) 1384 error = zio_handle_device_injection(vd, NULL, ENXIO); 1385 1386 if (error) { 1387 if (vd->vdev_removed && 1388 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 1389 vd->vdev_removed = B_FALSE; 1390 1391 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1392 vd->vdev_stat.vs_aux); 1393 return (error); 1394 } 1395 1396 vd->vdev_removed = B_FALSE; 1397 1398 /* 1399 * Recheck the faulted flag now that we have confirmed that 1400 * the vdev is accessible. If we're faulted, bail. 1401 / 1402* if (vd->vdev_faulted) { 1403 ASSERT(vd->vdev_children == 0); 1404 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED \|\| 1405 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1406 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1407 vd->vdev_label_aux); 1408 return (SET_ERROR(ENXIO)); 1409 } 1410 1411 if (vd->vdev_degraded) { 1412 ASSERT(vd->vdev_children == 0); 1413 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1414 VDEV_AUX_ERR_EXCEEDED); 1415 } else { 1416 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); 1417 } 1418 1419 /* 1420 * For hole or missing vdevs we just return success. 1421 / 1422* if (vd->vdev_ishole \|\| vd->vdev_ops == &vdev_missing_ops) 1423 return (0); 1424 1425 if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf) 1426 trim_map_create(vd); 1427 1428 for (int c = 0; c < vd->vdev_children; c++) { 1429 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 1430 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1431 VDEV_AUX_NONE); 1432 break; 1433 } 1434 } 1435 1436 osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 1437 max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); 1438 1439 if (vd->vdev_children == 0) { 1440 if (osize < SPA_MINDEVSIZE) { 1441 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1442 VDEV_AUX_TOO_SMALL); 1443 return (SET_ERROR(EOVERFLOW)); 1444 } 1445 psize = osize; 1446 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 1447 max_asize = max_osize - (VDEV_LABEL_START_SIZE + 1448 VDEV_LABEL_END_SIZE); 1449 } else { 1450 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 1451 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 1452 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1453 VDEV_AUX_TOO_SMALL); 1454 return (SET_ERROR(EOVERFLOW)); 1455 } 1456 psize = 0; 1457 asize = osize; 1458 max_asize = max_osize; 1459 } 1460 1461 vd->vdev_psize = psize; 1462 1463 /* 1464 * Make sure the allocatable size hasn't shrunk too much. 1465 / 1466* if (asize < vd->vdev_min_asize) { 1467 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1468 VDEV_AUX_BAD_LABEL); 1469 return (SET_ERROR(EINVAL)); 1470 } 1471 1472 vd->vdev_physical_ashift = 1473 MAX(physical_ashift, vd->vdev_physical_ashift); 1474 vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); 1475 vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); 1476 1477 if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { 1478 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1479 VDEV_AUX_ASHIFT_TOO_BIG); 1480 return (EINVAL); 1481 } 1482 1483 if (vd->vdev_asize == 0) { 1484 /* 1485 * This is the first-ever open, so use the computed values. 1486 * For testing purposes, a higher ashift can be requested. 1487 / 1488* vd->vdev_asize = asize; 1489 vd->vdev_max_asize = max_asize; 1490 } else { 1491 /* 1492 * Make sure the alignment requirement hasn't increased. 1493 / 1494* if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && 1495 vd->vdev_ops->vdev_op_leaf) { 1496 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1497 VDEV_AUX_BAD_LABEL); 1498 return (EINVAL); 1499 } 1500 vd->vdev_max_asize = max_asize; 1501 } 1502 1503 /* 1504 * If all children are healthy we update asize if either: 1505 * The asize has increased, due to a device expansion caused by dynamic 1506 * LUN growth or vdev replacement, and automatic expansion is enabled; 1507 * making the additional space available. 1508 * 1509 * The asize has decreased, due to a device shrink usually caused by a 1510 * vdev replace with a smaller device. This ensures that calculations 1511 * based of max_asize and asize e.g. esize are always valid. It's safe 1512 * to do this as we've already validated that asize is greater than 1513 * vdev_min_asize. 1514 / 1515* if (vd->vdev_state == VDEV_STATE_HEALTHY && 1516 ((asize > vd->vdev_asize && 1517 (vd->vdev_expanding \|\| spa->spa_autoexpand)) \|\| 1518 (asize < vd->vdev_asize))) 1519 vd->vdev_asize = asize; 1520 1521 vdev_set_min_asize(vd); 1522 1523 /* 1524 * Ensure we can issue some IO before declaring the 1525 * vdev open for business. 1526 / 1527* if (vd->vdev_ops->vdev_op_leaf && 1528 (error = zio_wait(vdev_probe(vd, NULL))) != 0) { 1529 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1530 VDEV_AUX_ERR_EXCEEDED); 1531 return (error); 1532 } 1533 1534 if (vd->vdev_top == vd && vd->vdev_ashift != 0 && 1535 !vd->vdev_isl2cache && !vd->vdev_islog) { 1536 if (vd->vdev_ashift > spa->spa_max_ashift) 1537 spa->spa_max_ashift = vd->vdev_ashift; 1538 if (vd->vdev_ashift < spa->spa_min_ashift) 1539 spa->spa_min_ashift = vd->vdev_ashift; 1540 } 1541 1542 /* 1543 * Track the min and max ashift values for normal data devices. 1544 / 1545* if (vd->vdev_top == vd && vd->vdev_ashift != 0 && 1546 !vd->vdev_islog && vd->vdev_aux == NULL) { 1547 if (vd->vdev_ashift > spa->spa_max_ashift) 1548 spa->spa_max_ashift = vd->vdev_ashift; 1549 if (vd->vdev_ashift < spa->spa_min_ashift) 1550 spa->spa_min_ashift = vd->vdev_ashift; 1551 } 1552 1553 /* 1554 * If a leaf vdev has a DTL, and seems healthy, then kick off a 1555 * resilver. But don't do this if we are doing a reopen for a scrub, 1556 * since this would just restart the scrub we are already doing. 1557 / 1558* if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && 1559 vdev_resilver_needed(vd, NULL, NULL)) 1560 spa_async_request(spa, SPA_ASYNC_RESILVER); 1561 1562 return (0); 1563} 1564 1565/* 1566 * Called once the vdevs are all opened, this routine validates the label 1567 * contents. This needs to be done before vdev_load() so that we don't 1568 * inadvertently do repair I/Os to the wrong device. 1569 * 1570 * If 'strict' is false ignore the spa guid check. This is necessary because 1571 * if the machine crashed during a re-guid the new guid might have been written 1572 * to all of the vdev labels, but not the cached config. The strict check 1573 * will be performed when the pool is opened again using the mos config. 1574 * 1575 * This function will only return failure if one of the vdevs indicates that it 1576 * has since been destroyed or exported. This is only possible if 1577 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 1578 * will be updated but the function will return 0. 1579 / 1580int 1581vdev_validate(vdev_t vd, boolean_t strict) 1582{ 1583 spa_t spa = vd->vdev_spa; 1584* nvlist_t label; 1585* uint64_t guid = 0, top_guid; 1586 uint64_t state; 1587 1588 for (int c = 0; c < vd->vdev_children; c++) 1589 if (vdev_validate(vd->vdev_child[c], strict) != 0) 1590 return (SET_ERROR(EBADF)); 1591 1592 /* 1593 * If the device has already failed, or was marked offline, don't do 1594 * any further validation. Otherwise, label I/O will fail and we will 1595 * overwrite the previous state. 1596 / 1597* if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { 1598 uint64_t aux_guid = 0; 1599 nvlist_t nvl; 1600* uint64_t txg = spa_last_synced_txg(spa) != 0 ? 1601 spa_last_synced_txg(spa) : -1ULL; 1602 1603 if ((label = vdev_label_read_config(vd, txg)) == NULL) { 1604 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1605 VDEV_AUX_BAD_LABEL);
	1606 vdev_dbgmsg(vd, "vdev_validate: failed reading config");
1579 return (0); 1580 } 1581 1582 /* 1583 * Determine if this vdev has been split off into another 1584 * pool. If so, then refuse to open it. 1585 / 1586* if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, 1587 &aux_guid) == 0 && aux_guid == spa_guid(spa)) { 1588 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1589 VDEV_AUX_SPLIT_POOL); 1590 nvlist_free(label);	1607 return (0); 1608 } 1609 1610 /* 1611 * Determine if this vdev has been split off into another 1612 * pool. If so, then refuse to open it. 1613 / 1614* if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, 1615 &aux_guid) == 0 && aux_guid == spa_guid(spa)) { 1616 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1617 VDEV_AUX_SPLIT_POOL); 1618 nvlist_free(label);
	1619 vdev_dbgmsg(vd, "vdev_validate: vdev split into other " 1620 "pool");
1591 return (0); 1592 } 1593 1594 if (strict && (nvlist_lookup_uint64(label, 1595 ZPOOL_CONFIG_POOL_GUID, &guid) != 0 \|\| 1596 guid != spa_guid(spa))) { 1597 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1598 VDEV_AUX_CORRUPT_DATA); 1599 nvlist_free(label);	1621 return (0); 1622 } 1623 1624 if (strict && (nvlist_lookup_uint64(label, 1625 ZPOOL_CONFIG_POOL_GUID, &guid) != 0 \|\| 1626 guid != spa_guid(spa))) { 1627 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1628 VDEV_AUX_CORRUPT_DATA); 1629 nvlist_free(label);
	1630 vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid " 1631 "doesn't match config (%llu != %llu)", 1632 (u_longlong_t)guid, 1633 (u_longlong_t)spa_guid(spa));
1600 return (0); 1601 } 1602 1603 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) 1604 != 0 \|\| nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, 1605 &aux_guid) != 0) 1606 aux_guid = 0; 1607 1608 /* 1609 * If this vdev just became a top-level vdev because its 1610 * sibling was detached, it will have adopted the parent's 1611 * vdev guid -- but the label may or may not be on disk yet. 1612 * Fortunately, either version of the label will have the 1613 * same top guid, so if we're a top-level vdev, we can 1614 * safely compare to that instead. 1615 * 1616 * If we split this vdev off instead, then we also check the 1617 * original pool's guid. We don't want to consider the vdev 1618 * corrupt if it is partway through a split operation. 1619 / 1620* if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, 1621 &guid) != 0 \|\| 1622 nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, 1623 &top_guid) != 0 \|\| 1624 ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && 1625 (vd->vdev_guid != top_guid \|\| vd != vd->vdev_top))) { 1626 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1627 VDEV_AUX_CORRUPT_DATA); 1628 nvlist_free(label);	1634 return (0); 1635 } 1636 1637 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) 1638 != 0 \|\| nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, 1639 &aux_guid) != 0) 1640 aux_guid = 0; 1641 1642 /* 1643 * If this vdev just became a top-level vdev because its 1644 * sibling was detached, it will have adopted the parent's 1645 * vdev guid -- but the label may or may not be on disk yet. 1646 * Fortunately, either version of the label will have the 1647 * same top guid, so if we're a top-level vdev, we can 1648 * safely compare to that instead. 1649 * 1650 * If we split this vdev off instead, then we also check the 1651 * original pool's guid. We don't want to consider the vdev 1652 * corrupt if it is partway through a split operation. 1653 / 1654* if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, 1655 &guid) != 0 \|\| 1656 nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, 1657 &top_guid) != 0 \|\| 1658 ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && 1659 (vd->vdev_guid != top_guid \|\| vd != vd->vdev_top))) { 1660 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1661 VDEV_AUX_CORRUPT_DATA); 1662 nvlist_free(label);
	1663 vdev_dbgmsg(vd, "vdev_validate: config guid doesn't " 1664 "match label guid (%llu != %llu)", 1665 (u_longlong_t)vd->vdev_guid, (u_longlong_t)guid);
1629 return (0); 1630 } 1631 1632 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 1633 &state) != 0) { 1634 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1635 VDEV_AUX_CORRUPT_DATA); 1636 nvlist_free(label);	1666 return (0); 1667 } 1668 1669 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 1670 &state) != 0) { 1671 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1672 VDEV_AUX_CORRUPT_DATA); 1673 nvlist_free(label);
	1674 vdev_dbgmsg(vd, "vdev_validate: '%s' missing", 1675 ZPOOL_CONFIG_POOL_STATE);
1637 return (0); 1638 } 1639 1640 nvlist_free(label); 1641 1642 /* 1643 * If this is a verbatim import, no need to check the 1644 * state of the pool. 1645 / 1646* if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && 1647 spa_load_state(spa) == SPA_LOAD_OPEN &&	1676 return (0); 1677 } 1678 1679 nvlist_free(label); 1680 1681 /* 1682 * If this is a verbatim import, no need to check the 1683 * state of the pool. 1684 / 1685* if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && 1686 spa_load_state(spa) == SPA_LOAD_OPEN &&
1648 state != POOL_STATE_ACTIVE)	1687 state != POOL_STATE_ACTIVE) { 1688 vdev_dbgmsg(vd, "vdev_validate: invalid pool state " 1689 "(%llu) for spa %s", (u_longlong_t)state, 1690 spa->spa_name);
1649 return (SET_ERROR(EBADF));	1691 return (SET_ERROR(EBADF));
	1692 }
1650 1651 /* 1652 * If we were able to open and validate a vdev that was 1653 * previously marked permanently unavailable, clear that state 1654 * now. 1655 / 1656* if (vd->vdev_not_present) 1657 vd->vdev_not_present = 0; 1658 } 1659 1660 return (0); 1661} 1662 1663/* 1664 * Close a virtual device. 1665 / 1666void 1667vdev_close(vdev_t vd) 1668{ 1669 spa_t spa = vd->vdev_spa; 1670* vdev_t pvd = vd->vdev_parent; 1671* 1672 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1673 1674 /* 1675 * If our parent is reopening, then we are as well, unless we are 1676 * going offline. 1677 / 1678* if (pvd != NULL && pvd->vdev_reopening) 1679 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); 1680 1681 vd->vdev_ops->vdev_op_close(vd); 1682 1683 vdev_cache_purge(vd); 1684 1685 if (vd->vdev_ops->vdev_op_leaf) 1686 trim_map_destroy(vd); 1687 1688 /* 1689 * We record the previous state before we close it, so that if we are 1690 * doing a reopen(), we don't generate FMA ereports if we notice that 1691 * it's still faulted. 1692 / 1693* vd->vdev_prevstate = vd->vdev_state; 1694 1695 if (vd->vdev_offline) 1696 vd->vdev_state = VDEV_STATE_OFFLINE; 1697 else 1698 vd->vdev_state = VDEV_STATE_CLOSED; 1699 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1700} 1701 1702void 1703vdev_hold(vdev_t vd) 1704{ 1705* spa_t spa = vd->vdev_spa; 1706* 1707 ASSERT(spa_is_root(spa)); 1708 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 1709 return; 1710 1711 for (int c = 0; c < vd->vdev_children; c++) 1712 vdev_hold(vd->vdev_child[c]); 1713 1714 if (vd->vdev_ops->vdev_op_leaf) 1715 vd->vdev_ops->vdev_op_hold(vd); 1716} 1717 1718void 1719vdev_rele(vdev_t vd) 1720{ 1721* spa_t spa = vd->vdev_spa; 1722* 1723 ASSERT(spa_is_root(spa)); 1724 for (int c = 0; c < vd->vdev_children; c++) 1725 vdev_rele(vd->vdev_child[c]); 1726 1727 if (vd->vdev_ops->vdev_op_leaf) 1728 vd->vdev_ops->vdev_op_rele(vd); 1729} 1730 1731/* 1732 * Reopen all interior vdevs and any unopened leaves. We don't actually 1733 * reopen leaf vdevs which had previously been opened as they might deadlock 1734 * on the spa_config_lock. Instead we only obtain the leaf's physical size. 1735 * If the leaf has never been opened then open it, as usual. 1736 / 1737void 1738vdev_reopen(vdev_t vd) 1739{ 1740 spa_t spa = vd->vdev_spa; 1741* 1742 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1743 1744 /* set the reopening flag unless we're taking the vdev offline / 1745* vd->vdev_reopening = !vd->vdev_offline; 1746 vdev_close(vd); 1747 (void) vdev_open(vd); 1748 1749 /* 1750 * Call vdev_validate() here to make sure we have the same device. 1751 * Otherwise, a device with an invalid label could be successfully 1752 * opened in response to vdev_reopen(). 1753 / 1754* if (vd->vdev_aux) { 1755 (void) vdev_validate_aux(vd); 1756 if (vdev_readable(vd) && vdev_writeable(vd) && 1757 vd->vdev_aux == &spa->spa_l2cache && 1758 !l2arc_vdev_present(vd)) 1759 l2arc_add_vdev(spa, vd); 1760 } else { 1761 (void) vdev_validate(vd, B_TRUE); 1762 } 1763 1764 /* 1765 * Reassess parent vdev's health. 1766 / 1767* vdev_propagate_state(vd); 1768} 1769 1770int 1771vdev_create(vdev_t vd, uint64_t txg, boolean_t isreplacing) 1772{ 1773* int error; 1774 1775 /* 1776 * Normally, partial opens (e.g. of a mirror) are allowed. 1777 * For a create, however, we want to fail the request if 1778 * there are any components we can't open. 1779 / 1780* error = vdev_open(vd); 1781 1782 if (error \|\| vd->vdev_state != VDEV_STATE_HEALTHY) { 1783 vdev_close(vd); 1784 return (error ? error : ENXIO); 1785 } 1786 1787 /* 1788 * Recursively load DTLs and initialize all labels. 1789 / 1790* if ((error = vdev_dtl_load(vd)) != 0 \|\| 1791 (error = vdev_label_init(vd, txg, isreplacing ? 1792 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1793 vdev_close(vd); 1794 return (error); 1795 } 1796 1797 return (0); 1798} 1799 1800void 1801vdev_metaslab_set_size(vdev_t vd) 1802{ 1803* /* 1804 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. 1805 / 1806* vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); 1807 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1808} 1809 1810/* 1811 * Maximize performance by inflating the configured ashift for top level 1812 * vdevs to be as close to the physical ashift as possible while maintaining 1813 * administrator defined limits and ensuring it doesn't go below the 1814 * logical ashift. 1815 / 1816void 1817vdev_ashift_optimize(vdev_t vd) 1818{ 1819 if (vd == vd->vdev_top) { 1820 if (vd->vdev_ashift < vd->vdev_physical_ashift) { 1821 vd->vdev_ashift = MIN( 1822 MAX(zfs_max_auto_ashift, vd->vdev_ashift), 1823 MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); 1824 } else { 1825 /* 1826 * Unusual case where logical ashift > physical ashift 1827 * so we can't cap the calculated ashift based on max 1828 * ashift as that would cause failures. 1829 * We still check if we need to increase it to match 1830 * the min ashift. 1831 / 1832* vd->vdev_ashift = MAX(zfs_min_auto_ashift, 1833 vd->vdev_ashift); 1834 } 1835 } 1836} 1837 1838void 1839vdev_dirty(vdev_t vd, int flags, void arg, uint64_t txg) 1840{ 1841 ASSERT(vd == vd->vdev_top); 1842 /* indirect vdevs don't have metaslabs or dtls / 1843* ASSERT(vdev_is_concrete(vd) \|\| flags == 0); 1844 ASSERT(ISP2(flags)); 1845 ASSERT(spa_writeable(vd->vdev_spa)); 1846 1847 if (flags & VDD_METASLAB) 1848 (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 1849 1850 if (flags & VDD_DTL) 1851 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 1852 1853 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 1854} 1855 1856void 1857vdev_dirty_leaves(vdev_t vd, int flags, uint64_t txg) 1858{ 1859* for (int c = 0; c < vd->vdev_children; c++) 1860 vdev_dirty_leaves(vd->vdev_child[c], flags, txg); 1861 1862 if (vd->vdev_ops->vdev_op_leaf) 1863 vdev_dirty(vd->vdev_top, flags, vd, txg); 1864} 1865 1866/* 1867 * DTLs. 1868 * 1869 * A vdev's DTL (dirty time log) is the set of transaction groups for which 1870 * the vdev has less than perfect replication. There are four kinds of DTL: 1871 * 1872 * DTL_MISSING: txgs for which the vdev has no valid copies of the data 1873 * 1874 * DTL_PARTIAL: txgs for which data is available, but not fully replicated 1875 * 1876 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon 1877 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of 1878 * txgs that was scrubbed. 1879 * 1880 * DTL_OUTAGE: txgs which cannot currently be read, whether due to 1881 * persistent errors or just some device being offline. 1882 * Unlike the other three, the DTL_OUTAGE map is not generally 1883 * maintained; it's only computed when needed, typically to 1884 * determine whether a device can be detached. 1885 * 1886 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device 1887 * either has the data or it doesn't. 1888 * 1889 * For interior vdevs such as mirror and RAID-Z the picture is more complex. 1890 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because 1891 * if any child is less than fully replicated, then so is its parent. 1892 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, 1893 * comprising only those txgs which appear in 'maxfaults' or more children; 1894 * those are the txgs we don't have enough replication to read. For example, 1895 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); 1896 * thus, its DTL_MISSING consists of the set of txgs that appear in more than 1897 * two child DTL_MISSING maps. 1898 * 1899 * It should be clear from the above that to compute the DTLs and outage maps 1900 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. 1901 * Therefore, that is all we keep on disk. When loading the pool, or after 1902 * a configuration change, we generate all other DTLs from first principles. 1903 / 1904void 1905vdev_dtl_dirty(vdev_t vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1906{ 1907 range_tree_t rt = vd->vdev_dtl[t]; 1908* 1909 ASSERT(t < DTL_TYPES); 1910 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1911 ASSERT(spa_writeable(vd->vdev_spa)); 1912 1913 mutex_enter(&vd->vdev_dtl_lock); 1914 if (!range_tree_contains(rt, txg, size)) 1915 range_tree_add(rt, txg, size); 1916 mutex_exit(&vd->vdev_dtl_lock); 1917} 1918 1919boolean_t 1920vdev_dtl_contains(vdev_t vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1921{ 1922* range_tree_t rt = vd->vdev_dtl[t]; 1923* boolean_t dirty = B_FALSE; 1924 1925 ASSERT(t < DTL_TYPES); 1926 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1927 1928 /* 1929 * While we are loading the pool, the DTLs have not been loaded yet. 1930 * Ignore the DTLs and try all devices. This avoids a recursive 1931 * mutex enter on the vdev_dtl_lock, and also makes us try hard 1932 * when loading the pool (relying on the checksum to ensure that 1933 * we get the right data -- note that we while loading, we are 1934 * only reading the MOS, which is always checksummed). 1935 / 1936* if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) 1937 return (B_FALSE); 1938 1939 mutex_enter(&vd->vdev_dtl_lock); 1940 if (range_tree_space(rt) != 0) 1941 dirty = range_tree_contains(rt, txg, size); 1942 mutex_exit(&vd->vdev_dtl_lock); 1943 1944 return (dirty); 1945} 1946 1947boolean_t 1948vdev_dtl_empty(vdev_t vd, vdev_dtl_type_t t) 1949{ 1950* range_tree_t rt = vd->vdev_dtl[t]; 1951* boolean_t empty; 1952 1953 mutex_enter(&vd->vdev_dtl_lock); 1954 empty = (range_tree_space(rt) == 0); 1955 mutex_exit(&vd->vdev_dtl_lock); 1956 1957 return (empty); 1958} 1959 1960/* 1961 * Returns the lowest txg in the DTL range. 1962 / 1963static uint64_t 1964vdev_dtl_min(vdev_t vd) 1965{ 1966 range_seg_t rs; 1967* 1968 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 1969 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 1970 ASSERT0(vd->vdev_children); 1971 1972 rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); 1973 return (rs->rs_start - 1); 1974} 1975 1976/* 1977 * Returns the highest txg in the DTL. 1978 / 1979static uint64_t 1980vdev_dtl_max(vdev_t vd) 1981{ 1982 range_seg_t rs; 1983* 1984 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 1985 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 1986 ASSERT0(vd->vdev_children); 1987 1988 rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); 1989 return (rs->rs_end); 1990} 1991 1992/* 1993 * Determine if a resilvering vdev should remove any DTL entries from 1994 * its range. If the vdev was resilvering for the entire duration of the 1995 * scan then it should excise that range from its DTLs. Otherwise, this 1996 * vdev is considered partially resilvered and should leave its DTL 1997 * entries intact. The comment in vdev_dtl_reassess() describes how we 1998 * excise the DTLs. 1999 / 2000static boolean_t 2001vdev_dtl_should_excise(vdev_t vd) 2002{ 2003 spa_t spa = vd->vdev_spa; 2004* dsl_scan_t scn = spa->spa_dsl_pool->dp_scan; 2005* 2006 ASSERT0(scn->scn_phys.scn_errors); 2007 ASSERT0(vd->vdev_children); 2008 2009 if (vd->vdev_state < VDEV_STATE_DEGRADED) 2010 return (B_FALSE); 2011 2012 if (vd->vdev_resilver_txg == 0 \|\| 2013 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) 2014 return (B_TRUE); 2015 2016 /* 2017 * When a resilver is initiated the scan will assign the scn_max_txg 2018 * value to the highest txg value that exists in all DTLs. If this 2019 * device's max DTL is not part of this scan (i.e. it is not in 2020 * the range (scn_min_txg, scn_max_txg] then it is not eligible 2021 * for excision. 2022 / 2023* if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { 2024 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); 2025 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); 2026 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); 2027 return (B_TRUE); 2028 } 2029 return (B_FALSE); 2030} 2031 2032/* 2033 * Reassess DTLs after a config change or scrub completion. 2034 / 2035void 2036vdev_dtl_reassess(vdev_t vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 2037{ 2038 spa_t spa = vd->vdev_spa; 2039* avl_tree_t reftree; 2040 int minref; 2041 2042 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 2043 2044 for (int c = 0; c < vd->vdev_children; c++) 2045 vdev_dtl_reassess(vd->vdev_child[c], txg, 2046 scrub_txg, scrub_done); 2047 2048 if (vd == spa->spa_root_vdev \|\| !vdev_is_concrete(vd) \|\| vd->vdev_aux) 2049 return; 2050 2051 if (vd->vdev_ops->vdev_op_leaf) { 2052 dsl_scan_t scn = spa->spa_dsl_pool->dp_scan; 2053* 2054 mutex_enter(&vd->vdev_dtl_lock); 2055 2056 /* 2057 * If we've completed a scan cleanly then determine 2058 * if this vdev should remove any DTLs. We only want to 2059 * excise regions on vdevs that were available during 2060 * the entire duration of this scan. 2061 / 2062* if (scrub_txg != 0 && 2063 (spa->spa_scrub_started \|\| 2064 (scn != NULL && scn->scn_phys.scn_errors == 0)) && 2065 vdev_dtl_should_excise(vd)) { 2066 /* 2067 * We completed a scrub up to scrub_txg. If we 2068 * did it without rebooting, then the scrub dtl 2069 * will be valid, so excise the old region and 2070 * fold in the scrub dtl. Otherwise, leave the 2071 * dtl as-is if there was an error. 2072 * 2073 * There's little trick here: to excise the beginning 2074 * of the DTL_MISSING map, we put it into a reference 2075 * tree and then add a segment with refcnt -1 that 2076 * covers the range [0, scrub_txg). This means 2077 * that each txg in that range has refcnt -1 or 0. 2078 * We then add DTL_SCRUB with a refcnt of 2, so that 2079 * entries in the range [0, scrub_txg) will have a 2080 * positive refcnt -- either 1 or 2. We then convert 2081 * the reference tree into the new DTL_MISSING map. 2082 / 2083* space_reftree_create(&reftree); 2084 space_reftree_add_map(&reftree, 2085 vd->vdev_dtl[DTL_MISSING], 1); 2086 space_reftree_add_seg(&reftree, 0, scrub_txg, -1); 2087 space_reftree_add_map(&reftree, 2088 vd->vdev_dtl[DTL_SCRUB], 2); 2089 space_reftree_generate_map(&reftree, 2090 vd->vdev_dtl[DTL_MISSING], 1); 2091 space_reftree_destroy(&reftree); 2092 } 2093 range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); 2094 range_tree_walk(vd->vdev_dtl[DTL_MISSING], 2095 range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); 2096 if (scrub_done) 2097 range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); 2098 range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); 2099 if (!vdev_readable(vd)) 2100 range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); 2101 else 2102 range_tree_walk(vd->vdev_dtl[DTL_MISSING], 2103 range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); 2104 2105 /* 2106 * If the vdev was resilvering and no longer has any 2107 * DTLs then reset its resilvering flag and dirty 2108 * the top level so that we persist the change. 2109 / 2110* if (vd->vdev_resilver_txg != 0 && 2111 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && 2112 range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) { 2113 vd->vdev_resilver_txg = 0; 2114 vdev_config_dirty(vd->vdev_top); 2115 } 2116 2117 mutex_exit(&vd->vdev_dtl_lock); 2118 2119 if (txg != 0) 2120 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 2121 return; 2122 } 2123 2124 mutex_enter(&vd->vdev_dtl_lock); 2125 for (int t = 0; t < DTL_TYPES; t++) { 2126 /* account for child's outage in parent's missing map / 2127* int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; 2128 if (t == DTL_SCRUB) 2129 continue; /* leaf vdevs only / 2130* if (t == DTL_PARTIAL) 2131 minref = 1; /* i.e. non-zero / 2132* else if (vd->vdev_nparity != 0) 2133 minref = vd->vdev_nparity + 1; /* RAID-Z / 2134* else 2135 minref = vd->vdev_children; /* any kind of mirror / 2136* space_reftree_create(&reftree); 2137 for (int c = 0; c < vd->vdev_children; c++) { 2138 vdev_t cvd = vd->vdev_child[c]; 2139* mutex_enter(&cvd->vdev_dtl_lock); 2140 space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); 2141 mutex_exit(&cvd->vdev_dtl_lock); 2142 } 2143 space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); 2144 space_reftree_destroy(&reftree); 2145 } 2146 mutex_exit(&vd->vdev_dtl_lock); 2147} 2148 2149int 2150vdev_dtl_load(vdev_t vd) 2151{ 2152* spa_t spa = vd->vdev_spa; 2153* objset_t mos = spa->spa_meta_objset; 2154* int error = 0; 2155 2156 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { 2157 ASSERT(vdev_is_concrete(vd)); 2158 2159 error = space_map_open(&vd->vdev_dtl_sm, mos, 2160 vd->vdev_dtl_object, 0, -1ULL, 0); 2161 if (error) 2162 return (error); 2163 ASSERT(vd->vdev_dtl_sm != NULL); 2164 2165 mutex_enter(&vd->vdev_dtl_lock); 2166 2167 /* 2168 * Now that we've opened the space_map we need to update 2169 * the in-core DTL. 2170 / 2171* space_map_update(vd->vdev_dtl_sm); 2172 2173 error = space_map_load(vd->vdev_dtl_sm, 2174 vd->vdev_dtl[DTL_MISSING], SM_ALLOC); 2175 mutex_exit(&vd->vdev_dtl_lock); 2176 2177 return (error); 2178 } 2179 2180 for (int c = 0; c < vd->vdev_children; c++) { 2181 error = vdev_dtl_load(vd->vdev_child[c]); 2182 if (error != 0) 2183 break; 2184 } 2185 2186 return (error); 2187} 2188 2189void 2190vdev_destroy_unlink_zap(vdev_t vd, uint64_t zapobj, dmu_tx_t tx) 2191{ 2192 spa_t spa = vd->vdev_spa; 2193* 2194 VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); 2195 VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 2196 zapobj, tx)); 2197} 2198 2199uint64_t 2200vdev_create_link_zap(vdev_t vd, dmu_tx_t tx) 2201{ 2202 spa_t spa = vd->vdev_spa; 2203* uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, 2204 DMU_OT_NONE, 0, tx); 2205 2206 ASSERT(zap != 0); 2207 VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 2208 zap, tx)); 2209 2210 return (zap); 2211} 2212 2213void 2214vdev_construct_zaps(vdev_t vd, dmu_tx_t tx) 2215{ 2216 if (vd->vdev_ops != &vdev_hole_ops && 2217 vd->vdev_ops != &vdev_missing_ops && 2218 vd->vdev_ops != &vdev_root_ops && 2219 !vd->vdev_top->vdev_removing) { 2220 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { 2221 vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); 2222 } 2223 if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { 2224 vd->vdev_top_zap = vdev_create_link_zap(vd, tx); 2225 } 2226 } 2227 for (uint64_t i = 0; i < vd->vdev_children; i++) { 2228 vdev_construct_zaps(vd->vdev_child[i], tx); 2229 } 2230} 2231 2232void 2233vdev_dtl_sync(vdev_t vd, uint64_t txg) 2234{ 2235* spa_t spa = vd->vdev_spa; 2236* range_tree_t rt = vd->vdev_dtl[DTL_MISSING]; 2237* objset_t mos = spa->spa_meta_objset; 2238* range_tree_t rtsync; 2239* dmu_tx_t tx; 2240* uint64_t object = space_map_object(vd->vdev_dtl_sm); 2241 2242 ASSERT(vdev_is_concrete(vd)); 2243 ASSERT(vd->vdev_ops->vdev_op_leaf); 2244 2245 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2246 2247 if (vd->vdev_detached \|\| vd->vdev_top->vdev_removing) { 2248 mutex_enter(&vd->vdev_dtl_lock); 2249 space_map_free(vd->vdev_dtl_sm, tx); 2250 space_map_close(vd->vdev_dtl_sm); 2251 vd->vdev_dtl_sm = NULL; 2252 mutex_exit(&vd->vdev_dtl_lock); 2253 2254 /* 2255 * We only destroy the leaf ZAP for detached leaves or for 2256 * removed log devices. Removed data devices handle leaf ZAP 2257 * cleanup later, once cancellation is no longer possible. 2258 / 2259* if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached \|\| 2260 vd->vdev_top->vdev_islog)) { 2261 vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); 2262 vd->vdev_leaf_zap = 0; 2263 } 2264 2265 dmu_tx_commit(tx); 2266 return; 2267 } 2268 2269 if (vd->vdev_dtl_sm == NULL) { 2270 uint64_t new_object; 2271 2272 new_object = space_map_alloc(mos, tx); 2273 VERIFY3U(new_object, !=, 0); 2274 2275 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 2276 0, -1ULL, 0)); 2277 ASSERT(vd->vdev_dtl_sm != NULL); 2278 } 2279 2280 rtsync = range_tree_create(NULL, NULL); 2281 2282 mutex_enter(&vd->vdev_dtl_lock); 2283 range_tree_walk(rt, range_tree_add, rtsync); 2284 mutex_exit(&vd->vdev_dtl_lock); 2285 2286 space_map_truncate(vd->vdev_dtl_sm, tx); 2287 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); 2288 range_tree_vacate(rtsync, NULL, NULL); 2289 2290 range_tree_destroy(rtsync); 2291 2292 /* 2293 * If the object for the space map has changed then dirty 2294 * the top level so that we update the config. 2295 / 2296* if (object != space_map_object(vd->vdev_dtl_sm)) {	1693 1694 /* 1695 * If we were able to open and validate a vdev that was 1696 * previously marked permanently unavailable, clear that state 1697 * now. 1698 / 1699* if (vd->vdev_not_present) 1700 vd->vdev_not_present = 0; 1701 } 1702 1703 return (0); 1704} 1705 1706/* 1707 * Close a virtual device. 1708 / 1709void 1710vdev_close(vdev_t vd) 1711{ 1712 spa_t spa = vd->vdev_spa; 1713* vdev_t pvd = vd->vdev_parent; 1714* 1715 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1716 1717 /* 1718 * If our parent is reopening, then we are as well, unless we are 1719 * going offline. 1720 / 1721* if (pvd != NULL && pvd->vdev_reopening) 1722 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); 1723 1724 vd->vdev_ops->vdev_op_close(vd); 1725 1726 vdev_cache_purge(vd); 1727 1728 if (vd->vdev_ops->vdev_op_leaf) 1729 trim_map_destroy(vd); 1730 1731 /* 1732 * We record the previous state before we close it, so that if we are 1733 * doing a reopen(), we don't generate FMA ereports if we notice that 1734 * it's still faulted. 1735 / 1736* vd->vdev_prevstate = vd->vdev_state; 1737 1738 if (vd->vdev_offline) 1739 vd->vdev_state = VDEV_STATE_OFFLINE; 1740 else 1741 vd->vdev_state = VDEV_STATE_CLOSED; 1742 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1743} 1744 1745void 1746vdev_hold(vdev_t vd) 1747{ 1748* spa_t spa = vd->vdev_spa; 1749* 1750 ASSERT(spa_is_root(spa)); 1751 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 1752 return; 1753 1754 for (int c = 0; c < vd->vdev_children; c++) 1755 vdev_hold(vd->vdev_child[c]); 1756 1757 if (vd->vdev_ops->vdev_op_leaf) 1758 vd->vdev_ops->vdev_op_hold(vd); 1759} 1760 1761void 1762vdev_rele(vdev_t vd) 1763{ 1764* spa_t spa = vd->vdev_spa; 1765* 1766 ASSERT(spa_is_root(spa)); 1767 for (int c = 0; c < vd->vdev_children; c++) 1768 vdev_rele(vd->vdev_child[c]); 1769 1770 if (vd->vdev_ops->vdev_op_leaf) 1771 vd->vdev_ops->vdev_op_rele(vd); 1772} 1773 1774/* 1775 * Reopen all interior vdevs and any unopened leaves. We don't actually 1776 * reopen leaf vdevs which had previously been opened as they might deadlock 1777 * on the spa_config_lock. Instead we only obtain the leaf's physical size. 1778 * If the leaf has never been opened then open it, as usual. 1779 / 1780void 1781vdev_reopen(vdev_t vd) 1782{ 1783 spa_t spa = vd->vdev_spa; 1784* 1785 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1786 1787 /* set the reopening flag unless we're taking the vdev offline / 1788* vd->vdev_reopening = !vd->vdev_offline; 1789 vdev_close(vd); 1790 (void) vdev_open(vd); 1791 1792 /* 1793 * Call vdev_validate() here to make sure we have the same device. 1794 * Otherwise, a device with an invalid label could be successfully 1795 * opened in response to vdev_reopen(). 1796 / 1797* if (vd->vdev_aux) { 1798 (void) vdev_validate_aux(vd); 1799 if (vdev_readable(vd) && vdev_writeable(vd) && 1800 vd->vdev_aux == &spa->spa_l2cache && 1801 !l2arc_vdev_present(vd)) 1802 l2arc_add_vdev(spa, vd); 1803 } else { 1804 (void) vdev_validate(vd, B_TRUE); 1805 } 1806 1807 /* 1808 * Reassess parent vdev's health. 1809 / 1810* vdev_propagate_state(vd); 1811} 1812 1813int 1814vdev_create(vdev_t vd, uint64_t txg, boolean_t isreplacing) 1815{ 1816* int error; 1817 1818 /* 1819 * Normally, partial opens (e.g. of a mirror) are allowed. 1820 * For a create, however, we want to fail the request if 1821 * there are any components we can't open. 1822 / 1823* error = vdev_open(vd); 1824 1825 if (error \|\| vd->vdev_state != VDEV_STATE_HEALTHY) { 1826 vdev_close(vd); 1827 return (error ? error : ENXIO); 1828 } 1829 1830 /* 1831 * Recursively load DTLs and initialize all labels. 1832 / 1833* if ((error = vdev_dtl_load(vd)) != 0 \|\| 1834 (error = vdev_label_init(vd, txg, isreplacing ? 1835 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1836 vdev_close(vd); 1837 return (error); 1838 } 1839 1840 return (0); 1841} 1842 1843void 1844vdev_metaslab_set_size(vdev_t vd) 1845{ 1846* /* 1847 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. 1848 / 1849* vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); 1850 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1851} 1852 1853/* 1854 * Maximize performance by inflating the configured ashift for top level 1855 * vdevs to be as close to the physical ashift as possible while maintaining 1856 * administrator defined limits and ensuring it doesn't go below the 1857 * logical ashift. 1858 / 1859void 1860vdev_ashift_optimize(vdev_t vd) 1861{ 1862 if (vd == vd->vdev_top) { 1863 if (vd->vdev_ashift < vd->vdev_physical_ashift) { 1864 vd->vdev_ashift = MIN( 1865 MAX(zfs_max_auto_ashift, vd->vdev_ashift), 1866 MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); 1867 } else { 1868 /* 1869 * Unusual case where logical ashift > physical ashift 1870 * so we can't cap the calculated ashift based on max 1871 * ashift as that would cause failures. 1872 * We still check if we need to increase it to match 1873 * the min ashift. 1874 / 1875* vd->vdev_ashift = MAX(zfs_min_auto_ashift, 1876 vd->vdev_ashift); 1877 } 1878 } 1879} 1880 1881void 1882vdev_dirty(vdev_t vd, int flags, void arg, uint64_t txg) 1883{ 1884 ASSERT(vd == vd->vdev_top); 1885 /* indirect vdevs don't have metaslabs or dtls / 1886* ASSERT(vdev_is_concrete(vd) \|\| flags == 0); 1887 ASSERT(ISP2(flags)); 1888 ASSERT(spa_writeable(vd->vdev_spa)); 1889 1890 if (flags & VDD_METASLAB) 1891 (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 1892 1893 if (flags & VDD_DTL) 1894 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 1895 1896 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 1897} 1898 1899void 1900vdev_dirty_leaves(vdev_t vd, int flags, uint64_t txg) 1901{ 1902* for (int c = 0; c < vd->vdev_children; c++) 1903 vdev_dirty_leaves(vd->vdev_child[c], flags, txg); 1904 1905 if (vd->vdev_ops->vdev_op_leaf) 1906 vdev_dirty(vd->vdev_top, flags, vd, txg); 1907} 1908 1909/* 1910 * DTLs. 1911 * 1912 * A vdev's DTL (dirty time log) is the set of transaction groups for which 1913 * the vdev has less than perfect replication. There are four kinds of DTL: 1914 * 1915 * DTL_MISSING: txgs for which the vdev has no valid copies of the data 1916 * 1917 * DTL_PARTIAL: txgs for which data is available, but not fully replicated 1918 * 1919 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon 1920 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of 1921 * txgs that was scrubbed. 1922 * 1923 * DTL_OUTAGE: txgs which cannot currently be read, whether due to 1924 * persistent errors or just some device being offline. 1925 * Unlike the other three, the DTL_OUTAGE map is not generally 1926 * maintained; it's only computed when needed, typically to 1927 * determine whether a device can be detached. 1928 * 1929 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device 1930 * either has the data or it doesn't. 1931 * 1932 * For interior vdevs such as mirror and RAID-Z the picture is more complex. 1933 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because 1934 * if any child is less than fully replicated, then so is its parent. 1935 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, 1936 * comprising only those txgs which appear in 'maxfaults' or more children; 1937 * those are the txgs we don't have enough replication to read. For example, 1938 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); 1939 * thus, its DTL_MISSING consists of the set of txgs that appear in more than 1940 * two child DTL_MISSING maps. 1941 * 1942 * It should be clear from the above that to compute the DTLs and outage maps 1943 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. 1944 * Therefore, that is all we keep on disk. When loading the pool, or after 1945 * a configuration change, we generate all other DTLs from first principles. 1946 / 1947void 1948vdev_dtl_dirty(vdev_t vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1949{ 1950 range_tree_t rt = vd->vdev_dtl[t]; 1951* 1952 ASSERT(t < DTL_TYPES); 1953 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1954 ASSERT(spa_writeable(vd->vdev_spa)); 1955 1956 mutex_enter(&vd->vdev_dtl_lock); 1957 if (!range_tree_contains(rt, txg, size)) 1958 range_tree_add(rt, txg, size); 1959 mutex_exit(&vd->vdev_dtl_lock); 1960} 1961 1962boolean_t 1963vdev_dtl_contains(vdev_t vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1964{ 1965* range_tree_t rt = vd->vdev_dtl[t]; 1966* boolean_t dirty = B_FALSE; 1967 1968 ASSERT(t < DTL_TYPES); 1969 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1970 1971 /* 1972 * While we are loading the pool, the DTLs have not been loaded yet. 1973 * Ignore the DTLs and try all devices. This avoids a recursive 1974 * mutex enter on the vdev_dtl_lock, and also makes us try hard 1975 * when loading the pool (relying on the checksum to ensure that 1976 * we get the right data -- note that we while loading, we are 1977 * only reading the MOS, which is always checksummed). 1978 / 1979* if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) 1980 return (B_FALSE); 1981 1982 mutex_enter(&vd->vdev_dtl_lock); 1983 if (range_tree_space(rt) != 0) 1984 dirty = range_tree_contains(rt, txg, size); 1985 mutex_exit(&vd->vdev_dtl_lock); 1986 1987 return (dirty); 1988} 1989 1990boolean_t 1991vdev_dtl_empty(vdev_t vd, vdev_dtl_type_t t) 1992{ 1993* range_tree_t rt = vd->vdev_dtl[t]; 1994* boolean_t empty; 1995 1996 mutex_enter(&vd->vdev_dtl_lock); 1997 empty = (range_tree_space(rt) == 0); 1998 mutex_exit(&vd->vdev_dtl_lock); 1999 2000 return (empty); 2001} 2002 2003/* 2004 * Returns the lowest txg in the DTL range. 2005 / 2006static uint64_t 2007vdev_dtl_min(vdev_t vd) 2008{ 2009 range_seg_t rs; 2010* 2011 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 2012 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 2013 ASSERT0(vd->vdev_children); 2014 2015 rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); 2016 return (rs->rs_start - 1); 2017} 2018 2019/* 2020 * Returns the highest txg in the DTL. 2021 / 2022static uint64_t 2023vdev_dtl_max(vdev_t vd) 2024{ 2025 range_seg_t rs; 2026* 2027 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 2028 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 2029 ASSERT0(vd->vdev_children); 2030 2031 rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); 2032 return (rs->rs_end); 2033} 2034 2035/* 2036 * Determine if a resilvering vdev should remove any DTL entries from 2037 * its range. If the vdev was resilvering for the entire duration of the 2038 * scan then it should excise that range from its DTLs. Otherwise, this 2039 * vdev is considered partially resilvered and should leave its DTL 2040 * entries intact. The comment in vdev_dtl_reassess() describes how we 2041 * excise the DTLs. 2042 / 2043static boolean_t 2044vdev_dtl_should_excise(vdev_t vd) 2045{ 2046 spa_t spa = vd->vdev_spa; 2047* dsl_scan_t scn = spa->spa_dsl_pool->dp_scan; 2048* 2049 ASSERT0(scn->scn_phys.scn_errors); 2050 ASSERT0(vd->vdev_children); 2051 2052 if (vd->vdev_state < VDEV_STATE_DEGRADED) 2053 return (B_FALSE); 2054 2055 if (vd->vdev_resilver_txg == 0 \|\| 2056 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) 2057 return (B_TRUE); 2058 2059 /* 2060 * When a resilver is initiated the scan will assign the scn_max_txg 2061 * value to the highest txg value that exists in all DTLs. If this 2062 * device's max DTL is not part of this scan (i.e. it is not in 2063 * the range (scn_min_txg, scn_max_txg] then it is not eligible 2064 * for excision. 2065 / 2066* if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { 2067 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); 2068 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); 2069 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); 2070 return (B_TRUE); 2071 } 2072 return (B_FALSE); 2073} 2074 2075/* 2076 * Reassess DTLs after a config change or scrub completion. 2077 / 2078void 2079vdev_dtl_reassess(vdev_t vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 2080{ 2081 spa_t spa = vd->vdev_spa; 2082* avl_tree_t reftree; 2083 int minref; 2084 2085 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 2086 2087 for (int c = 0; c < vd->vdev_children; c++) 2088 vdev_dtl_reassess(vd->vdev_child[c], txg, 2089 scrub_txg, scrub_done); 2090 2091 if (vd == spa->spa_root_vdev \|\| !vdev_is_concrete(vd) \|\| vd->vdev_aux) 2092 return; 2093 2094 if (vd->vdev_ops->vdev_op_leaf) { 2095 dsl_scan_t scn = spa->spa_dsl_pool->dp_scan; 2096* 2097 mutex_enter(&vd->vdev_dtl_lock); 2098 2099 /* 2100 * If we've completed a scan cleanly then determine 2101 * if this vdev should remove any DTLs. We only want to 2102 * excise regions on vdevs that were available during 2103 * the entire duration of this scan. 2104 / 2105* if (scrub_txg != 0 && 2106 (spa->spa_scrub_started \|\| 2107 (scn != NULL && scn->scn_phys.scn_errors == 0)) && 2108 vdev_dtl_should_excise(vd)) { 2109 /* 2110 * We completed a scrub up to scrub_txg. If we 2111 * did it without rebooting, then the scrub dtl 2112 * will be valid, so excise the old region and 2113 * fold in the scrub dtl. Otherwise, leave the 2114 * dtl as-is if there was an error. 2115 * 2116 * There's little trick here: to excise the beginning 2117 * of the DTL_MISSING map, we put it into a reference 2118 * tree and then add a segment with refcnt -1 that 2119 * covers the range [0, scrub_txg). This means 2120 * that each txg in that range has refcnt -1 or 0. 2121 * We then add DTL_SCRUB with a refcnt of 2, so that 2122 * entries in the range [0, scrub_txg) will have a 2123 * positive refcnt -- either 1 or 2. We then convert 2124 * the reference tree into the new DTL_MISSING map. 2125 / 2126* space_reftree_create(&reftree); 2127 space_reftree_add_map(&reftree, 2128 vd->vdev_dtl[DTL_MISSING], 1); 2129 space_reftree_add_seg(&reftree, 0, scrub_txg, -1); 2130 space_reftree_add_map(&reftree, 2131 vd->vdev_dtl[DTL_SCRUB], 2); 2132 space_reftree_generate_map(&reftree, 2133 vd->vdev_dtl[DTL_MISSING], 1); 2134 space_reftree_destroy(&reftree); 2135 } 2136 range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); 2137 range_tree_walk(vd->vdev_dtl[DTL_MISSING], 2138 range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); 2139 if (scrub_done) 2140 range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); 2141 range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); 2142 if (!vdev_readable(vd)) 2143 range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); 2144 else 2145 range_tree_walk(vd->vdev_dtl[DTL_MISSING], 2146 range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); 2147 2148 /* 2149 * If the vdev was resilvering and no longer has any 2150 * DTLs then reset its resilvering flag and dirty 2151 * the top level so that we persist the change. 2152 / 2153* if (vd->vdev_resilver_txg != 0 && 2154 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && 2155 range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) { 2156 vd->vdev_resilver_txg = 0; 2157 vdev_config_dirty(vd->vdev_top); 2158 } 2159 2160 mutex_exit(&vd->vdev_dtl_lock); 2161 2162 if (txg != 0) 2163 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 2164 return; 2165 } 2166 2167 mutex_enter(&vd->vdev_dtl_lock); 2168 for (int t = 0; t < DTL_TYPES; t++) { 2169 /* account for child's outage in parent's missing map / 2170* int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; 2171 if (t == DTL_SCRUB) 2172 continue; /* leaf vdevs only / 2173* if (t == DTL_PARTIAL) 2174 minref = 1; /* i.e. non-zero / 2175* else if (vd->vdev_nparity != 0) 2176 minref = vd->vdev_nparity + 1; /* RAID-Z / 2177* else 2178 minref = vd->vdev_children; /* any kind of mirror / 2179* space_reftree_create(&reftree); 2180 for (int c = 0; c < vd->vdev_children; c++) { 2181 vdev_t cvd = vd->vdev_child[c]; 2182* mutex_enter(&cvd->vdev_dtl_lock); 2183 space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); 2184 mutex_exit(&cvd->vdev_dtl_lock); 2185 } 2186 space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); 2187 space_reftree_destroy(&reftree); 2188 } 2189 mutex_exit(&vd->vdev_dtl_lock); 2190} 2191 2192int 2193vdev_dtl_load(vdev_t vd) 2194{ 2195* spa_t spa = vd->vdev_spa; 2196* objset_t mos = spa->spa_meta_objset; 2197* int error = 0; 2198 2199 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { 2200 ASSERT(vdev_is_concrete(vd)); 2201 2202 error = space_map_open(&vd->vdev_dtl_sm, mos, 2203 vd->vdev_dtl_object, 0, -1ULL, 0); 2204 if (error) 2205 return (error); 2206 ASSERT(vd->vdev_dtl_sm != NULL); 2207 2208 mutex_enter(&vd->vdev_dtl_lock); 2209 2210 /* 2211 * Now that we've opened the space_map we need to update 2212 * the in-core DTL. 2213 / 2214* space_map_update(vd->vdev_dtl_sm); 2215 2216 error = space_map_load(vd->vdev_dtl_sm, 2217 vd->vdev_dtl[DTL_MISSING], SM_ALLOC); 2218 mutex_exit(&vd->vdev_dtl_lock); 2219 2220 return (error); 2221 } 2222 2223 for (int c = 0; c < vd->vdev_children; c++) { 2224 error = vdev_dtl_load(vd->vdev_child[c]); 2225 if (error != 0) 2226 break; 2227 } 2228 2229 return (error); 2230} 2231 2232void 2233vdev_destroy_unlink_zap(vdev_t vd, uint64_t zapobj, dmu_tx_t tx) 2234{ 2235 spa_t spa = vd->vdev_spa; 2236* 2237 VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); 2238 VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 2239 zapobj, tx)); 2240} 2241 2242uint64_t 2243vdev_create_link_zap(vdev_t vd, dmu_tx_t tx) 2244{ 2245 spa_t spa = vd->vdev_spa; 2246* uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, 2247 DMU_OT_NONE, 0, tx); 2248 2249 ASSERT(zap != 0); 2250 VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 2251 zap, tx)); 2252 2253 return (zap); 2254} 2255 2256void 2257vdev_construct_zaps(vdev_t vd, dmu_tx_t tx) 2258{ 2259 if (vd->vdev_ops != &vdev_hole_ops && 2260 vd->vdev_ops != &vdev_missing_ops && 2261 vd->vdev_ops != &vdev_root_ops && 2262 !vd->vdev_top->vdev_removing) { 2263 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { 2264 vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); 2265 } 2266 if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { 2267 vd->vdev_top_zap = vdev_create_link_zap(vd, tx); 2268 } 2269 } 2270 for (uint64_t i = 0; i < vd->vdev_children; i++) { 2271 vdev_construct_zaps(vd->vdev_child[i], tx); 2272 } 2273} 2274 2275void 2276vdev_dtl_sync(vdev_t vd, uint64_t txg) 2277{ 2278* spa_t spa = vd->vdev_spa; 2279* range_tree_t rt = vd->vdev_dtl[DTL_MISSING]; 2280* objset_t mos = spa->spa_meta_objset; 2281* range_tree_t rtsync; 2282* dmu_tx_t tx; 2283* uint64_t object = space_map_object(vd->vdev_dtl_sm); 2284 2285 ASSERT(vdev_is_concrete(vd)); 2286 ASSERT(vd->vdev_ops->vdev_op_leaf); 2287 2288 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2289 2290 if (vd->vdev_detached \|\| vd->vdev_top->vdev_removing) { 2291 mutex_enter(&vd->vdev_dtl_lock); 2292 space_map_free(vd->vdev_dtl_sm, tx); 2293 space_map_close(vd->vdev_dtl_sm); 2294 vd->vdev_dtl_sm = NULL; 2295 mutex_exit(&vd->vdev_dtl_lock); 2296 2297 /* 2298 * We only destroy the leaf ZAP for detached leaves or for 2299 * removed log devices. Removed data devices handle leaf ZAP 2300 * cleanup later, once cancellation is no longer possible. 2301 / 2302* if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached \|\| 2303 vd->vdev_top->vdev_islog)) { 2304 vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); 2305 vd->vdev_leaf_zap = 0; 2306 } 2307 2308 dmu_tx_commit(tx); 2309 return; 2310 } 2311 2312 if (vd->vdev_dtl_sm == NULL) { 2313 uint64_t new_object; 2314 2315 new_object = space_map_alloc(mos, tx); 2316 VERIFY3U(new_object, !=, 0); 2317 2318 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 2319 0, -1ULL, 0)); 2320 ASSERT(vd->vdev_dtl_sm != NULL); 2321 } 2322 2323 rtsync = range_tree_create(NULL, NULL); 2324 2325 mutex_enter(&vd->vdev_dtl_lock); 2326 range_tree_walk(rt, range_tree_add, rtsync); 2327 mutex_exit(&vd->vdev_dtl_lock); 2328 2329 space_map_truncate(vd->vdev_dtl_sm, tx); 2330 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); 2331 range_tree_vacate(rtsync, NULL, NULL); 2332 2333 range_tree_destroy(rtsync); 2334 2335 /* 2336 * If the object for the space map has changed then dirty 2337 * the top level so that we update the config. 2338 / 2339* if (object != space_map_object(vd->vdev_dtl_sm)) {
2297 zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, " 2298 "new object %llu", txg, spa_name(spa), object, 2299 space_map_object(vd->vdev_dtl_sm));	2340 vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " 2341 "new object %llu", (u_longlong_t)txg, spa_name(spa), 2342 (u_longlong_t)object, 2343 (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
2300 vdev_config_dirty(vd->vdev_top); 2301 } 2302 2303 dmu_tx_commit(tx); 2304 2305 mutex_enter(&vd->vdev_dtl_lock); 2306 space_map_update(vd->vdev_dtl_sm); 2307 mutex_exit(&vd->vdev_dtl_lock); 2308} 2309 2310/* 2311 * Determine whether the specified vdev can be offlined/detached/removed 2312 * without losing data. 2313 / 2314boolean_t 2315vdev_dtl_required(vdev_t vd) 2316{ 2317 spa_t spa = vd->vdev_spa; 2318* vdev_t tvd = vd->vdev_top; 2319* uint8_t cant_read = vd->vdev_cant_read; 2320 boolean_t required; 2321 2322 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2323 2324 if (vd == spa->spa_root_vdev \|\| vd == tvd) 2325 return (B_TRUE); 2326 2327 /* 2328 * Temporarily mark the device as unreadable, and then determine 2329 * whether this results in any DTL outages in the top-level vdev. 2330 * If not, we can safely offline/detach/remove the device. 2331 / 2332* vd->vdev_cant_read = B_TRUE; 2333 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2334 required = !vdev_dtl_empty(tvd, DTL_OUTAGE); 2335 vd->vdev_cant_read = cant_read; 2336 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2337 2338 if (!required && zio_injection_enabled) 2339 required = !!zio_handle_device_injection(vd, NULL, ECHILD); 2340 2341 return (required); 2342} 2343 2344/* 2345 * Determine if resilver is needed, and if so the txg range. 2346 / 2347boolean_t 2348vdev_resilver_needed(vdev_t vd, uint64_t minp, uint64_t maxp) 2349{ 2350 boolean_t needed = B_FALSE; 2351 uint64_t thismin = UINT64_MAX; 2352 uint64_t thismax = 0; 2353 2354 if (vd->vdev_children == 0) { 2355 mutex_enter(&vd->vdev_dtl_lock); 2356 if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && 2357 vdev_writeable(vd)) { 2358 2359 thismin = vdev_dtl_min(vd); 2360 thismax = vdev_dtl_max(vd); 2361 needed = B_TRUE; 2362 } 2363 mutex_exit(&vd->vdev_dtl_lock); 2364 } else { 2365 for (int c = 0; c < vd->vdev_children; c++) { 2366 vdev_t cvd = vd->vdev_child[c]; 2367* uint64_t cmin, cmax; 2368 2369 if (vdev_resilver_needed(cvd, &cmin, &cmax)) { 2370 thismin = MIN(thismin, cmin); 2371 thismax = MAX(thismax, cmax); 2372 needed = B_TRUE; 2373 } 2374 } 2375 } 2376 2377 if (needed && minp) { 2378 minp = thismin; 2379* maxp = thismax; 2380* } 2381 return (needed); 2382} 2383 2384int 2385vdev_load(vdev_t vd) 2386{ 2387* int error = 0; 2388 /* 2389 * Recursively load all children. 2390 / 2391* for (int c = 0; c < vd->vdev_children; c++) { 2392 error = vdev_load(vd->vdev_child[c]); 2393 if (error != 0) { 2394 return (error); 2395 } 2396 } 2397 2398 vdev_set_deflate_ratio(vd); 2399 2400 /* 2401 * If this is a top-level vdev, initialize its metaslabs. 2402 / 2403* if (vd == vd->vdev_top && vdev_is_concrete(vd)) { 2404 if (vd->vdev_ashift == 0 \|\| vd->vdev_asize == 0) { 2405 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2406 VDEV_AUX_CORRUPT_DATA);	2344 vdev_config_dirty(vd->vdev_top); 2345 } 2346 2347 dmu_tx_commit(tx); 2348 2349 mutex_enter(&vd->vdev_dtl_lock); 2350 space_map_update(vd->vdev_dtl_sm); 2351 mutex_exit(&vd->vdev_dtl_lock); 2352} 2353 2354/* 2355 * Determine whether the specified vdev can be offlined/detached/removed 2356 * without losing data. 2357 / 2358boolean_t 2359vdev_dtl_required(vdev_t vd) 2360{ 2361 spa_t spa = vd->vdev_spa; 2362* vdev_t tvd = vd->vdev_top; 2363* uint8_t cant_read = vd->vdev_cant_read; 2364 boolean_t required; 2365 2366 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2367 2368 if (vd == spa->spa_root_vdev \|\| vd == tvd) 2369 return (B_TRUE); 2370 2371 /* 2372 * Temporarily mark the device as unreadable, and then determine 2373 * whether this results in any DTL outages in the top-level vdev. 2374 * If not, we can safely offline/detach/remove the device. 2375 / 2376* vd->vdev_cant_read = B_TRUE; 2377 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2378 required = !vdev_dtl_empty(tvd, DTL_OUTAGE); 2379 vd->vdev_cant_read = cant_read; 2380 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2381 2382 if (!required && zio_injection_enabled) 2383 required = !!zio_handle_device_injection(vd, NULL, ECHILD); 2384 2385 return (required); 2386} 2387 2388/* 2389 * Determine if resilver is needed, and if so the txg range. 2390 / 2391boolean_t 2392vdev_resilver_needed(vdev_t vd, uint64_t minp, uint64_t maxp) 2393{ 2394 boolean_t needed = B_FALSE; 2395 uint64_t thismin = UINT64_MAX; 2396 uint64_t thismax = 0; 2397 2398 if (vd->vdev_children == 0) { 2399 mutex_enter(&vd->vdev_dtl_lock); 2400 if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && 2401 vdev_writeable(vd)) { 2402 2403 thismin = vdev_dtl_min(vd); 2404 thismax = vdev_dtl_max(vd); 2405 needed = B_TRUE; 2406 } 2407 mutex_exit(&vd->vdev_dtl_lock); 2408 } else { 2409 for (int c = 0; c < vd->vdev_children; c++) { 2410 vdev_t cvd = vd->vdev_child[c]; 2411* uint64_t cmin, cmax; 2412 2413 if (vdev_resilver_needed(cvd, &cmin, &cmax)) { 2414 thismin = MIN(thismin, cmin); 2415 thismax = MAX(thismax, cmax); 2416 needed = B_TRUE; 2417 } 2418 } 2419 } 2420 2421 if (needed && minp) { 2422 minp = thismin; 2423* maxp = thismax; 2424* } 2425 return (needed); 2426} 2427 2428int 2429vdev_load(vdev_t vd) 2430{ 2431* int error = 0; 2432 /* 2433 * Recursively load all children. 2434 / 2435* for (int c = 0; c < vd->vdev_children; c++) { 2436 error = vdev_load(vd->vdev_child[c]); 2437 if (error != 0) { 2438 return (error); 2439 } 2440 } 2441 2442 vdev_set_deflate_ratio(vd); 2443 2444 /* 2445 * If this is a top-level vdev, initialize its metaslabs. 2446 / 2447* if (vd == vd->vdev_top && vdev_is_concrete(vd)) { 2448 if (vd->vdev_ashift == 0 \|\| vd->vdev_asize == 0) { 2449 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2450 VDEV_AUX_CORRUPT_DATA);
	2451 vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " 2452 "asize=%llu", (u_longlong_t)vd->vdev_ashift, 2453 (u_longlong_t)vd->vdev_asize);
2407 return (SET_ERROR(ENXIO)); 2408 } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {	2454 return (SET_ERROR(ENXIO)); 2455 } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
	2456 vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " 2457 "[error=%d]", error);
2409 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2410 VDEV_AUX_CORRUPT_DATA); 2411 return (error); 2412 } 2413 } 2414 2415 /* 2416 * If this is a leaf vdev, load its DTL. 2417 / 2418* if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { 2419 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2420 VDEV_AUX_CORRUPT_DATA);	2458 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2459 VDEV_AUX_CORRUPT_DATA); 2460 return (error); 2461 } 2462 } 2463 2464 /* 2465 * If this is a leaf vdev, load its DTL. 2466 / 2467* if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { 2468 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2469 VDEV_AUX_CORRUPT_DATA);
	2470 vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " 2471 "[error=%d]", error);
2421 return (error); 2422 } 2423 2424 uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); 2425 if (obsolete_sm_object != 0) { 2426 objset_t mos = vd->vdev_spa->spa_meta_objset; 2427* ASSERT(vd->vdev_asize != 0); 2428 ASSERT(vd->vdev_obsolete_sm == NULL); 2429 2430 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, 2431 obsolete_sm_object, 0, vd->vdev_asize, 0))) { 2432 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2433 VDEV_AUX_CORRUPT_DATA);	2472 return (error); 2473 } 2474 2475 uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); 2476 if (obsolete_sm_object != 0) { 2477 objset_t mos = vd->vdev_spa->spa_meta_objset; 2478* ASSERT(vd->vdev_asize != 0); 2479 ASSERT(vd->vdev_obsolete_sm == NULL); 2480 2481 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, 2482 obsolete_sm_object, 0, vd->vdev_asize, 0))) { 2483 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2484 VDEV_AUX_CORRUPT_DATA);
	2485 vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " 2486 "obsolete spacemap (obj %llu) [error=%d]", 2487 (u_longlong_t)obsolete_sm_object, error);
2434 return (error); 2435 } 2436 space_map_update(vd->vdev_obsolete_sm); 2437 } 2438 2439 return (0); 2440} 2441 2442/* 2443 * The special vdev case is used for hot spares and l2cache devices. Its 2444 * sole purpose it to set the vdev state for the associated vdev. To do this, 2445 * we make sure that we can open the underlying device, then try to read the 2446 * label, and make sure that the label is sane and that it hasn't been 2447 * repurposed to another pool. 2448 / 2449int 2450vdev_validate_aux(vdev_t vd) 2451{ 2452 nvlist_t label; 2453* uint64_t guid, version; 2454 uint64_t state; 2455 2456 if (!vdev_readable(vd)) 2457 return (0); 2458 2459 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { 2460 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2461 VDEV_AUX_CORRUPT_DATA); 2462 return (-1); 2463 } 2464 2465 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 \|\| 2466 !SPA_VERSION_IS_SUPPORTED(version) \|\| 2467 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 \|\| 2468 guid != vd->vdev_guid \|\| 2469 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 2470 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2471 VDEV_AUX_CORRUPT_DATA); 2472 nvlist_free(label); 2473 return (-1); 2474 } 2475 2476 /* 2477 * We don't actually check the pool state here. If it's in fact in 2478 * use by another pool, we update this fact on the fly when requested. 2479 / 2480* nvlist_free(label); 2481 return (0); 2482} 2483 2484/* 2485 * Free the objects used to store this vdev's spacemaps, and the array 2486 * that points to them. 2487 / 2488void 2489vdev_destroy_spacemaps(vdev_t vd, dmu_tx_t tx) 2490{ 2491* if (vd->vdev_ms_array == 0) 2492 return; 2493 2494 objset_t mos = vd->vdev_spa->spa_meta_objset; 2495* uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; 2496 size_t array_bytes = array_count * sizeof (uint64_t); 2497 uint64_t smobj_array = kmem_alloc(array_bytes, KM_SLEEP); 2498* VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, 2499 array_bytes, smobj_array, 0)); 2500 2501 for (uint64_t i = 0; i < array_count; i++) { 2502 uint64_t smobj = smobj_array[i]; 2503 if (smobj == 0) 2504 continue; 2505 2506 space_map_free_obj(mos, smobj, tx); 2507 } 2508 2509 kmem_free(smobj_array, array_bytes); 2510 VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); 2511 vd->vdev_ms_array = 0; 2512} 2513 2514static void 2515vdev_remove_empty(vdev_t vd, uint64_t txg) 2516{ 2517* spa_t spa = vd->vdev_spa; 2518* dmu_tx_t tx; 2519* 2520 ASSERT(vd == vd->vdev_top); 2521 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 2522 2523 if (vd->vdev_ms != NULL) { 2524 metaslab_group_t mg = vd->vdev_mg; 2525* 2526 metaslab_group_histogram_verify(mg); 2527 metaslab_class_histogram_verify(mg->mg_class); 2528 2529 for (int m = 0; m < vd->vdev_ms_count; m++) { 2530 metaslab_t msp = vd->vdev_ms[m]; 2531* 2532 if (msp == NULL \|\| msp->ms_sm == NULL) 2533 continue; 2534 2535 mutex_enter(&msp->ms_lock); 2536 /* 2537 * If the metaslab was not loaded when the vdev 2538 * was removed then the histogram accounting may 2539 * not be accurate. Update the histogram information 2540 * here so that we ensure that the metaslab group 2541 * and metaslab class are up-to-date. 2542 / 2543* metaslab_group_histogram_remove(mg, msp); 2544 2545 VERIFY0(space_map_allocated(msp->ms_sm)); 2546 space_map_close(msp->ms_sm); 2547 msp->ms_sm = NULL; 2548 mutex_exit(&msp->ms_lock); 2549 } 2550 2551 metaslab_group_histogram_verify(mg); 2552 metaslab_class_histogram_verify(mg->mg_class); 2553 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 2554 ASSERT0(mg->mg_histogram[i]); 2555 } 2556 2557 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2558 vdev_destroy_spacemaps(vd, tx); 2559 2560 if (vd->vdev_islog && vd->vdev_top_zap != 0) { 2561 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); 2562 vd->vdev_top_zap = 0; 2563 } 2564 dmu_tx_commit(tx); 2565} 2566 2567void 2568vdev_sync_done(vdev_t vd, uint64_t txg) 2569{ 2570* metaslab_t msp; 2571* boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 2572 2573 ASSERT(vdev_is_concrete(vd)); 2574 2575 while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 2576 metaslab_sync_done(msp, txg); 2577 2578 if (reassess) 2579 metaslab_sync_reassess(vd->vdev_mg); 2580} 2581 2582void 2583vdev_sync(vdev_t vd, uint64_t txg) 2584{ 2585* spa_t spa = vd->vdev_spa; 2586* vdev_t lvd; 2587* metaslab_t msp; 2588* dmu_tx_t tx; 2589* 2590 if (range_tree_space(vd->vdev_obsolete_segments) > 0) { 2591 dmu_tx_t tx; 2592* 2593 ASSERT(vd->vdev_removing \|\| 2594 vd->vdev_ops == &vdev_indirect_ops); 2595 2596 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2597 vdev_indirect_sync_obsolete(vd, tx); 2598 dmu_tx_commit(tx); 2599 2600 /* 2601 * If the vdev is indirect, it can't have dirty 2602 * metaslabs or DTLs. 2603 / 2604* if (vd->vdev_ops == &vdev_indirect_ops) { 2605 ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); 2606 ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); 2607 return; 2608 } 2609 } 2610 2611 ASSERT(vdev_is_concrete(vd)); 2612 2613 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && 2614 !vd->vdev_removing) { 2615 ASSERT(vd == vd->vdev_top); 2616 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 2617 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2618 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 2619 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 2620 ASSERT(vd->vdev_ms_array != 0); 2621 vdev_config_dirty(vd); 2622 dmu_tx_commit(tx); 2623 } 2624 2625 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 2626 metaslab_sync(msp, txg); 2627 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 2628 } 2629 2630 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 2631 vdev_dtl_sync(lvd, txg); 2632 2633 /* 2634 * Remove the metadata associated with this vdev once it's empty. 2635 * Note that this is typically used for log/cache device removal; 2636 * we don't empty toplevel vdevs when removing them. But if 2637 * a toplevel happens to be emptied, this is not harmful. 2638 / 2639* if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) { 2640 vdev_remove_empty(vd, txg); 2641 } 2642 2643 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 2644} 2645 2646uint64_t 2647vdev_psize_to_asize(vdev_t vd, uint64_t psize) 2648{ 2649* return (vd->vdev_ops->vdev_op_asize(vd, psize)); 2650} 2651 2652/* 2653 * Mark the given vdev faulted. A faulted vdev behaves as if the device could 2654 * not be opened, and no I/O is attempted. 2655 / 2656int 2657vdev_fault(spa_t spa, uint64_t guid, vdev_aux_t aux) 2658{ 2659 vdev_t vd, tvd; 2660 2661 spa_vdev_state_enter(spa, SCL_NONE); 2662 2663 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2664 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2665 2666 if (!vd->vdev_ops->vdev_op_leaf) 2667 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2668 2669 tvd = vd->vdev_top; 2670 2671 /* 2672 * We don't directly use the aux state here, but if we do a 2673 * vdev_reopen(), we need this value to be present to remember why we 2674 * were faulted. 2675 / 2676* vd->vdev_label_aux = aux; 2677 2678 /* 2679 * Faulted state takes precedence over degraded. 2680 / 2681* vd->vdev_delayed_close = B_FALSE; 2682 vd->vdev_faulted = 1ULL; 2683 vd->vdev_degraded = 0ULL; 2684 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 2685 2686 /* 2687 * If this device has the only valid copy of the data, then 2688 * back off and simply mark the vdev as degraded instead. 2689 / 2690* if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 2691 vd->vdev_degraded = 1ULL; 2692 vd->vdev_faulted = 0ULL; 2693 2694 /* 2695 * If we reopen the device and it's not dead, only then do we 2696 * mark it degraded. 2697 / 2698* vdev_reopen(tvd); 2699 2700 if (vdev_readable(vd)) 2701 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 2702 } 2703 2704 return (spa_vdev_state_exit(spa, vd, 0)); 2705} 2706 2707/* 2708 * Mark the given vdev degraded. A degraded vdev is purely an indication to the 2709 * user that something is wrong. The vdev continues to operate as normal as far 2710 * as I/O is concerned. 2711 / 2712int 2713vdev_degrade(spa_t spa, uint64_t guid, vdev_aux_t aux) 2714{ 2715 vdev_t vd; 2716* 2717 spa_vdev_state_enter(spa, SCL_NONE); 2718 2719 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2720 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2721 2722 if (!vd->vdev_ops->vdev_op_leaf) 2723 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2724 2725 /* 2726 * If the vdev is already faulted, then don't do anything. 2727 / 2728* if (vd->vdev_faulted \|\| vd->vdev_degraded) 2729 return (spa_vdev_state_exit(spa, NULL, 0)); 2730 2731 vd->vdev_degraded = 1ULL; 2732 if (!vdev_is_dead(vd)) 2733 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 2734 aux); 2735 2736 return (spa_vdev_state_exit(spa, vd, 0)); 2737} 2738 2739/* 2740 * Online the given vdev. 2741 * 2742 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached 2743 * spare device should be detached when the device finishes resilvering. 2744 * Second, the online should be treated like a 'test' online case, so no FMA 2745 * events are generated if the device fails to open. 2746 / 2747int 2748vdev_online(spa_t spa, uint64_t guid, uint64_t flags, vdev_state_t newstate) 2749{ 2750* vdev_t vd, tvd, pvd, rvd = spa->spa_root_vdev; 2751 boolean_t wasoffline; 2752 vdev_state_t oldstate; 2753 2754 spa_vdev_state_enter(spa, SCL_NONE); 2755 2756 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2757 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2758 2759 if (!vd->vdev_ops->vdev_op_leaf) 2760 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2761 2762 wasoffline = (vd->vdev_offline \|\| vd->vdev_tmpoffline); 2763 oldstate = vd->vdev_state; 2764 2765 tvd = vd->vdev_top; 2766 vd->vdev_offline = B_FALSE; 2767 vd->vdev_tmpoffline = B_FALSE; 2768 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 2769 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 2770 2771 /* XXX - L2ARC 1.0 does not support expansion / 2772* if (!vd->vdev_aux) { 2773 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2774 pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); 2775 } 2776 2777 vdev_reopen(tvd); 2778 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 2779 2780 if (!vd->vdev_aux) { 2781 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2782 pvd->vdev_expanding = B_FALSE; 2783 } 2784 2785 if (newstate) 2786 newstate = vd->vdev_state; 2787* if ((flags & ZFS_ONLINE_UNSPARE) && 2788 !vdev_is_dead(vd) && vd->vdev_parent && 2789 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2790 vd->vdev_parent->vdev_child[0] == vd) 2791 vd->vdev_unspare = B_TRUE; 2792 2793 if ((flags & ZFS_ONLINE_EXPAND) \|\| spa->spa_autoexpand) { 2794 2795 /* XXX - L2ARC 1.0 does not support expansion / 2796* if (vd->vdev_aux) 2797 return (spa_vdev_state_exit(spa, vd, ENOTSUP)); 2798 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2799 } 2800 2801 if (wasoffline \|\| 2802 (oldstate < VDEV_STATE_DEGRADED && 2803 vd->vdev_state >= VDEV_STATE_DEGRADED)) 2804 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); 2805 2806 return (spa_vdev_state_exit(spa, vd, 0)); 2807} 2808 2809static int 2810vdev_offline_locked(spa_t spa, uint64_t guid, uint64_t flags) 2811{ 2812* vdev_t vd, tvd; 2813 int error = 0; 2814 uint64_t generation; 2815 metaslab_group_t mg; 2816* 2817top: 2818 spa_vdev_state_enter(spa, SCL_ALLOC); 2819 2820 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2821 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2822 2823 if (!vd->vdev_ops->vdev_op_leaf) 2824 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2825 2826 tvd = vd->vdev_top; 2827 mg = tvd->vdev_mg; 2828 generation = spa->spa_config_generation + 1; 2829 2830 /* 2831 * If the device isn't already offline, try to offline it. 2832 / 2833* if (!vd->vdev_offline) { 2834 /* 2835 * If this device has the only valid copy of some data, 2836 * don't allow it to be offlined. Log devices are always 2837 * expendable. 2838 / 2839* if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2840 vdev_dtl_required(vd)) 2841 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2842 2843 /* 2844 * If the top-level is a slog and it has had allocations 2845 * then proceed. We check that the vdev's metaslab group 2846 * is not NULL since it's possible that we may have just 2847 * added this vdev but not yet initialized its metaslabs. 2848 / 2849* if (tvd->vdev_islog && mg != NULL) { 2850 /* 2851 * Prevent any future allocations. 2852 / 2853* metaslab_group_passivate(mg); 2854 (void) spa_vdev_state_exit(spa, vd, 0); 2855 2856 error = spa_reset_logs(spa); 2857 2858 spa_vdev_state_enter(spa, SCL_ALLOC); 2859 2860 /* 2861 * Check to see if the config has changed. 2862 / 2863* if (error \|\| generation != spa->spa_config_generation) { 2864 metaslab_group_activate(mg); 2865 if (error) 2866 return (spa_vdev_state_exit(spa, 2867 vd, error)); 2868 (void) spa_vdev_state_exit(spa, vd, 0); 2869 goto top; 2870 } 2871 ASSERT0(tvd->vdev_stat.vs_alloc); 2872 } 2873 2874 /* 2875 * Offline this device and reopen its top-level vdev. 2876 * If the top-level vdev is a log device then just offline 2877 * it. Otherwise, if this action results in the top-level 2878 * vdev becoming unusable, undo it and fail the request. 2879 / 2880* vd->vdev_offline = B_TRUE; 2881 vdev_reopen(tvd); 2882 2883 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2884 vdev_is_dead(tvd)) { 2885 vd->vdev_offline = B_FALSE; 2886 vdev_reopen(tvd); 2887 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2888 } 2889 2890 /* 2891 * Add the device back into the metaslab rotor so that 2892 * once we online the device it's open for business. 2893 / 2894* if (tvd->vdev_islog && mg != NULL) 2895 metaslab_group_activate(mg); 2896 } 2897 2898 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 2899 2900 return (spa_vdev_state_exit(spa, vd, 0)); 2901} 2902 2903int 2904vdev_offline(spa_t spa, uint64_t guid, uint64_t flags) 2905{ 2906* int error; 2907 2908 mutex_enter(&spa->spa_vdev_top_lock); 2909 error = vdev_offline_locked(spa, guid, flags); 2910 mutex_exit(&spa->spa_vdev_top_lock); 2911 2912 return (error); 2913} 2914 2915/* 2916 * Clear the error counts associated with this vdev. Unlike vdev_online() and 2917 * vdev_offline(), we assume the spa config is locked. We also clear all 2918 * children. If 'vd' is NULL, then the user wants to clear all vdevs. 2919 / 2920void 2921vdev_clear(spa_t spa, vdev_t vd) 2922{ 2923* vdev_t rvd = spa->spa_root_vdev; 2924* 2925 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2926 2927 if (vd == NULL) 2928 vd = rvd; 2929 2930 vd->vdev_stat.vs_read_errors = 0; 2931 vd->vdev_stat.vs_write_errors = 0; 2932 vd->vdev_stat.vs_checksum_errors = 0; 2933 2934 for (int c = 0; c < vd->vdev_children; c++) 2935 vdev_clear(spa, vd->vdev_child[c]); 2936 2937 if (vd == rvd) { 2938 for (int c = 0; c < spa->spa_l2cache.sav_count; c++) 2939 vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); 2940 2941 for (int c = 0; c < spa->spa_spares.sav_count; c++) 2942 vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); 2943 } 2944 2945 /* 2946 * It makes no sense to "clear" an indirect vdev. 2947 / 2948* if (!vdev_is_concrete(vd)) 2949 return; 2950 2951 /* 2952 * If we're in the FAULTED state or have experienced failed I/O, then 2953 * clear the persistent state and attempt to reopen the device. We 2954 * also mark the vdev config dirty, so that the new faulted state is 2955 * written out to disk. 2956 / 2957* if (vd->vdev_faulted \|\| vd->vdev_degraded \|\| 2958 !vdev_readable(vd) \|\| !vdev_writeable(vd)) { 2959 2960 /* 2961 * When reopening in reponse to a clear event, it may be due to 2962 * a fmadm repair request. In this case, if the device is 2963 * still broken, we want to still post the ereport again. 2964 / 2965* vd->vdev_forcefault = B_TRUE; 2966 2967 vd->vdev_faulted = vd->vdev_degraded = 0ULL; 2968 vd->vdev_cant_read = B_FALSE; 2969 vd->vdev_cant_write = B_FALSE; 2970 2971 vdev_reopen(vd == rvd ? rvd : vd->vdev_top); 2972 2973 vd->vdev_forcefault = B_FALSE; 2974 2975 if (vd != rvd && vdev_writeable(vd->vdev_top)) 2976 vdev_state_dirty(vd->vdev_top); 2977 2978 if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) 2979 spa_async_request(spa, SPA_ASYNC_RESILVER); 2980 2981 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); 2982 } 2983 2984 /* 2985 * When clearing a FMA-diagnosed fault, we always want to 2986 * unspare the device, as we assume that the original spare was 2987 * done in response to the FMA fault. 2988 / 2989* if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && 2990 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2991 vd->vdev_parent->vdev_child[0] == vd) 2992 vd->vdev_unspare = B_TRUE; 2993} 2994 2995boolean_t 2996vdev_is_dead(vdev_t vd) 2997{ 2998* /* 2999 * Holes and missing devices are always considered "dead". 3000 * This simplifies the code since we don't have to check for 3001 * these types of devices in the various code paths. 3002 * Instead we rely on the fact that we skip over dead devices 3003 * before issuing I/O to them. 3004 / 3005* return (vd->vdev_state < VDEV_STATE_DEGRADED \|\| 3006 vd->vdev_ops == &vdev_hole_ops \|\| 3007 vd->vdev_ops == &vdev_missing_ops); 3008} 3009 3010boolean_t 3011vdev_readable(vdev_t vd) 3012{ 3013* return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 3014} 3015 3016boolean_t 3017vdev_writeable(vdev_t vd) 3018{ 3019* return (!vdev_is_dead(vd) && !vd->vdev_cant_write && 3020 vdev_is_concrete(vd)); 3021} 3022 3023boolean_t 3024vdev_allocatable(vdev_t vd) 3025{ 3026* uint64_t state = vd->vdev_state; 3027 3028 /* 3029 * We currently allow allocations from vdevs which may be in the 3030 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device 3031 * fails to reopen then we'll catch it later when we're holding 3032 * the proper locks. Note that we have to get the vdev state 3033 * in a local variable because although it changes atomically, 3034 * we're asking two separate questions about it. 3035 / 3036* return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 3037 !vd->vdev_cant_write && vdev_is_concrete(vd) && 3038 vd->vdev_mg->mg_initialized); 3039} 3040 3041boolean_t 3042vdev_accessible(vdev_t vd, zio_t zio) 3043{ 3044 ASSERT(zio->io_vd == vd); 3045 3046 if (vdev_is_dead(vd) \|\| vd->vdev_remove_wanted) 3047 return (B_FALSE); 3048 3049 if (zio->io_type == ZIO_TYPE_READ) 3050 return (!vd->vdev_cant_read); 3051 3052 if (zio->io_type == ZIO_TYPE_WRITE) 3053 return (!vd->vdev_cant_write); 3054 3055 return (B_TRUE); 3056} 3057 3058/* 3059 * Get statistics for the given vdev. 3060 / 3061void 3062vdev_get_stats(vdev_t vd, vdev_stat_t vs) 3063{ 3064* spa_t spa = vd->vdev_spa; 3065* vdev_t rvd = spa->spa_root_vdev; 3066* vdev_t tvd = vd->vdev_top; 3067* 3068 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 3069 3070 mutex_enter(&vd->vdev_stat_lock); 3071 bcopy(&vd->vdev_stat, vs, sizeof (vs)); 3072* vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 3073 vs->vs_state = vd->vdev_state; 3074 vs->vs_rsize = vdev_get_min_asize(vd); 3075 if (vd->vdev_ops->vdev_op_leaf) 3076 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 3077 /* 3078 * Report expandable space on top-level, non-auxillary devices only. 3079 * The expandable space is reported in terms of metaslab sized units 3080 * since that determines how much space the pool can expand. 3081 / 3082* if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) { 3083 vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize - 3084 spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift); 3085 } 3086 vs->vs_configured_ashift = vd->vdev_top != NULL 3087 ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; 3088 vs->vs_logical_ashift = vd->vdev_logical_ashift; 3089 vs->vs_physical_ashift = vd->vdev_physical_ashift; 3090 if (vd->vdev_aux == NULL && vd == vd->vdev_top && 3091 vdev_is_concrete(vd)) { 3092 vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; 3093 } 3094 3095 /* 3096 * If we're getting stats on the root vdev, aggregate the I/O counts 3097 * over all top-level vdevs (i.e. the direct children of the root). 3098 / 3099* if (vd == rvd) { 3100 for (int c = 0; c < rvd->vdev_children; c++) { 3101 vdev_t cvd = rvd->vdev_child[c]; 3102* vdev_stat_t cvs = &cvd->vdev_stat; 3103* 3104 for (int t = 0; t < ZIO_TYPES; t++) { 3105 vs->vs_ops[t] += cvs->vs_ops[t]; 3106 vs->vs_bytes[t] += cvs->vs_bytes[t]; 3107 } 3108 cvs->vs_scan_removing = cvd->vdev_removing; 3109 } 3110 } 3111 mutex_exit(&vd->vdev_stat_lock); 3112} 3113 3114void 3115vdev_clear_stats(vdev_t vd) 3116{ 3117* mutex_enter(&vd->vdev_stat_lock); 3118 vd->vdev_stat.vs_space = 0; 3119 vd->vdev_stat.vs_dspace = 0; 3120 vd->vdev_stat.vs_alloc = 0; 3121 mutex_exit(&vd->vdev_stat_lock); 3122} 3123 3124void 3125vdev_scan_stat_init(vdev_t vd) 3126{ 3127* vdev_stat_t vs = &vd->vdev_stat; 3128* 3129 for (int c = 0; c < vd->vdev_children; c++) 3130 vdev_scan_stat_init(vd->vdev_child[c]); 3131 3132 mutex_enter(&vd->vdev_stat_lock); 3133 vs->vs_scan_processed = 0; 3134 mutex_exit(&vd->vdev_stat_lock); 3135} 3136 3137void 3138vdev_stat_update(zio_t zio, uint64_t psize) 3139{ 3140* spa_t spa = zio->io_spa; 3141* vdev_t rvd = spa->spa_root_vdev; 3142* vdev_t vd = zio->io_vd ? zio->io_vd : rvd; 3143* vdev_t pvd; 3144* uint64_t txg = zio->io_txg; 3145 vdev_stat_t vs = &vd->vdev_stat; 3146* zio_type_t type = zio->io_type; 3147 int flags = zio->io_flags; 3148 3149 /* 3150 * If this i/o is a gang leader, it didn't do any actual work. 3151 / 3152* if (zio->io_gang_tree) 3153 return; 3154 3155 if (zio->io_error == 0) { 3156 /* 3157 * If this is a root i/o, don't count it -- we've already 3158 * counted the top-level vdevs, and vdev_get_stats() will 3159 * aggregate them when asked. This reduces contention on 3160 * the root vdev_stat_lock and implicitly handles blocks 3161 * that compress away to holes, for which there is no i/o. 3162 * (Holes never create vdev children, so all the counters 3163 * remain zero, which is what we want.) 3164 * 3165 * Note: this only applies to successful i/o (io_error == 0) 3166 * because unlike i/o counts, errors are not additive. 3167 * When reading a ditto block, for example, failure of 3168 * one top-level vdev does not imply a root-level error. 3169 / 3170* if (vd == rvd) 3171 return; 3172 3173 ASSERT(vd == zio->io_vd); 3174 3175 if (flags & ZIO_FLAG_IO_BYPASS) 3176 return; 3177 3178 mutex_enter(&vd->vdev_stat_lock); 3179 3180 if (flags & ZIO_FLAG_IO_REPAIR) { 3181 if (flags & ZIO_FLAG_SCAN_THREAD) { 3182 dsl_scan_phys_t scn_phys = 3183* &spa->spa_dsl_pool->dp_scan->scn_phys; 3184 uint64_t processed = &scn_phys->scn_processed; 3185* 3186 /* XXX cleanup? / 3187* if (vd->vdev_ops->vdev_op_leaf) 3188 atomic_add_64(processed, psize); 3189 vs->vs_scan_processed += psize; 3190 } 3191 3192 if (flags & ZIO_FLAG_SELF_HEAL) 3193 vs->vs_self_healed += psize; 3194 } 3195 3196 vs->vs_ops[type]++; 3197 vs->vs_bytes[type] += psize; 3198 3199 mutex_exit(&vd->vdev_stat_lock); 3200 return; 3201 } 3202 3203 if (flags & ZIO_FLAG_SPECULATIVE) 3204 return; 3205 3206 /* 3207 * If this is an I/O error that is going to be retried, then ignore the 3208 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as 3209 * hard errors, when in reality they can happen for any number of 3210 * innocuous reasons (bus resets, MPxIO link failure, etc). 3211 / 3212* if (zio->io_error == EIO && 3213 !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 3214 return; 3215 3216 /* 3217 * Intent logs writes won't propagate their error to the root 3218 * I/O so don't mark these types of failures as pool-level 3219 * errors. 3220 / 3221* if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 3222 return; 3223 3224 mutex_enter(&vd->vdev_stat_lock); 3225 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { 3226 if (zio->io_error == ECKSUM) 3227 vs->vs_checksum_errors++; 3228 else 3229 vs->vs_read_errors++; 3230 } 3231 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) 3232 vs->vs_write_errors++; 3233 mutex_exit(&vd->vdev_stat_lock); 3234 3235 if (spa->spa_load_state == SPA_LOAD_NONE && 3236 type == ZIO_TYPE_WRITE && txg != 0 && 3237 (!(flags & ZIO_FLAG_IO_REPAIR) \|\| 3238 (flags & ZIO_FLAG_SCAN_THREAD) \|\| 3239 spa->spa_claiming)) { 3240 /* 3241 * This is either a normal write (not a repair), or it's 3242 * a repair induced by the scrub thread, or it's a repair 3243 * made by zil_claim() during spa_load() in the first txg. 3244 * In the normal case, we commit the DTL change in the same 3245 * txg as the block was born. In the scrub-induced repair 3246 * case, we know that scrubs run in first-pass syncing context, 3247 * so we commit the DTL change in spa_syncing_txg(spa). 3248 * In the zil_claim() case, we commit in spa_first_txg(spa). 3249 * 3250 * We currently do not make DTL entries for failed spontaneous 3251 * self-healing writes triggered by normal (non-scrubbing) 3252 * reads, because we have no transactional context in which to 3253 * do so -- and it's not clear that it'd be desirable anyway. 3254 / 3255* if (vd->vdev_ops->vdev_op_leaf) { 3256 uint64_t commit_txg = txg; 3257 if (flags & ZIO_FLAG_SCAN_THREAD) { 3258 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 3259 ASSERT(spa_sync_pass(spa) == 1); 3260 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); 3261 commit_txg = spa_syncing_txg(spa); 3262 } else if (spa->spa_claiming) { 3263 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 3264 commit_txg = spa_first_txg(spa); 3265 } 3266 ASSERT(commit_txg >= spa_syncing_txg(spa)); 3267 if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) 3268 return; 3269 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 3270 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); 3271 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); 3272 } 3273 if (vd != rvd) 3274 vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); 3275 } 3276} 3277 3278/* 3279 * Update the in-core space usage stats for this vdev, its metaslab class, 3280 * and the root vdev. 3281 / 3282void 3283vdev_space_update(vdev_t vd, int64_t alloc_delta, int64_t defer_delta, 3284 int64_t space_delta) 3285{ 3286 int64_t dspace_delta = space_delta; 3287 spa_t spa = vd->vdev_spa; 3288* vdev_t rvd = spa->spa_root_vdev; 3289* metaslab_group_t mg = vd->vdev_mg; 3290* metaslab_class_t mc = mg ? mg->mg_class : NULL; 3291* 3292 ASSERT(vd == vd->vdev_top); 3293 3294 /* 3295 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 3296 * factor. We must calculate this here and not at the root vdev 3297 * because the root vdev's psize-to-asize is simply the max of its 3298 * childrens', thus not accurate enough for us. 3299 / 3300* ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 3301 ASSERT(vd->vdev_deflate_ratio != 0 \|\| vd->vdev_isl2cache); 3302 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 3303 vd->vdev_deflate_ratio; 3304 3305 mutex_enter(&vd->vdev_stat_lock); 3306 vd->vdev_stat.vs_alloc += alloc_delta; 3307 vd->vdev_stat.vs_space += space_delta; 3308 vd->vdev_stat.vs_dspace += dspace_delta; 3309 mutex_exit(&vd->vdev_stat_lock); 3310 3311 if (mc == spa_normal_class(spa)) { 3312 mutex_enter(&rvd->vdev_stat_lock); 3313 rvd->vdev_stat.vs_alloc += alloc_delta; 3314 rvd->vdev_stat.vs_space += space_delta; 3315 rvd->vdev_stat.vs_dspace += dspace_delta; 3316 mutex_exit(&rvd->vdev_stat_lock); 3317 } 3318 3319 if (mc != NULL) { 3320 ASSERT(rvd == vd->vdev_parent); 3321 ASSERT(vd->vdev_ms_count != 0); 3322 3323 metaslab_class_space_update(mc, 3324 alloc_delta, defer_delta, space_delta, dspace_delta); 3325 } 3326} 3327 3328/* 3329 * Mark a top-level vdev's config as dirty, placing it on the dirty list 3330 * so that it will be written out next time the vdev configuration is synced. 3331 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 3332 / 3333void 3334vdev_config_dirty(vdev_t vd) 3335{ 3336 spa_t spa = vd->vdev_spa; 3337* vdev_t rvd = spa->spa_root_vdev; 3338* int c; 3339 3340 ASSERT(spa_writeable(spa)); 3341 3342 /* 3343 * If this is an aux vdev (as with l2cache and spare devices), then we 3344 * update the vdev config manually and set the sync flag. 3345 / 3346* if (vd->vdev_aux != NULL) { 3347 spa_aux_vdev_t sav = vd->vdev_aux; 3348* nvlist_t *aux; 3349* uint_t naux; 3350 3351 for (c = 0; c < sav->sav_count; c++) { 3352 if (sav->sav_vdevs[c] == vd) 3353 break; 3354 } 3355 3356 if (c == sav->sav_count) { 3357 /* 3358 * We're being removed. There's nothing more to do. 3359 / 3360* ASSERT(sav->sav_sync == B_TRUE); 3361 return; 3362 } 3363 3364 sav->sav_sync = B_TRUE; 3365 3366 if (nvlist_lookup_nvlist_array(sav->sav_config, 3367 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { 3368 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 3369 ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); 3370 } 3371 3372 ASSERT(c < naux); 3373 3374 /* 3375 * Setting the nvlist in the middle if the array is a little 3376 * sketchy, but it will work. 3377 / 3378* nvlist_free(aux[c]); 3379 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); 3380 3381 return; 3382 } 3383 3384 /* 3385 * The dirty list is protected by the SCL_CONFIG lock. The caller 3386 * must either hold SCL_CONFIG as writer, or must be the sync thread 3387 * (which holds SCL_CONFIG as reader). There's only one sync thread, 3388 * so this is sufficient to ensure mutual exclusion. 3389 / 3390* ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) \|\| 3391 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3392 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3393 3394 if (vd == rvd) { 3395 for (c = 0; c < rvd->vdev_children; c++) 3396 vdev_config_dirty(rvd->vdev_child[c]); 3397 } else { 3398 ASSERT(vd == vd->vdev_top); 3399 3400 if (!list_link_active(&vd->vdev_config_dirty_node) && 3401 vdev_is_concrete(vd)) { 3402 list_insert_head(&spa->spa_config_dirty_list, vd); 3403 } 3404 } 3405} 3406 3407void 3408vdev_config_clean(vdev_t vd) 3409{ 3410* spa_t spa = vd->vdev_spa; 3411* 3412 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) \|\| 3413 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3414 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3415 3416 ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 3417 list_remove(&spa->spa_config_dirty_list, vd); 3418} 3419 3420/* 3421 * Mark a top-level vdev's state as dirty, so that the next pass of 3422 * spa_sync() can convert this into vdev_config_dirty(). We distinguish 3423 * the state changes from larger config changes because they require 3424 * much less locking, and are often needed for administrative actions. 3425 / 3426void 3427vdev_state_dirty(vdev_t vd) 3428{ 3429 spa_t spa = vd->vdev_spa; 3430* 3431 ASSERT(spa_writeable(spa)); 3432 ASSERT(vd == vd->vdev_top); 3433 3434 /* 3435 * The state list is protected by the SCL_STATE lock. The caller 3436 * must either hold SCL_STATE as writer, or must be the sync thread 3437 * (which holds SCL_STATE as reader). There's only one sync thread, 3438 * so this is sufficient to ensure mutual exclusion. 3439 / 3440* ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) \|\| 3441 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3442 spa_config_held(spa, SCL_STATE, RW_READER))); 3443 3444 if (!list_link_active(&vd->vdev_state_dirty_node) && 3445 vdev_is_concrete(vd)) 3446 list_insert_head(&spa->spa_state_dirty_list, vd); 3447} 3448 3449void 3450vdev_state_clean(vdev_t vd) 3451{ 3452* spa_t spa = vd->vdev_spa; 3453* 3454 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) \|\| 3455 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3456 spa_config_held(spa, SCL_STATE, RW_READER))); 3457 3458 ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 3459 list_remove(&spa->spa_state_dirty_list, vd); 3460} 3461 3462/* 3463 * Propagate vdev state up from children to parent. 3464 / 3465void 3466vdev_propagate_state(vdev_t vd) 3467{ 3468 spa_t spa = vd->vdev_spa; 3469* vdev_t rvd = spa->spa_root_vdev; 3470* int degraded = 0, faulted = 0; 3471 int corrupted = 0; 3472 vdev_t child; 3473* 3474 if (vd->vdev_children > 0) { 3475 for (int c = 0; c < vd->vdev_children; c++) { 3476 child = vd->vdev_child[c]; 3477 3478 /* 3479 * Don't factor holes or indirect vdevs into the 3480 * decision. 3481 / 3482* if (!vdev_is_concrete(child)) 3483 continue; 3484 3485 if (!vdev_readable(child) \|\| 3486 (!vdev_writeable(child) && spa_writeable(spa))) { 3487 /* 3488 * Root special: if there is a top-level log 3489 * device, treat the root vdev as if it were 3490 * degraded. 3491 / 3492* if (child->vdev_islog && vd == rvd) 3493 degraded++; 3494 else 3495 faulted++; 3496 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 3497 degraded++; 3498 } 3499 3500 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 3501 corrupted++; 3502 } 3503 3504 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 3505 3506 /* 3507 * Root special: if there is a top-level vdev that cannot be 3508 * opened due to corrupted metadata, then propagate the root 3509 * vdev's aux state as 'corrupt' rather than 'insufficient 3510 * replicas'. 3511 / 3512* if (corrupted && vd == rvd && 3513 rvd->vdev_state == VDEV_STATE_CANT_OPEN) 3514 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 3515 VDEV_AUX_CORRUPT_DATA); 3516 } 3517 3518 if (vd->vdev_parent) 3519 vdev_propagate_state(vd->vdev_parent); 3520} 3521 3522/* 3523 * Set a vdev's state. If this is during an open, we don't update the parent 3524 * state, because we're in the process of opening children depth-first. 3525 * Otherwise, we propagate the change to the parent. 3526 * 3527 * If this routine places a device in a faulted state, an appropriate ereport is 3528 * generated. 3529 / 3530void 3531vdev_set_state(vdev_t vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 3532{ 3533 uint64_t save_state; 3534 spa_t spa = vd->vdev_spa; 3535* 3536 if (state == vd->vdev_state) { 3537 vd->vdev_stat.vs_aux = aux; 3538 return; 3539 } 3540 3541 save_state = vd->vdev_state; 3542 3543 vd->vdev_state = state; 3544 vd->vdev_stat.vs_aux = aux; 3545 3546 /* 3547 * If we are setting the vdev state to anything but an open state, then 3548 * always close the underlying device unless the device has requested 3549 * a delayed close (i.e. we're about to remove or fault the device). 3550 * Otherwise, we keep accessible but invalid devices open forever. 3551 * We don't call vdev_close() itself, because that implies some extra 3552 * checks (offline, etc) that we don't want here. This is limited to 3553 * leaf devices, because otherwise closing the device will affect other 3554 * children. 3555 / 3556* if (!vd->vdev_delayed_close && vdev_is_dead(vd) && 3557 vd->vdev_ops->vdev_op_leaf) 3558 vd->vdev_ops->vdev_op_close(vd); 3559 3560 if (vd->vdev_removed && 3561 state == VDEV_STATE_CANT_OPEN && 3562 (aux == VDEV_AUX_OPEN_FAILED \|\| vd->vdev_checkremove)) { 3563 /* 3564 * If the previous state is set to VDEV_STATE_REMOVED, then this 3565 * device was previously marked removed and someone attempted to 3566 * reopen it. If this failed due to a nonexistent device, then 3567 * keep the device in the REMOVED state. We also let this be if 3568 * it is one of our special test online cases, which is only 3569 * attempting to online the device and shouldn't generate an FMA 3570 * fault. 3571 / 3572* vd->vdev_state = VDEV_STATE_REMOVED; 3573 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 3574 } else if (state == VDEV_STATE_REMOVED) { 3575 vd->vdev_removed = B_TRUE; 3576 } else if (state == VDEV_STATE_CANT_OPEN) { 3577 /* 3578 * If we fail to open a vdev during an import or recovery, we 3579 * mark it as "not available", which signifies that it was 3580 * never there to begin with. Failure to open such a device 3581 * is not considered an error. 3582 / 3583* if ((spa_load_state(spa) == SPA_LOAD_IMPORT \|\| 3584 spa_load_state(spa) == SPA_LOAD_RECOVER) && 3585 vd->vdev_ops->vdev_op_leaf) 3586 vd->vdev_not_present = 1; 3587 3588 /* 3589 * Post the appropriate ereport. If the 'prevstate' field is 3590 * set to something other than VDEV_STATE_UNKNOWN, it indicates 3591 * that this is part of a vdev_reopen(). In this case, we don't 3592 * want to post the ereport if the device was already in the 3593 * CANT_OPEN state beforehand. 3594 * 3595 * If the 'checkremove' flag is set, then this is an attempt to 3596 * online the device in response to an insertion event. If we 3597 * hit this case, then we have detected an insertion event for a 3598 * faulted or offline device that wasn't in the removed state. 3599 * In this scenario, we don't post an ereport because we are 3600 * about to replace the device, or attempt an online with 3601 * vdev_forcefault, which will generate the fault for us. 3602 / 3603* if ((vd->vdev_prevstate != state \|\| vd->vdev_forcefault) && 3604 !vd->vdev_not_present && !vd->vdev_checkremove && 3605 vd != spa->spa_root_vdev) { 3606 const char class; 3607* 3608 switch (aux) { 3609 case VDEV_AUX_OPEN_FAILED: 3610 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 3611 break; 3612 case VDEV_AUX_CORRUPT_DATA: 3613 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 3614 break; 3615 case VDEV_AUX_NO_REPLICAS: 3616 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 3617 break; 3618 case VDEV_AUX_BAD_GUID_SUM: 3619 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 3620 break; 3621 case VDEV_AUX_TOO_SMALL: 3622 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 3623 break; 3624 case VDEV_AUX_BAD_LABEL: 3625 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 3626 break; 3627 default: 3628 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 3629 } 3630 3631 zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 3632 } 3633 3634 /* Erase any notion of persistent removed state / 3635* vd->vdev_removed = B_FALSE; 3636 } else { 3637 vd->vdev_removed = B_FALSE; 3638 } 3639 3640 /* 3641 * Notify the fmd of the state change. Be verbose and post 3642 * notifications even for stuff that's not important; the fmd agent can 3643 * sort it out. Don't emit state change events for non-leaf vdevs since 3644 * they can't change state on their own. The FMD can check their state 3645 * if it wants to when it sees that a leaf vdev had a state change. 3646 / 3647* if (vd->vdev_ops->vdev_op_leaf) 3648 zfs_post_state_change(spa, vd); 3649 3650 if (!isopen && vd->vdev_parent) 3651 vdev_propagate_state(vd->vdev_parent); 3652} 3653 3654/* 3655 * Check the vdev configuration to ensure that it's capable of supporting 3656 * a root pool. We do not support partial configuration. 3657 * In addition, only a single top-level vdev is allowed. 3658 * 3659 * FreeBSD does not have above limitations. 3660 / 3661boolean_t 3662vdev_is_bootable(vdev_t vd) 3663{ 3664#ifdef illumos 3665 if (!vd->vdev_ops->vdev_op_leaf) { 3666 char vdev_type = vd->vdev_ops->vdev_op_type; 3667* 3668 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 3669 vd->vdev_children > 1) { 3670 return (B_FALSE); 3671 } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 \|\| 3672 strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { 3673 return (B_FALSE); 3674 } 3675 } 3676 3677 for (int c = 0; c < vd->vdev_children; c++) { 3678 if (!vdev_is_bootable(vd->vdev_child[c])) 3679 return (B_FALSE); 3680 } 3681#endif /* illumos / 3682* return (B_TRUE); 3683} 3684 3685boolean_t 3686vdev_is_concrete(vdev_t vd) 3687{ 3688* vdev_ops_t ops = vd->vdev_ops; 3689* if (ops == &vdev_indirect_ops \|\| ops == &vdev_hole_ops \|\| 3690 ops == &vdev_missing_ops \|\| ops == &vdev_root_ops) { 3691 return (B_FALSE); 3692 } else { 3693 return (B_TRUE); 3694 } 3695} 3696 3697/* 3698 * Load the state from the original vdev tree (ovd) which 3699 * we've retrieved from the MOS config object. If the original 3700 * vdev was offline or faulted then we transfer that state to the 3701 * device in the current vdev tree (nvd). 3702 / 3703void 3704vdev_load_log_state(vdev_t nvd, vdev_t ovd) 3705{ 3706* spa_t spa = nvd->vdev_spa; 3707* 3708 ASSERT(nvd->vdev_top->vdev_islog); 3709 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3710 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3711 3712 for (int c = 0; c < nvd->vdev_children; c++) 3713 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3714 3715 if (nvd->vdev_ops->vdev_op_leaf) { 3716 /* 3717 * Restore the persistent vdev state 3718 / 3719* nvd->vdev_offline = ovd->vdev_offline; 3720 nvd->vdev_faulted = ovd->vdev_faulted; 3721 nvd->vdev_degraded = ovd->vdev_degraded; 3722 nvd->vdev_removed = ovd->vdev_removed; 3723 } 3724} 3725 3726/* 3727 * Determine if a log device has valid content. If the vdev was 3728 * removed or faulted in the MOS config then we know that 3729 * the content on the log device has already been written to the pool. 3730 / 3731boolean_t 3732vdev_log_state_valid(vdev_t vd) 3733{ 3734 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3735 !vd->vdev_removed) 3736 return (B_TRUE); 3737 3738 for (int c = 0; c < vd->vdev_children; c++) 3739 if (vdev_log_state_valid(vd->vdev_child[c])) 3740 return (B_TRUE); 3741 3742 return (B_FALSE); 3743} 3744 3745/* 3746 * Expand a vdev if possible. 3747 / 3748void 3749vdev_expand(vdev_t vd, uint64_t txg) 3750{ 3751 ASSERT(vd->vdev_top == vd); 3752 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3753 3754 vdev_set_deflate_ratio(vd); 3755 3756 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && 3757 vdev_is_concrete(vd)) { 3758 VERIFY(vdev_metaslab_init(vd, txg) == 0); 3759 vdev_config_dirty(vd); 3760 } 3761} 3762 3763/* 3764 * Split a vdev. 3765 / 3766void 3767vdev_split(vdev_t vd) 3768{ 3769 vdev_t cvd, pvd = vd->vdev_parent; 3770 3771 vdev_remove_child(pvd, vd); 3772 vdev_compact_children(pvd); 3773 3774 cvd = pvd->vdev_child[0]; 3775 if (pvd->vdev_children == 1) { 3776 vdev_remove_parent(cvd); 3777 cvd->vdev_splitting = B_TRUE; 3778 } 3779 vdev_propagate_state(cvd); 3780} 3781 3782void 3783vdev_deadman(vdev_t vd) 3784{ 3785* for (int c = 0; c < vd->vdev_children; c++) { 3786 vdev_t cvd = vd->vdev_child[c]; 3787* 3788 vdev_deadman(cvd); 3789 } 3790 3791 if (vd->vdev_ops->vdev_op_leaf) { 3792 vdev_queue_t vq = &vd->vdev_queue; 3793* 3794 mutex_enter(&vq->vq_lock); 3795 if (avl_numnodes(&vq->vq_active_tree) > 0) { 3796 spa_t spa = vd->vdev_spa; 3797* zio_t fio; 3798* uint64_t delta; 3799 3800 /* 3801 * Look at the head of all the pending queues, 3802 * if any I/O has been outstanding for longer than 3803 * the spa_deadman_synctime we panic the system. 3804 / 3805* fio = avl_first(&vq->vq_active_tree); 3806 delta = gethrtime() - fio->io_timestamp; 3807 if (delta > spa_deadman_synctime(spa)) {	2488 return (error); 2489 } 2490 space_map_update(vd->vdev_obsolete_sm); 2491 } 2492 2493 return (0); 2494} 2495 2496/* 2497 * The special vdev case is used for hot spares and l2cache devices. Its 2498 * sole purpose it to set the vdev state for the associated vdev. To do this, 2499 * we make sure that we can open the underlying device, then try to read the 2500 * label, and make sure that the label is sane and that it hasn't been 2501 * repurposed to another pool. 2502 / 2503int 2504vdev_validate_aux(vdev_t vd) 2505{ 2506 nvlist_t label; 2507* uint64_t guid, version; 2508 uint64_t state; 2509 2510 if (!vdev_readable(vd)) 2511 return (0); 2512 2513 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { 2514 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2515 VDEV_AUX_CORRUPT_DATA); 2516 return (-1); 2517 } 2518 2519 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 \|\| 2520 !SPA_VERSION_IS_SUPPORTED(version) \|\| 2521 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 \|\| 2522 guid != vd->vdev_guid \|\| 2523 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 2524 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2525 VDEV_AUX_CORRUPT_DATA); 2526 nvlist_free(label); 2527 return (-1); 2528 } 2529 2530 /* 2531 * We don't actually check the pool state here. If it's in fact in 2532 * use by another pool, we update this fact on the fly when requested. 2533 / 2534* nvlist_free(label); 2535 return (0); 2536} 2537 2538/* 2539 * Free the objects used to store this vdev's spacemaps, and the array 2540 * that points to them. 2541 / 2542void 2543vdev_destroy_spacemaps(vdev_t vd, dmu_tx_t tx) 2544{ 2545* if (vd->vdev_ms_array == 0) 2546 return; 2547 2548 objset_t mos = vd->vdev_spa->spa_meta_objset; 2549* uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; 2550 size_t array_bytes = array_count * sizeof (uint64_t); 2551 uint64_t smobj_array = kmem_alloc(array_bytes, KM_SLEEP); 2552* VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, 2553 array_bytes, smobj_array, 0)); 2554 2555 for (uint64_t i = 0; i < array_count; i++) { 2556 uint64_t smobj = smobj_array[i]; 2557 if (smobj == 0) 2558 continue; 2559 2560 space_map_free_obj(mos, smobj, tx); 2561 } 2562 2563 kmem_free(smobj_array, array_bytes); 2564 VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); 2565 vd->vdev_ms_array = 0; 2566} 2567 2568static void 2569vdev_remove_empty(vdev_t vd, uint64_t txg) 2570{ 2571* spa_t spa = vd->vdev_spa; 2572* dmu_tx_t tx; 2573* 2574 ASSERT(vd == vd->vdev_top); 2575 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 2576 2577 if (vd->vdev_ms != NULL) { 2578 metaslab_group_t mg = vd->vdev_mg; 2579* 2580 metaslab_group_histogram_verify(mg); 2581 metaslab_class_histogram_verify(mg->mg_class); 2582 2583 for (int m = 0; m < vd->vdev_ms_count; m++) { 2584 metaslab_t msp = vd->vdev_ms[m]; 2585* 2586 if (msp == NULL \|\| msp->ms_sm == NULL) 2587 continue; 2588 2589 mutex_enter(&msp->ms_lock); 2590 /* 2591 * If the metaslab was not loaded when the vdev 2592 * was removed then the histogram accounting may 2593 * not be accurate. Update the histogram information 2594 * here so that we ensure that the metaslab group 2595 * and metaslab class are up-to-date. 2596 / 2597* metaslab_group_histogram_remove(mg, msp); 2598 2599 VERIFY0(space_map_allocated(msp->ms_sm)); 2600 space_map_close(msp->ms_sm); 2601 msp->ms_sm = NULL; 2602 mutex_exit(&msp->ms_lock); 2603 } 2604 2605 metaslab_group_histogram_verify(mg); 2606 metaslab_class_histogram_verify(mg->mg_class); 2607 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 2608 ASSERT0(mg->mg_histogram[i]); 2609 } 2610 2611 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2612 vdev_destroy_spacemaps(vd, tx); 2613 2614 if (vd->vdev_islog && vd->vdev_top_zap != 0) { 2615 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); 2616 vd->vdev_top_zap = 0; 2617 } 2618 dmu_tx_commit(tx); 2619} 2620 2621void 2622vdev_sync_done(vdev_t vd, uint64_t txg) 2623{ 2624* metaslab_t msp; 2625* boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 2626 2627 ASSERT(vdev_is_concrete(vd)); 2628 2629 while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 2630 metaslab_sync_done(msp, txg); 2631 2632 if (reassess) 2633 metaslab_sync_reassess(vd->vdev_mg); 2634} 2635 2636void 2637vdev_sync(vdev_t vd, uint64_t txg) 2638{ 2639* spa_t spa = vd->vdev_spa; 2640* vdev_t lvd; 2641* metaslab_t msp; 2642* dmu_tx_t tx; 2643* 2644 if (range_tree_space(vd->vdev_obsolete_segments) > 0) { 2645 dmu_tx_t tx; 2646* 2647 ASSERT(vd->vdev_removing \|\| 2648 vd->vdev_ops == &vdev_indirect_ops); 2649 2650 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2651 vdev_indirect_sync_obsolete(vd, tx); 2652 dmu_tx_commit(tx); 2653 2654 /* 2655 * If the vdev is indirect, it can't have dirty 2656 * metaslabs or DTLs. 2657 / 2658* if (vd->vdev_ops == &vdev_indirect_ops) { 2659 ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); 2660 ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); 2661 return; 2662 } 2663 } 2664 2665 ASSERT(vdev_is_concrete(vd)); 2666 2667 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && 2668 !vd->vdev_removing) { 2669 ASSERT(vd == vd->vdev_top); 2670 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 2671 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2672 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 2673 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 2674 ASSERT(vd->vdev_ms_array != 0); 2675 vdev_config_dirty(vd); 2676 dmu_tx_commit(tx); 2677 } 2678 2679 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 2680 metaslab_sync(msp, txg); 2681 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 2682 } 2683 2684 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 2685 vdev_dtl_sync(lvd, txg); 2686 2687 /* 2688 * Remove the metadata associated with this vdev once it's empty. 2689 * Note that this is typically used for log/cache device removal; 2690 * we don't empty toplevel vdevs when removing them. But if 2691 * a toplevel happens to be emptied, this is not harmful. 2692 / 2693* if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) { 2694 vdev_remove_empty(vd, txg); 2695 } 2696 2697 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 2698} 2699 2700uint64_t 2701vdev_psize_to_asize(vdev_t vd, uint64_t psize) 2702{ 2703* return (vd->vdev_ops->vdev_op_asize(vd, psize)); 2704} 2705 2706/* 2707 * Mark the given vdev faulted. A faulted vdev behaves as if the device could 2708 * not be opened, and no I/O is attempted. 2709 / 2710int 2711vdev_fault(spa_t spa, uint64_t guid, vdev_aux_t aux) 2712{ 2713 vdev_t vd, tvd; 2714 2715 spa_vdev_state_enter(spa, SCL_NONE); 2716 2717 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2718 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2719 2720 if (!vd->vdev_ops->vdev_op_leaf) 2721 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2722 2723 tvd = vd->vdev_top; 2724 2725 /* 2726 * We don't directly use the aux state here, but if we do a 2727 * vdev_reopen(), we need this value to be present to remember why we 2728 * were faulted. 2729 / 2730* vd->vdev_label_aux = aux; 2731 2732 /* 2733 * Faulted state takes precedence over degraded. 2734 / 2735* vd->vdev_delayed_close = B_FALSE; 2736 vd->vdev_faulted = 1ULL; 2737 vd->vdev_degraded = 0ULL; 2738 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 2739 2740 /* 2741 * If this device has the only valid copy of the data, then 2742 * back off and simply mark the vdev as degraded instead. 2743 / 2744* if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 2745 vd->vdev_degraded = 1ULL; 2746 vd->vdev_faulted = 0ULL; 2747 2748 /* 2749 * If we reopen the device and it's not dead, only then do we 2750 * mark it degraded. 2751 / 2752* vdev_reopen(tvd); 2753 2754 if (vdev_readable(vd)) 2755 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 2756 } 2757 2758 return (spa_vdev_state_exit(spa, vd, 0)); 2759} 2760 2761/* 2762 * Mark the given vdev degraded. A degraded vdev is purely an indication to the 2763 * user that something is wrong. The vdev continues to operate as normal as far 2764 * as I/O is concerned. 2765 / 2766int 2767vdev_degrade(spa_t spa, uint64_t guid, vdev_aux_t aux) 2768{ 2769 vdev_t vd; 2770* 2771 spa_vdev_state_enter(spa, SCL_NONE); 2772 2773 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2774 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2775 2776 if (!vd->vdev_ops->vdev_op_leaf) 2777 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2778 2779 /* 2780 * If the vdev is already faulted, then don't do anything. 2781 / 2782* if (vd->vdev_faulted \|\| vd->vdev_degraded) 2783 return (spa_vdev_state_exit(spa, NULL, 0)); 2784 2785 vd->vdev_degraded = 1ULL; 2786 if (!vdev_is_dead(vd)) 2787 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 2788 aux); 2789 2790 return (spa_vdev_state_exit(spa, vd, 0)); 2791} 2792 2793/* 2794 * Online the given vdev. 2795 * 2796 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached 2797 * spare device should be detached when the device finishes resilvering. 2798 * Second, the online should be treated like a 'test' online case, so no FMA 2799 * events are generated if the device fails to open. 2800 / 2801int 2802vdev_online(spa_t spa, uint64_t guid, uint64_t flags, vdev_state_t newstate) 2803{ 2804* vdev_t vd, tvd, pvd, rvd = spa->spa_root_vdev; 2805 boolean_t wasoffline; 2806 vdev_state_t oldstate; 2807 2808 spa_vdev_state_enter(spa, SCL_NONE); 2809 2810 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2811 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2812 2813 if (!vd->vdev_ops->vdev_op_leaf) 2814 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2815 2816 wasoffline = (vd->vdev_offline \|\| vd->vdev_tmpoffline); 2817 oldstate = vd->vdev_state; 2818 2819 tvd = vd->vdev_top; 2820 vd->vdev_offline = B_FALSE; 2821 vd->vdev_tmpoffline = B_FALSE; 2822 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 2823 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 2824 2825 /* XXX - L2ARC 1.0 does not support expansion / 2826* if (!vd->vdev_aux) { 2827 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2828 pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); 2829 } 2830 2831 vdev_reopen(tvd); 2832 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 2833 2834 if (!vd->vdev_aux) { 2835 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2836 pvd->vdev_expanding = B_FALSE; 2837 } 2838 2839 if (newstate) 2840 newstate = vd->vdev_state; 2841* if ((flags & ZFS_ONLINE_UNSPARE) && 2842 !vdev_is_dead(vd) && vd->vdev_parent && 2843 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2844 vd->vdev_parent->vdev_child[0] == vd) 2845 vd->vdev_unspare = B_TRUE; 2846 2847 if ((flags & ZFS_ONLINE_EXPAND) \|\| spa->spa_autoexpand) { 2848 2849 /* XXX - L2ARC 1.0 does not support expansion / 2850* if (vd->vdev_aux) 2851 return (spa_vdev_state_exit(spa, vd, ENOTSUP)); 2852 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2853 } 2854 2855 if (wasoffline \|\| 2856 (oldstate < VDEV_STATE_DEGRADED && 2857 vd->vdev_state >= VDEV_STATE_DEGRADED)) 2858 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); 2859 2860 return (spa_vdev_state_exit(spa, vd, 0)); 2861} 2862 2863static int 2864vdev_offline_locked(spa_t spa, uint64_t guid, uint64_t flags) 2865{ 2866* vdev_t vd, tvd; 2867 int error = 0; 2868 uint64_t generation; 2869 metaslab_group_t mg; 2870* 2871top: 2872 spa_vdev_state_enter(spa, SCL_ALLOC); 2873 2874 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2875 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2876 2877 if (!vd->vdev_ops->vdev_op_leaf) 2878 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2879 2880 tvd = vd->vdev_top; 2881 mg = tvd->vdev_mg; 2882 generation = spa->spa_config_generation + 1; 2883 2884 /* 2885 * If the device isn't already offline, try to offline it. 2886 / 2887* if (!vd->vdev_offline) { 2888 /* 2889 * If this device has the only valid copy of some data, 2890 * don't allow it to be offlined. Log devices are always 2891 * expendable. 2892 / 2893* if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2894 vdev_dtl_required(vd)) 2895 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2896 2897 /* 2898 * If the top-level is a slog and it has had allocations 2899 * then proceed. We check that the vdev's metaslab group 2900 * is not NULL since it's possible that we may have just 2901 * added this vdev but not yet initialized its metaslabs. 2902 / 2903* if (tvd->vdev_islog && mg != NULL) { 2904 /* 2905 * Prevent any future allocations. 2906 / 2907* metaslab_group_passivate(mg); 2908 (void) spa_vdev_state_exit(spa, vd, 0); 2909 2910 error = spa_reset_logs(spa); 2911 2912 spa_vdev_state_enter(spa, SCL_ALLOC); 2913 2914 /* 2915 * Check to see if the config has changed. 2916 / 2917* if (error \|\| generation != spa->spa_config_generation) { 2918 metaslab_group_activate(mg); 2919 if (error) 2920 return (spa_vdev_state_exit(spa, 2921 vd, error)); 2922 (void) spa_vdev_state_exit(spa, vd, 0); 2923 goto top; 2924 } 2925 ASSERT0(tvd->vdev_stat.vs_alloc); 2926 } 2927 2928 /* 2929 * Offline this device and reopen its top-level vdev. 2930 * If the top-level vdev is a log device then just offline 2931 * it. Otherwise, if this action results in the top-level 2932 * vdev becoming unusable, undo it and fail the request. 2933 / 2934* vd->vdev_offline = B_TRUE; 2935 vdev_reopen(tvd); 2936 2937 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2938 vdev_is_dead(tvd)) { 2939 vd->vdev_offline = B_FALSE; 2940 vdev_reopen(tvd); 2941 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2942 } 2943 2944 /* 2945 * Add the device back into the metaslab rotor so that 2946 * once we online the device it's open for business. 2947 / 2948* if (tvd->vdev_islog && mg != NULL) 2949 metaslab_group_activate(mg); 2950 } 2951 2952 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 2953 2954 return (spa_vdev_state_exit(spa, vd, 0)); 2955} 2956 2957int 2958vdev_offline(spa_t spa, uint64_t guid, uint64_t flags) 2959{ 2960* int error; 2961 2962 mutex_enter(&spa->spa_vdev_top_lock); 2963 error = vdev_offline_locked(spa, guid, flags); 2964 mutex_exit(&spa->spa_vdev_top_lock); 2965 2966 return (error); 2967} 2968 2969/* 2970 * Clear the error counts associated with this vdev. Unlike vdev_online() and 2971 * vdev_offline(), we assume the spa config is locked. We also clear all 2972 * children. If 'vd' is NULL, then the user wants to clear all vdevs. 2973 / 2974void 2975vdev_clear(spa_t spa, vdev_t vd) 2976{ 2977* vdev_t rvd = spa->spa_root_vdev; 2978* 2979 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2980 2981 if (vd == NULL) 2982 vd = rvd; 2983 2984 vd->vdev_stat.vs_read_errors = 0; 2985 vd->vdev_stat.vs_write_errors = 0; 2986 vd->vdev_stat.vs_checksum_errors = 0; 2987 2988 for (int c = 0; c < vd->vdev_children; c++) 2989 vdev_clear(spa, vd->vdev_child[c]); 2990 2991 if (vd == rvd) { 2992 for (int c = 0; c < spa->spa_l2cache.sav_count; c++) 2993 vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); 2994 2995 for (int c = 0; c < spa->spa_spares.sav_count; c++) 2996 vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); 2997 } 2998 2999 /* 3000 * It makes no sense to "clear" an indirect vdev. 3001 / 3002* if (!vdev_is_concrete(vd)) 3003 return; 3004 3005 /* 3006 * If we're in the FAULTED state or have experienced failed I/O, then 3007 * clear the persistent state and attempt to reopen the device. We 3008 * also mark the vdev config dirty, so that the new faulted state is 3009 * written out to disk. 3010 / 3011* if (vd->vdev_faulted \|\| vd->vdev_degraded \|\| 3012 !vdev_readable(vd) \|\| !vdev_writeable(vd)) { 3013 3014 /* 3015 * When reopening in reponse to a clear event, it may be due to 3016 * a fmadm repair request. In this case, if the device is 3017 * still broken, we want to still post the ereport again. 3018 / 3019* vd->vdev_forcefault = B_TRUE; 3020 3021 vd->vdev_faulted = vd->vdev_degraded = 0ULL; 3022 vd->vdev_cant_read = B_FALSE; 3023 vd->vdev_cant_write = B_FALSE; 3024 3025 vdev_reopen(vd == rvd ? rvd : vd->vdev_top); 3026 3027 vd->vdev_forcefault = B_FALSE; 3028 3029 if (vd != rvd && vdev_writeable(vd->vdev_top)) 3030 vdev_state_dirty(vd->vdev_top); 3031 3032 if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) 3033 spa_async_request(spa, SPA_ASYNC_RESILVER); 3034 3035 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); 3036 } 3037 3038 /* 3039 * When clearing a FMA-diagnosed fault, we always want to 3040 * unspare the device, as we assume that the original spare was 3041 * done in response to the FMA fault. 3042 / 3043* if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && 3044 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 3045 vd->vdev_parent->vdev_child[0] == vd) 3046 vd->vdev_unspare = B_TRUE; 3047} 3048 3049boolean_t 3050vdev_is_dead(vdev_t vd) 3051{ 3052* /* 3053 * Holes and missing devices are always considered "dead". 3054 * This simplifies the code since we don't have to check for 3055 * these types of devices in the various code paths. 3056 * Instead we rely on the fact that we skip over dead devices 3057 * before issuing I/O to them. 3058 / 3059* return (vd->vdev_state < VDEV_STATE_DEGRADED \|\| 3060 vd->vdev_ops == &vdev_hole_ops \|\| 3061 vd->vdev_ops == &vdev_missing_ops); 3062} 3063 3064boolean_t 3065vdev_readable(vdev_t vd) 3066{ 3067* return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 3068} 3069 3070boolean_t 3071vdev_writeable(vdev_t vd) 3072{ 3073* return (!vdev_is_dead(vd) && !vd->vdev_cant_write && 3074 vdev_is_concrete(vd)); 3075} 3076 3077boolean_t 3078vdev_allocatable(vdev_t vd) 3079{ 3080* uint64_t state = vd->vdev_state; 3081 3082 /* 3083 * We currently allow allocations from vdevs which may be in the 3084 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device 3085 * fails to reopen then we'll catch it later when we're holding 3086 * the proper locks. Note that we have to get the vdev state 3087 * in a local variable because although it changes atomically, 3088 * we're asking two separate questions about it. 3089 / 3090* return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 3091 !vd->vdev_cant_write && vdev_is_concrete(vd) && 3092 vd->vdev_mg->mg_initialized); 3093} 3094 3095boolean_t 3096vdev_accessible(vdev_t vd, zio_t zio) 3097{ 3098 ASSERT(zio->io_vd == vd); 3099 3100 if (vdev_is_dead(vd) \|\| vd->vdev_remove_wanted) 3101 return (B_FALSE); 3102 3103 if (zio->io_type == ZIO_TYPE_READ) 3104 return (!vd->vdev_cant_read); 3105 3106 if (zio->io_type == ZIO_TYPE_WRITE) 3107 return (!vd->vdev_cant_write); 3108 3109 return (B_TRUE); 3110} 3111 3112/* 3113 * Get statistics for the given vdev. 3114 / 3115void 3116vdev_get_stats(vdev_t vd, vdev_stat_t vs) 3117{ 3118* spa_t spa = vd->vdev_spa; 3119* vdev_t rvd = spa->spa_root_vdev; 3120* vdev_t tvd = vd->vdev_top; 3121* 3122 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 3123 3124 mutex_enter(&vd->vdev_stat_lock); 3125 bcopy(&vd->vdev_stat, vs, sizeof (vs)); 3126* vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 3127 vs->vs_state = vd->vdev_state; 3128 vs->vs_rsize = vdev_get_min_asize(vd); 3129 if (vd->vdev_ops->vdev_op_leaf) 3130 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 3131 /* 3132 * Report expandable space on top-level, non-auxillary devices only. 3133 * The expandable space is reported in terms of metaslab sized units 3134 * since that determines how much space the pool can expand. 3135 / 3136* if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) { 3137 vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize - 3138 spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift); 3139 } 3140 vs->vs_configured_ashift = vd->vdev_top != NULL 3141 ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; 3142 vs->vs_logical_ashift = vd->vdev_logical_ashift; 3143 vs->vs_physical_ashift = vd->vdev_physical_ashift; 3144 if (vd->vdev_aux == NULL && vd == vd->vdev_top && 3145 vdev_is_concrete(vd)) { 3146 vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; 3147 } 3148 3149 /* 3150 * If we're getting stats on the root vdev, aggregate the I/O counts 3151 * over all top-level vdevs (i.e. the direct children of the root). 3152 / 3153* if (vd == rvd) { 3154 for (int c = 0; c < rvd->vdev_children; c++) { 3155 vdev_t cvd = rvd->vdev_child[c]; 3156* vdev_stat_t cvs = &cvd->vdev_stat; 3157* 3158 for (int t = 0; t < ZIO_TYPES; t++) { 3159 vs->vs_ops[t] += cvs->vs_ops[t]; 3160 vs->vs_bytes[t] += cvs->vs_bytes[t]; 3161 } 3162 cvs->vs_scan_removing = cvd->vdev_removing; 3163 } 3164 } 3165 mutex_exit(&vd->vdev_stat_lock); 3166} 3167 3168void 3169vdev_clear_stats(vdev_t vd) 3170{ 3171* mutex_enter(&vd->vdev_stat_lock); 3172 vd->vdev_stat.vs_space = 0; 3173 vd->vdev_stat.vs_dspace = 0; 3174 vd->vdev_stat.vs_alloc = 0; 3175 mutex_exit(&vd->vdev_stat_lock); 3176} 3177 3178void 3179vdev_scan_stat_init(vdev_t vd) 3180{ 3181* vdev_stat_t vs = &vd->vdev_stat; 3182* 3183 for (int c = 0; c < vd->vdev_children; c++) 3184 vdev_scan_stat_init(vd->vdev_child[c]); 3185 3186 mutex_enter(&vd->vdev_stat_lock); 3187 vs->vs_scan_processed = 0; 3188 mutex_exit(&vd->vdev_stat_lock); 3189} 3190 3191void 3192vdev_stat_update(zio_t zio, uint64_t psize) 3193{ 3194* spa_t spa = zio->io_spa; 3195* vdev_t rvd = spa->spa_root_vdev; 3196* vdev_t vd = zio->io_vd ? zio->io_vd : rvd; 3197* vdev_t pvd; 3198* uint64_t txg = zio->io_txg; 3199 vdev_stat_t vs = &vd->vdev_stat; 3200* zio_type_t type = zio->io_type; 3201 int flags = zio->io_flags; 3202 3203 /* 3204 * If this i/o is a gang leader, it didn't do any actual work. 3205 / 3206* if (zio->io_gang_tree) 3207 return; 3208 3209 if (zio->io_error == 0) { 3210 /* 3211 * If this is a root i/o, don't count it -- we've already 3212 * counted the top-level vdevs, and vdev_get_stats() will 3213 * aggregate them when asked. This reduces contention on 3214 * the root vdev_stat_lock and implicitly handles blocks 3215 * that compress away to holes, for which there is no i/o. 3216 * (Holes never create vdev children, so all the counters 3217 * remain zero, which is what we want.) 3218 * 3219 * Note: this only applies to successful i/o (io_error == 0) 3220 * because unlike i/o counts, errors are not additive. 3221 * When reading a ditto block, for example, failure of 3222 * one top-level vdev does not imply a root-level error. 3223 / 3224* if (vd == rvd) 3225 return; 3226 3227 ASSERT(vd == zio->io_vd); 3228 3229 if (flags & ZIO_FLAG_IO_BYPASS) 3230 return; 3231 3232 mutex_enter(&vd->vdev_stat_lock); 3233 3234 if (flags & ZIO_FLAG_IO_REPAIR) { 3235 if (flags & ZIO_FLAG_SCAN_THREAD) { 3236 dsl_scan_phys_t scn_phys = 3237* &spa->spa_dsl_pool->dp_scan->scn_phys; 3238 uint64_t processed = &scn_phys->scn_processed; 3239* 3240 /* XXX cleanup? / 3241* if (vd->vdev_ops->vdev_op_leaf) 3242 atomic_add_64(processed, psize); 3243 vs->vs_scan_processed += psize; 3244 } 3245 3246 if (flags & ZIO_FLAG_SELF_HEAL) 3247 vs->vs_self_healed += psize; 3248 } 3249 3250 vs->vs_ops[type]++; 3251 vs->vs_bytes[type] += psize; 3252 3253 mutex_exit(&vd->vdev_stat_lock); 3254 return; 3255 } 3256 3257 if (flags & ZIO_FLAG_SPECULATIVE) 3258 return; 3259 3260 /* 3261 * If this is an I/O error that is going to be retried, then ignore the 3262 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as 3263 * hard errors, when in reality they can happen for any number of 3264 * innocuous reasons (bus resets, MPxIO link failure, etc). 3265 / 3266* if (zio->io_error == EIO && 3267 !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 3268 return; 3269 3270 /* 3271 * Intent logs writes won't propagate their error to the root 3272 * I/O so don't mark these types of failures as pool-level 3273 * errors. 3274 / 3275* if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 3276 return; 3277 3278 mutex_enter(&vd->vdev_stat_lock); 3279 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { 3280 if (zio->io_error == ECKSUM) 3281 vs->vs_checksum_errors++; 3282 else 3283 vs->vs_read_errors++; 3284 } 3285 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) 3286 vs->vs_write_errors++; 3287 mutex_exit(&vd->vdev_stat_lock); 3288 3289 if (spa->spa_load_state == SPA_LOAD_NONE && 3290 type == ZIO_TYPE_WRITE && txg != 0 && 3291 (!(flags & ZIO_FLAG_IO_REPAIR) \|\| 3292 (flags & ZIO_FLAG_SCAN_THREAD) \|\| 3293 spa->spa_claiming)) { 3294 /* 3295 * This is either a normal write (not a repair), or it's 3296 * a repair induced by the scrub thread, or it's a repair 3297 * made by zil_claim() during spa_load() in the first txg. 3298 * In the normal case, we commit the DTL change in the same 3299 * txg as the block was born. In the scrub-induced repair 3300 * case, we know that scrubs run in first-pass syncing context, 3301 * so we commit the DTL change in spa_syncing_txg(spa). 3302 * In the zil_claim() case, we commit in spa_first_txg(spa). 3303 * 3304 * We currently do not make DTL entries for failed spontaneous 3305 * self-healing writes triggered by normal (non-scrubbing) 3306 * reads, because we have no transactional context in which to 3307 * do so -- and it's not clear that it'd be desirable anyway. 3308 / 3309* if (vd->vdev_ops->vdev_op_leaf) { 3310 uint64_t commit_txg = txg; 3311 if (flags & ZIO_FLAG_SCAN_THREAD) { 3312 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 3313 ASSERT(spa_sync_pass(spa) == 1); 3314 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); 3315 commit_txg = spa_syncing_txg(spa); 3316 } else if (spa->spa_claiming) { 3317 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 3318 commit_txg = spa_first_txg(spa); 3319 } 3320 ASSERT(commit_txg >= spa_syncing_txg(spa)); 3321 if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) 3322 return; 3323 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 3324 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); 3325 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); 3326 } 3327 if (vd != rvd) 3328 vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); 3329 } 3330} 3331 3332/* 3333 * Update the in-core space usage stats for this vdev, its metaslab class, 3334 * and the root vdev. 3335 / 3336void 3337vdev_space_update(vdev_t vd, int64_t alloc_delta, int64_t defer_delta, 3338 int64_t space_delta) 3339{ 3340 int64_t dspace_delta = space_delta; 3341 spa_t spa = vd->vdev_spa; 3342* vdev_t rvd = spa->spa_root_vdev; 3343* metaslab_group_t mg = vd->vdev_mg; 3344* metaslab_class_t mc = mg ? mg->mg_class : NULL; 3345* 3346 ASSERT(vd == vd->vdev_top); 3347 3348 /* 3349 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 3350 * factor. We must calculate this here and not at the root vdev 3351 * because the root vdev's psize-to-asize is simply the max of its 3352 * childrens', thus not accurate enough for us. 3353 / 3354* ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 3355 ASSERT(vd->vdev_deflate_ratio != 0 \|\| vd->vdev_isl2cache); 3356 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 3357 vd->vdev_deflate_ratio; 3358 3359 mutex_enter(&vd->vdev_stat_lock); 3360 vd->vdev_stat.vs_alloc += alloc_delta; 3361 vd->vdev_stat.vs_space += space_delta; 3362 vd->vdev_stat.vs_dspace += dspace_delta; 3363 mutex_exit(&vd->vdev_stat_lock); 3364 3365 if (mc == spa_normal_class(spa)) { 3366 mutex_enter(&rvd->vdev_stat_lock); 3367 rvd->vdev_stat.vs_alloc += alloc_delta; 3368 rvd->vdev_stat.vs_space += space_delta; 3369 rvd->vdev_stat.vs_dspace += dspace_delta; 3370 mutex_exit(&rvd->vdev_stat_lock); 3371 } 3372 3373 if (mc != NULL) { 3374 ASSERT(rvd == vd->vdev_parent); 3375 ASSERT(vd->vdev_ms_count != 0); 3376 3377 metaslab_class_space_update(mc, 3378 alloc_delta, defer_delta, space_delta, dspace_delta); 3379 } 3380} 3381 3382/* 3383 * Mark a top-level vdev's config as dirty, placing it on the dirty list 3384 * so that it will be written out next time the vdev configuration is synced. 3385 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 3386 / 3387void 3388vdev_config_dirty(vdev_t vd) 3389{ 3390 spa_t spa = vd->vdev_spa; 3391* vdev_t rvd = spa->spa_root_vdev; 3392* int c; 3393 3394 ASSERT(spa_writeable(spa)); 3395 3396 /* 3397 * If this is an aux vdev (as with l2cache and spare devices), then we 3398 * update the vdev config manually and set the sync flag. 3399 / 3400* if (vd->vdev_aux != NULL) { 3401 spa_aux_vdev_t sav = vd->vdev_aux; 3402* nvlist_t *aux; 3403* uint_t naux; 3404 3405 for (c = 0; c < sav->sav_count; c++) { 3406 if (sav->sav_vdevs[c] == vd) 3407 break; 3408 } 3409 3410 if (c == sav->sav_count) { 3411 /* 3412 * We're being removed. There's nothing more to do. 3413 / 3414* ASSERT(sav->sav_sync == B_TRUE); 3415 return; 3416 } 3417 3418 sav->sav_sync = B_TRUE; 3419 3420 if (nvlist_lookup_nvlist_array(sav->sav_config, 3421 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { 3422 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 3423 ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); 3424 } 3425 3426 ASSERT(c < naux); 3427 3428 /* 3429 * Setting the nvlist in the middle if the array is a little 3430 * sketchy, but it will work. 3431 / 3432* nvlist_free(aux[c]); 3433 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); 3434 3435 return; 3436 } 3437 3438 /* 3439 * The dirty list is protected by the SCL_CONFIG lock. The caller 3440 * must either hold SCL_CONFIG as writer, or must be the sync thread 3441 * (which holds SCL_CONFIG as reader). There's only one sync thread, 3442 * so this is sufficient to ensure mutual exclusion. 3443 / 3444* ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) \|\| 3445 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3446 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3447 3448 if (vd == rvd) { 3449 for (c = 0; c < rvd->vdev_children; c++) 3450 vdev_config_dirty(rvd->vdev_child[c]); 3451 } else { 3452 ASSERT(vd == vd->vdev_top); 3453 3454 if (!list_link_active(&vd->vdev_config_dirty_node) && 3455 vdev_is_concrete(vd)) { 3456 list_insert_head(&spa->spa_config_dirty_list, vd); 3457 } 3458 } 3459} 3460 3461void 3462vdev_config_clean(vdev_t vd) 3463{ 3464* spa_t spa = vd->vdev_spa; 3465* 3466 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) \|\| 3467 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3468 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3469 3470 ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 3471 list_remove(&spa->spa_config_dirty_list, vd); 3472} 3473 3474/* 3475 * Mark a top-level vdev's state as dirty, so that the next pass of 3476 * spa_sync() can convert this into vdev_config_dirty(). We distinguish 3477 * the state changes from larger config changes because they require 3478 * much less locking, and are often needed for administrative actions. 3479 / 3480void 3481vdev_state_dirty(vdev_t vd) 3482{ 3483 spa_t spa = vd->vdev_spa; 3484* 3485 ASSERT(spa_writeable(spa)); 3486 ASSERT(vd == vd->vdev_top); 3487 3488 /* 3489 * The state list is protected by the SCL_STATE lock. The caller 3490 * must either hold SCL_STATE as writer, or must be the sync thread 3491 * (which holds SCL_STATE as reader). There's only one sync thread, 3492 * so this is sufficient to ensure mutual exclusion. 3493 / 3494* ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) \|\| 3495 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3496 spa_config_held(spa, SCL_STATE, RW_READER))); 3497 3498 if (!list_link_active(&vd->vdev_state_dirty_node) && 3499 vdev_is_concrete(vd)) 3500 list_insert_head(&spa->spa_state_dirty_list, vd); 3501} 3502 3503void 3504vdev_state_clean(vdev_t vd) 3505{ 3506* spa_t spa = vd->vdev_spa; 3507* 3508 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) \|\| 3509 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3510 spa_config_held(spa, SCL_STATE, RW_READER))); 3511 3512 ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 3513 list_remove(&spa->spa_state_dirty_list, vd); 3514} 3515 3516/* 3517 * Propagate vdev state up from children to parent. 3518 / 3519void 3520vdev_propagate_state(vdev_t vd) 3521{ 3522 spa_t spa = vd->vdev_spa; 3523* vdev_t rvd = spa->spa_root_vdev; 3524* int degraded = 0, faulted = 0; 3525 int corrupted = 0; 3526 vdev_t child; 3527* 3528 if (vd->vdev_children > 0) { 3529 for (int c = 0; c < vd->vdev_children; c++) { 3530 child = vd->vdev_child[c]; 3531 3532 /* 3533 * Don't factor holes or indirect vdevs into the 3534 * decision. 3535 / 3536* if (!vdev_is_concrete(child)) 3537 continue; 3538 3539 if (!vdev_readable(child) \|\| 3540 (!vdev_writeable(child) && spa_writeable(spa))) { 3541 /* 3542 * Root special: if there is a top-level log 3543 * device, treat the root vdev as if it were 3544 * degraded. 3545 / 3546* if (child->vdev_islog && vd == rvd) 3547 degraded++; 3548 else 3549 faulted++; 3550 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 3551 degraded++; 3552 } 3553 3554 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 3555 corrupted++; 3556 } 3557 3558 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 3559 3560 /* 3561 * Root special: if there is a top-level vdev that cannot be 3562 * opened due to corrupted metadata, then propagate the root 3563 * vdev's aux state as 'corrupt' rather than 'insufficient 3564 * replicas'. 3565 / 3566* if (corrupted && vd == rvd && 3567 rvd->vdev_state == VDEV_STATE_CANT_OPEN) 3568 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 3569 VDEV_AUX_CORRUPT_DATA); 3570 } 3571 3572 if (vd->vdev_parent) 3573 vdev_propagate_state(vd->vdev_parent); 3574} 3575 3576/* 3577 * Set a vdev's state. If this is during an open, we don't update the parent 3578 * state, because we're in the process of opening children depth-first. 3579 * Otherwise, we propagate the change to the parent. 3580 * 3581 * If this routine places a device in a faulted state, an appropriate ereport is 3582 * generated. 3583 / 3584void 3585vdev_set_state(vdev_t vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 3586{ 3587 uint64_t save_state; 3588 spa_t spa = vd->vdev_spa; 3589* 3590 if (state == vd->vdev_state) { 3591 vd->vdev_stat.vs_aux = aux; 3592 return; 3593 } 3594 3595 save_state = vd->vdev_state; 3596 3597 vd->vdev_state = state; 3598 vd->vdev_stat.vs_aux = aux; 3599 3600 /* 3601 * If we are setting the vdev state to anything but an open state, then 3602 * always close the underlying device unless the device has requested 3603 * a delayed close (i.e. we're about to remove or fault the device). 3604 * Otherwise, we keep accessible but invalid devices open forever. 3605 * We don't call vdev_close() itself, because that implies some extra 3606 * checks (offline, etc) that we don't want here. This is limited to 3607 * leaf devices, because otherwise closing the device will affect other 3608 * children. 3609 / 3610* if (!vd->vdev_delayed_close && vdev_is_dead(vd) && 3611 vd->vdev_ops->vdev_op_leaf) 3612 vd->vdev_ops->vdev_op_close(vd); 3613 3614 if (vd->vdev_removed && 3615 state == VDEV_STATE_CANT_OPEN && 3616 (aux == VDEV_AUX_OPEN_FAILED \|\| vd->vdev_checkremove)) { 3617 /* 3618 * If the previous state is set to VDEV_STATE_REMOVED, then this 3619 * device was previously marked removed and someone attempted to 3620 * reopen it. If this failed due to a nonexistent device, then 3621 * keep the device in the REMOVED state. We also let this be if 3622 * it is one of our special test online cases, which is only 3623 * attempting to online the device and shouldn't generate an FMA 3624 * fault. 3625 / 3626* vd->vdev_state = VDEV_STATE_REMOVED; 3627 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 3628 } else if (state == VDEV_STATE_REMOVED) { 3629 vd->vdev_removed = B_TRUE; 3630 } else if (state == VDEV_STATE_CANT_OPEN) { 3631 /* 3632 * If we fail to open a vdev during an import or recovery, we 3633 * mark it as "not available", which signifies that it was 3634 * never there to begin with. Failure to open such a device 3635 * is not considered an error. 3636 / 3637* if ((spa_load_state(spa) == SPA_LOAD_IMPORT \|\| 3638 spa_load_state(spa) == SPA_LOAD_RECOVER) && 3639 vd->vdev_ops->vdev_op_leaf) 3640 vd->vdev_not_present = 1; 3641 3642 /* 3643 * Post the appropriate ereport. If the 'prevstate' field is 3644 * set to something other than VDEV_STATE_UNKNOWN, it indicates 3645 * that this is part of a vdev_reopen(). In this case, we don't 3646 * want to post the ereport if the device was already in the 3647 * CANT_OPEN state beforehand. 3648 * 3649 * If the 'checkremove' flag is set, then this is an attempt to 3650 * online the device in response to an insertion event. If we 3651 * hit this case, then we have detected an insertion event for a 3652 * faulted or offline device that wasn't in the removed state. 3653 * In this scenario, we don't post an ereport because we are 3654 * about to replace the device, or attempt an online with 3655 * vdev_forcefault, which will generate the fault for us. 3656 / 3657* if ((vd->vdev_prevstate != state \|\| vd->vdev_forcefault) && 3658 !vd->vdev_not_present && !vd->vdev_checkremove && 3659 vd != spa->spa_root_vdev) { 3660 const char class; 3661* 3662 switch (aux) { 3663 case VDEV_AUX_OPEN_FAILED: 3664 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 3665 break; 3666 case VDEV_AUX_CORRUPT_DATA: 3667 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 3668 break; 3669 case VDEV_AUX_NO_REPLICAS: 3670 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 3671 break; 3672 case VDEV_AUX_BAD_GUID_SUM: 3673 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 3674 break; 3675 case VDEV_AUX_TOO_SMALL: 3676 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 3677 break; 3678 case VDEV_AUX_BAD_LABEL: 3679 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 3680 break; 3681 default: 3682 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 3683 } 3684 3685 zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 3686 } 3687 3688 /* Erase any notion of persistent removed state / 3689* vd->vdev_removed = B_FALSE; 3690 } else { 3691 vd->vdev_removed = B_FALSE; 3692 } 3693 3694 /* 3695 * Notify the fmd of the state change. Be verbose and post 3696 * notifications even for stuff that's not important; the fmd agent can 3697 * sort it out. Don't emit state change events for non-leaf vdevs since 3698 * they can't change state on their own. The FMD can check their state 3699 * if it wants to when it sees that a leaf vdev had a state change. 3700 / 3701* if (vd->vdev_ops->vdev_op_leaf) 3702 zfs_post_state_change(spa, vd); 3703 3704 if (!isopen && vd->vdev_parent) 3705 vdev_propagate_state(vd->vdev_parent); 3706} 3707 3708/* 3709 * Check the vdev configuration to ensure that it's capable of supporting 3710 * a root pool. We do not support partial configuration. 3711 * In addition, only a single top-level vdev is allowed. 3712 * 3713 * FreeBSD does not have above limitations. 3714 / 3715boolean_t 3716vdev_is_bootable(vdev_t vd) 3717{ 3718#ifdef illumos 3719 if (!vd->vdev_ops->vdev_op_leaf) { 3720 char vdev_type = vd->vdev_ops->vdev_op_type; 3721* 3722 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 3723 vd->vdev_children > 1) { 3724 return (B_FALSE); 3725 } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 \|\| 3726 strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { 3727 return (B_FALSE); 3728 } 3729 } 3730 3731 for (int c = 0; c < vd->vdev_children; c++) { 3732 if (!vdev_is_bootable(vd->vdev_child[c])) 3733 return (B_FALSE); 3734 } 3735#endif /* illumos / 3736* return (B_TRUE); 3737} 3738 3739boolean_t 3740vdev_is_concrete(vdev_t vd) 3741{ 3742* vdev_ops_t ops = vd->vdev_ops; 3743* if (ops == &vdev_indirect_ops \|\| ops == &vdev_hole_ops \|\| 3744 ops == &vdev_missing_ops \|\| ops == &vdev_root_ops) { 3745 return (B_FALSE); 3746 } else { 3747 return (B_TRUE); 3748 } 3749} 3750 3751/* 3752 * Load the state from the original vdev tree (ovd) which 3753 * we've retrieved from the MOS config object. If the original 3754 * vdev was offline or faulted then we transfer that state to the 3755 * device in the current vdev tree (nvd). 3756 / 3757void 3758vdev_load_log_state(vdev_t nvd, vdev_t ovd) 3759{ 3760* spa_t spa = nvd->vdev_spa; 3761* 3762 ASSERT(nvd->vdev_top->vdev_islog); 3763 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3764 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3765 3766 for (int c = 0; c < nvd->vdev_children; c++) 3767 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3768 3769 if (nvd->vdev_ops->vdev_op_leaf) { 3770 /* 3771 * Restore the persistent vdev state 3772 / 3773* nvd->vdev_offline = ovd->vdev_offline; 3774 nvd->vdev_faulted = ovd->vdev_faulted; 3775 nvd->vdev_degraded = ovd->vdev_degraded; 3776 nvd->vdev_removed = ovd->vdev_removed; 3777 } 3778} 3779 3780/* 3781 * Determine if a log device has valid content. If the vdev was 3782 * removed or faulted in the MOS config then we know that 3783 * the content on the log device has already been written to the pool. 3784 / 3785boolean_t 3786vdev_log_state_valid(vdev_t vd) 3787{ 3788 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3789 !vd->vdev_removed) 3790 return (B_TRUE); 3791 3792 for (int c = 0; c < vd->vdev_children; c++) 3793 if (vdev_log_state_valid(vd->vdev_child[c])) 3794 return (B_TRUE); 3795 3796 return (B_FALSE); 3797} 3798 3799/* 3800 * Expand a vdev if possible. 3801 / 3802void 3803vdev_expand(vdev_t vd, uint64_t txg) 3804{ 3805 ASSERT(vd->vdev_top == vd); 3806 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3807 3808 vdev_set_deflate_ratio(vd); 3809 3810 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && 3811 vdev_is_concrete(vd)) { 3812 VERIFY(vdev_metaslab_init(vd, txg) == 0); 3813 vdev_config_dirty(vd); 3814 } 3815} 3816 3817/* 3818 * Split a vdev. 3819 / 3820void 3821vdev_split(vdev_t vd) 3822{ 3823 vdev_t cvd, pvd = vd->vdev_parent; 3824 3825 vdev_remove_child(pvd, vd); 3826 vdev_compact_children(pvd); 3827 3828 cvd = pvd->vdev_child[0]; 3829 if (pvd->vdev_children == 1) { 3830 vdev_remove_parent(cvd); 3831 cvd->vdev_splitting = B_TRUE; 3832 } 3833 vdev_propagate_state(cvd); 3834} 3835 3836void 3837vdev_deadman(vdev_t vd) 3838{ 3839* for (int c = 0; c < vd->vdev_children; c++) { 3840 vdev_t cvd = vd->vdev_child[c]; 3841* 3842 vdev_deadman(cvd); 3843 } 3844 3845 if (vd->vdev_ops->vdev_op_leaf) { 3846 vdev_queue_t vq = &vd->vdev_queue; 3847* 3848 mutex_enter(&vq->vq_lock); 3849 if (avl_numnodes(&vq->vq_active_tree) > 0) { 3850 spa_t spa = vd->vdev_spa; 3851* zio_t fio; 3852* uint64_t delta; 3853 3854 /* 3855 * Look at the head of all the pending queues, 3856 * if any I/O has been outstanding for longer than 3857 * the spa_deadman_synctime we panic the system. 3858 / 3859* fio = avl_first(&vq->vq_active_tree); 3860 delta = gethrtime() - fio->io_timestamp; 3861 if (delta > spa_deadman_synctime(spa)) {
3808 zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " 3809 "delta %lluns, last io %lluns", 3810 fio->io_timestamp, delta,	3862 vdev_dbgmsg(vd, "SLOW IO: zio timestamp " 3863 "%lluns, delta %lluns, last io %lluns", 3864 fio->io_timestamp, (u_longlong_t)delta,
3811 vq->vq_io_complete_ts); 3812 fm_panic("I/O to pool '%s' appears to be " 3813 "hung on vdev guid %llu at '%s'.", 3814 spa_name(spa), 3815 (long long unsigned int) vd->vdev_guid, 3816 vd->vdev_path); 3817 } 3818 } 3819 mutex_exit(&vq->vq_lock); 3820 } 3821}	3865 vq->vq_io_complete_ts); 3866 fm_panic("I/O to pool '%s' appears to be " 3867 "hung on vdev guid %llu at '%s'.", 3868 spa_name(spa), 3869 (long long unsigned int) vd->vdev_guid, 3870 vd->vdev_path); 3871 } 3872 } 3873 mutex_exit(&vq->vq_lock); 3874 } 3875}