vdev.c revision 253441
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/trim_map.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");

/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
#ifdef _KERNEL
	&vdev_geom_ops,
#else
	&vdev_disk_ops,
#endif
	&vdev_file_ops,
	&vdev_missing_ops,
	&vdev_hole_ops,
	NULL
};


/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children. This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;

	for (int c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}
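/*
 * Illustrative sketch, not part of the original file: a worked example of
 * the P2ROUNDUP arithmetic above. With a top-level ashift of 12 (4K
 * sectors), a 1000-byte psize rounds up to a single 4096-byte allocation
 * unit. The helper below is hypothetical and exists only to show the
 * rounding.
 */
#if 0
static uint64_t
example_asize_roundup(uint64_t psize, uint64_t ashift)
{
	/* P2ROUNDUP(1000, 1ULL << 12) == 4096 */
	return (P2ROUNDUP(psize, 1ULL << ashift));
}
#endif
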
/*
 * Get the minimum allocatable size. We define the allocatable size as
 * the vdev's asize rounded to the nearest metaslab. This allows us to
 * replace or attach devices which don't have the same physical size but
 * can still satisfy the same number of allocations.
 */
uint64_t
vdev_get_min_asize(vdev_t *vd)
{
	vdev_t *pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL (inactive spare or cache) or is the root,
	 * just return our own asize.
	 */
	if (pvd == NULL)
		return (vd->vdev_asize);

	/*
	 * The top-level vdev just returns the allocatable size rounded
	 * to the nearest metaslab.
	 */
	if (vd == vd->vdev_top)
		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));

	/*
	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
	 * so each child must provide at least 1/Nth of its asize.
	 */
	if (pvd->vdev_ops == &vdev_raidz_ops)
		return (pvd->vdev_min_asize / pvd->vdev_children);

	return (pvd->vdev_min_asize);
}

void
vdev_set_min_asize(vdev_t *vd)
{
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_set_min_asize(vd->vdev_child[c]);
}

vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (vdev < rvd->vdev_children) {
		ASSERT(rvd->vdev_child[vdev] != NULL);
		return (rvd->vdev_child[vdev]);
	}

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (int c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}

void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc;

	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	for (int c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (int c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_guid(NULL);
	}

	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	for (int t = 0; t < DTL_TYPES; t++) {
		space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
		    &vd->vdev_dtl_lock);
	}
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}

/*
 * Allocate a new vdev. The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (SET_ERROR(EINVAL));

	if ((ops = vdev_getops(type)) == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (SET_ERROR(EINVAL));

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (SET_ERROR(ENOTSUP));

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
		return (SET_ERROR(ENOTSUP));

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
				return (SET_ERROR(EINVAL));
			/*
			 * Previous versions could only support 1 or 2 parity
			 * devices.
			 */
			if (nparity > 1 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ2)
				return (SET_ERROR(ENOTSUP));
			if (nparity > 2 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ3)
				return (SET_ERROR(ENOTSUP));
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
				return (SET_ERROR(EINVAL));
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
		vd->vdev_fru = spa_strdup(vd->vdev_fru);

	/*
	 * Set the whole_disk property. If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Look for the 'not present' flag. This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * Retrieve the vdev creation time.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	    &vd->vdev_crtxg);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
		    &vd->vdev_removing);
	}

	if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
		    alloctype == VDEV_ALLOC_ADD ||
		    alloctype == VDEV_ALLOC_SPLIT ||
		    alloctype == VDEV_ALLOC_ROOTPOOL);
		vd->vdev_mg = metaslab_group_create(islog ?
		    spa_log_class(spa) : spa_normal_class(spa), vd);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_smo.smo_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}

		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			uint64_t spare = 0;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
			    &spare) == 0 && spare)
				spa_spare_add(vd);
		}

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
		    &vd->vdev_resilvering);

		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context. Local vdevs will
		 * remain in the faulted state.
		 */
		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);

			if (vd->vdev_faulted || vd->vdev_degraded) {
				char *aux;

				vd->vdev_label_aux =
				    VDEV_AUX_ERR_EXCEEDED;
				if (nvlist_lookup_string(nv,
				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
				    strcmp(aux, "external") == 0)
					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
			}
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}

void
vdev_free(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first. This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	 * Free all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
	}

	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);
	if (vd->vdev_fru)
		spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		space_map_unload(&vd->vdev_dtl[t]);
		space_map_destroy(&vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	if (tvd->vdev_mg)
		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_max_asize = cvd->vdev_max_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);

	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
	 */
	if (mvd->vdev_top == mvd) {
		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
		cvd->vdev_orig_guid = cvd->vdev_guid;
		cvd->vdev_guid += guid_delta;
		cvd->vdev_guid_sum += guid_delta;
	}
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}
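
/*
 * Illustrative note, not part of the original file: vdev_metaslab_init()
 * below computes the raidz deflate ratio against a hard-coded 128K
 * blocksize. For a plain (non-raidz) vdev, vdev_psize_to_asize() is the
 * identity for an aligned 128K block, so the ratio works out to
 * (1 << 17) / ((1 << 17) >> SPA_MINBLOCKSHIFT) == 1 << SPA_MINBLOCKSHIFT
 * == 512; raidz vdevs yield a smaller value because parity inflates asize.
 */
#if 0
	/* hypothetical sanity check of the non-raidz case */
	ASSERT3U((1ULL << 17) / ((1ULL << 17) >> SPA_MINBLOCKSHIFT), ==, 512);
#endif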
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	/*
	 * This vdev is not being allocated from yet or is a hole.
	 */
	if (vd->vdev_ms_shift == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	/*
	 * Compute the raidz-deflation ratio. Note, we hard-code
	 * in 128k (1 << 17) because it is the current "typical" blocksize.
	 * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
	 * or we will inconsistently account for existing bp's.
	 */
	vd->vdev_deflate_ratio = (1 << 17) /
	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);

	ASSERT(oldc <= newc);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		space_map_obj_t smo = { 0, 0, 0 };
		if (txg == 0) {
			uint64_t object = 0;
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
			    DMU_READ_PREFETCH);
			if (error)
				return (error);
			if (object != 0) {
				dmu_buf_t *db;
				error = dmu_bonus_hold(mos, object, FTAG, &db);
				if (error)
					return (error);
				ASSERT3U(db->db_size, >=, sizeof (smo));
				bcopy(db->db_data, &smo, sizeof (smo));
				ASSERT3U(smo.smo_object, ==, object);
				dmu_buf_rele(db, FTAG);
			}
		}
		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
	}

	if (txg == 0)
		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

	/*
	 * If the vdev is being removed we don't activate
	 * the metaslabs since we want to ensure that no new
	 * allocations are performed on this device.
	 */
	if (oldc == 0 && !vd->vdev_removing)
		metaslab_group_activate(vd->vdev_mg);

	if (txg == 0)
		spa_config_exit(spa, SCL_ALLOC, FTAG);

	return (0);
}

void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		metaslab_group_passivate(vd->vdev_mg);
		for (m = 0; m < count; m++)
			if (vd->vdev_ms[m] != NULL)
				metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}
}

typedef struct vdev_probe_stats {
	boolean_t	vps_readable;
	boolean_t	vps_writeable;
	int		vps_flags;
} vdev_probe_stats_t;

static void
vdev_probe_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	vdev_probe_stats_t *vps = zio->io_private;

	ASSERT(vd->vdev_probe_zio != NULL);

	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_error == 0)
			vps->vps_readable = 1;
		if (zio->io_error == 0 && spa_writeable(spa)) {
			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
			    zio->io_offset, zio->io_size, zio->io_data,
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
		} else {
			zio_buf_free(zio->io_data, zio->io_size);
		}
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
		if (zio->io_error == 0)
			vps->vps_writeable = 1;
		zio_buf_free(zio->io_data, zio->io_size);
	} else if (zio->io_type == ZIO_TYPE_NULL) {
		zio_t *pio;

		vd->vdev_cant_read |= !vps->vps_readable;
		vd->vdev_cant_write |= !vps->vps_writeable;

		if (vdev_readable(vd) &&
		    (vdev_writeable(vd) || !spa_writeable(spa))) {
			zio->io_error = 0;
		} else {
			ASSERT(zio->io_error != 0);
			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
			    spa, vd, NULL, 0, 0);
			zio->io_error = SET_ERROR(ENXIO);
		}

		mutex_enter(&vd->vdev_probe_lock);
		ASSERT(vd->vdev_probe_zio == zio);
		vd->vdev_probe_zio = NULL;
		mutex_exit(&vd->vdev_probe_lock);

		while ((pio = zio_walk_parents(zio)) != NULL)
			if (!vdev_accessible(vd, pio))
				pio->io_error = SET_ERROR(ENXIO);

		kmem_free(vps, sizeof (*vps));
	}
}

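/*
 * Illustrative sketch, not part of the original file: the synchronous way
 * vdev_probe() below is consumed. vdev_open() issues a probe with no
 * parent zio and waits for it; a zio that has already failed can instead
 * pass itself in and become a parent of any probe already in flight.
 */
#if 0
	/* mirrors the call made from vdev_open() later in this file */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (error = zio_wait(vdev_probe(vd, NULL))) != 0)
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
#endif
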
/*
 * Determine whether this device is accessible.
 *
 * Read and write to several known locations: the pad regions of each
 * vdev label but the first, which we leave alone in case it contains
 * a VTOC.
 */
zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
{
	spa_t *spa = vd->vdev_spa;
	vdev_probe_stats_t *vps = NULL;
	zio_t *pio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/*
	 * Don't probe the probe.
	 */
	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
		return (NULL);

	/*
	 * To prevent 'probe storms' when a device fails, we create
	 * just one probe i/o at a time. All zios that want to probe
	 * this vdev will become parents of the probe io.
	 */
	mutex_enter(&vd->vdev_probe_lock);

	if ((pio = vd->vdev_probe_zio) == NULL) {
		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
		    ZIO_FLAG_TRYHARD;

		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
			/*
			 * vdev_cant_read and vdev_cant_write can only
			 * transition from TRUE to FALSE when we have the
			 * SCL_ZIO lock as writer; otherwise they can only
			 * transition from FALSE to TRUE. This ensures that
			 * any zio looking at these values can assume that
			 * failures persist for the life of the I/O. That's
			 * important because when a device has intermittent
			 * connectivity problems, we want to ensure that
			 * they're ascribed to the device (ENXIO) and not
			 * the zio (EIO).
			 *
			 * Since we hold SCL_ZIO as writer here, clear both
			 * values so the probe can reevaluate from first
			 * principles.
			 */
			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
			vd->vdev_cant_read = B_FALSE;
			vd->vdev_cant_write = B_FALSE;
		}

		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
		    vdev_probe_done, vps,
		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

		/*
		 * We can't change the vdev state in this context, so we
		 * kick off an async task to do it on our behalf.
		 */
		if (zio != NULL) {
			vd->vdev_probe_wanted = B_TRUE;
			spa_async_request(spa, SPA_ASYNC_PROBE);
		}
	}

	if (zio != NULL)
		zio_add_child(zio, pio);

	mutex_exit(&vd->vdev_probe_lock);

	if (vps == NULL) {
		ASSERT(zio != NULL);
		return (NULL);
	}

	for (int l = 1; l < VDEV_LABELS; l++) {
		zio_nowait(zio_read_phys(pio, vd,
		    vdev_label_offset(vd->vdev_psize, l,
		    offsetof(vdev_label_t, vl_pad2)),
		    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
	}

	if (zio == NULL)
		return (pio);

	zio_nowait(pio);
	return (NULL);
}

static void
vdev_open_child(void *arg)
{
	vdev_t *vd = arg;

	vd->vdev_open_thread = curthread;
	vd->vdev_open_error = vdev_open(vd);
	vd->vdev_open_thread = NULL;
}

boolean_t
vdev_uses_zvols(vdev_t *vd)
{
	if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
	    strlen(ZVOL_DIR)) == 0)
		return (B_TRUE);
	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_uses_zvols(vd->vdev_child[c]))
			return (B_TRUE);
	return (B_FALSE);
}

void
vdev_open_children(vdev_t *vd)
{
	taskq_t *tq;
	int children = vd->vdev_children;

	/*
	 * in order to handle pools on top of zvols, do the opens
	 * in a single thread so that the same thread holds the
	 * spa_namespace_lock
	 */
	if (B_TRUE || vdev_uses_zvols(vd)) {
		for (int c = 0; c < children; c++)
			vd->vdev_child[c]->vdev_open_error =
			    vdev_open(vd->vdev_child[c]);
		return;
	}
	tq = taskq_create("vdev_open", children, minclsyspri,
	    children, children, TASKQ_PREPOPULATE);

	for (int c = 0; c < children; c++)
		VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
		    TQ_SLEEP) != 0);

	taskq_destroy(tq);
}

/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	int error;
	uint64_t osize = 0;
	uint64_t max_osize = 0;
	uint64_t asize, max_asize, psize;
	uint64_t ashift = 0;

	ASSERT(vd->vdev_open_thread == curthread ||
	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	vd->vdev_cant_read = B_FALSE;
	vd->vdev_cant_write = B_FALSE;
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	/*
	 * If this vdev is not removed, check its fault status. If it's
	 * faulted, bail out of the open.
	 */
	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (SET_ERROR(ENXIO));
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);

	/*
	 * Reset the vdev_reopening flag so that we actually close
	 * the vdev on error.
	 */
	vd->vdev_reopening = B_FALSE;
	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, NULL, ENXIO);

	if (error) {
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    vd->vdev_stat.vs_aux);
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	/*
	 * Recheck the faulted flag now that we have confirmed that
	 * the vdev is accessible. If we're faulted, bail.
	 */
	if (vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	}

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
	}

	/*
	 * For hole or missing vdevs we just return success.
	 */
	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
		return (0);

	if (vd->vdev_ops->vdev_op_leaf) {
		vd->vdev_notrim = B_FALSE;
		trim_map_create(vd);
	}

	for (int c = 0; c < vd->vdev_children; c++) {
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}
	}

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
		    VDEV_LABEL_END_SIZE);
	} else {
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = 0;
		asize = osize;
		max_asize = max_osize;
	}

	vd->vdev_psize = psize;

	/*
	 * Make sure the allocatable size hasn't shrunk.
	 */
	if (asize < vd->vdev_min_asize) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LABEL);
		return (SET_ERROR(EINVAL));
	}

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For testing purposes, a higher ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_max_asize = max_asize;
		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
	} else {
		/*
		 * Detect if the alignment requirement has increased.
		 * We don't want to make the pool unavailable, just
		 * issue a warning instead.
		 */
		if (ashift > vd->vdev_top->vdev_ashift &&
		    vd->vdev_ops->vdev_op_leaf) {
			cmn_err(CE_WARN,
			    "Disk, '%s', has a block alignment that is "
			    "larger than the pool's alignment\n",
			    vd->vdev_path);
		}
		vd->vdev_max_asize = max_asize;
	}

	/*
	 * If all children are healthy and the asize has increased,
	 * then we've experienced dynamic LUN growth. If automatic
	 * expansion is enabled then use the additional space.
	 */
	if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
	    (vd->vdev_expanding || spa->spa_autoexpand))
		vd->vdev_asize = asize;

	vdev_set_min_asize(vd);

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
		return (error);
	}

	/*
	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
	 * resilver. But don't do this if we are doing a reopen for a scrub,
	 * since this would just restart the scrub we are already doing.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
	    vdev_resilver_needed(vd, NULL, NULL))
		spa_async_request(spa, SPA_ASYNC_RESILVER);

	return (0);
}

/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents. This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * If 'strict' is false ignore the spa guid check. This is necessary because
 * if the machine crashed during a re-guid the new guid might have been written
 * to all of the vdev labels, but not the cached config. The strict check
 * will be performed when the pool is opened again using the mos config.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported. This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd, boolean_t strict)
{
	spa_t *spa = vd->vdev_spa;
	nvlist_t *label;
	uint64_t guid = 0, top_guid;
	uint64_t state;

	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c], strict) != 0)
			return (SET_ERROR(EBADF));

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation. Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
		uint64_t aux_guid = 0;
		nvlist_t *nvl;
		uint64_t txg = spa_last_synced_txg(spa) != 0 ?
		    spa_last_synced_txg(spa) : -1ULL;

		if ((label = vdev_label_read_config(vd, txg)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (0);
		}

		/*
		 * Determine if this vdev has been split off into another
		 * pool. If so, then refuse to open it.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
		    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_SPLIT_POOL);
			nvlist_free(label);
			return (0);
		}

		if (strict && (nvlist_lookup_uint64(label,
		    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
		    guid != spa_guid(spa))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
		    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
		    &aux_guid) != 0)
			aux_guid = 0;

		/*
		 * If this vdev just became a top-level vdev because its
		 * sibling was detached, it will have adopted the parent's
		 * vdev guid -- but the label may or may not be on disk yet.
		 * Fortunately, either version of the label will have the
		 * same top guid, so if we're a top-level vdev, we can
		 * safely compare to that instead.
		 *
		 * If we split this vdev off instead, then we also check the
		 * original pool's guid. We don't want to consider the vdev
		 * corrupt if it is partway through a split operation.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &guid) != 0 ||
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
		    &top_guid) != 0 ||
		    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		nvlist_free(label);

		/*
		 * If this is a verbatim import, no need to check the
		 * state of the pool.
		 */
		if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
		    spa_load_state(spa) == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)
			return (SET_ERROR(EBADF));

		/*
		 * If we were able to open and validate a vdev that was
		 * previously marked permanently unavailable, clear that state
		 * now.
		 */
		if (vd->vdev_not_present)
			vd->vdev_not_present = 0;
	}

	return (0);
}

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *pvd = vd->vdev_parent;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/*
	 * If our parent is reopening, then we are as well, unless we are
	 * going offline.
	 */
	if (pvd != NULL && pvd->vdev_reopening)
		vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);

	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	if (vd->vdev_ops->vdev_op_leaf)
		trim_map_destroy(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

void
vdev_hold(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
		return;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_hold(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_hold(vd);
}

void
vdev_rele(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_rele(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_rele(vd);
}

/*
 * Reopen all interior vdevs and any unopened leaves. We don't actually
 * reopen leaf vdevs which had previously been opened as they might deadlock
 * on the spa_config_lock. Instead we only obtain the leaf's physical size.
 * If the leaf has never been opened then open it, as usual.
 */
void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/* set the reopening flag unless we're taking the vdev offline */
	vd->vdev_reopening = !vd->vdev_offline;
	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		(void) vdev_validate_aux(vd);
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    vd->vdev_aux == &spa->spa_l2cache &&
		    !l2arc_vdev_present(vd))
			l2arc_add_vdev(spa, vd);
	} else {
		(void) vdev_validate(vd, B_TRUE);
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

void
vdev_metaslab_set_size(vdev_t *vd)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
}
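
/*
 * Illustrative arithmetic, not part of the original file: for a 1 TiB
 * vdev, asize / 200 is roughly 5.5e9, whose highest set bit is bit 32, so
 * highbit() returns 33 and each metaslab spans 2^33 bytes (8 GiB), giving
 * 2^40 / 2^33 == 128 metaslabs. The MAX() against SPA_MAXBLOCKSHIFT keeps
 * very small vdevs from using metaslabs smaller than 128K.
 */
#if 0
	/* hypothetical worked example for a 1 TiB vdev */
	uint64_t example_shift = MAX(highbit((1ULL << 40) / 200),
	    SPA_MAXBLOCKSHIFT);		/* == 33 */
#endif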
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(!vd->vdev_ishole);
	ASSERT(ISP2(flags));
	ASSERT(spa_writeable(vd->vdev_spa));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

/*
 * DTLs.
 *
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication. There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in 'maxfaults' or more children;
 * those are the txgs we don't have enough replication to read. For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk. When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
 */
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	space_map_t *sm = &vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
	ASSERT(spa_writeable(vd->vdev_spa));

	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	mutex_enter(sm->sm_lock);
	if (sm->sm_space != 0)
		dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}

boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(sm->sm_lock);
	empty = (sm->sm_space == 0);
	mutex_exit(sm->sm_lock);

	return (empty);
}
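
/*
 * Illustrative sketch, not part of the original file: how the DTL
 * primitives above are typically combined. When a write to a leaf fails
 * in some txg, that txg is added to the leaf's DTL_MISSING; a later
 * resilver derives its txg range from the same map and uses
 * vdev_dtl_contains() to decide what must be repaired.
 */
#if 0
	/* hypothetical: record txg 1234 as missing on a leaf vdev */
	vdev_dtl_dirty(vd, DTL_MISSING, 1234, 1);
	ASSERT(!vdev_dtl_empty(vd, DTL_MISSING));
	ASSERT(vdev_dtl_contains(vd, DTL_MISSING, 1234, 1));
#endif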
/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dtl_reassess(vd->vdev_child[c], txg,
		    scrub_txg, scrub_done);

	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

		mutex_enter(&vd->vdev_dtl_lock);
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started ||
		    (scn && scn->scn_phys.scn_errors == 0))) {
			/*
			 * We completed a scrub up to scrub_txg. If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl. Otherwise, leave the
			 * dtl as-is if there was an error.
			 *
			 * There's a little trick here: to excise the beginning
			 * of the DTL_MISSING map, we put it into a reference
			 * tree and then add a segment with refcnt -1 that
			 * covers the range [0, scrub_txg). This means
			 * that each txg in that range has refcnt -1 or 0.
			 * We then add DTL_SCRUB with a refcnt of 2, so that
			 * entries in the range [0, scrub_txg) will have a
			 * positive refcnt -- either 1 or 2. We then convert
			 * the reference tree into the new DTL_MISSING map.
			 */
			space_map_ref_create(&reftree);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_SCRUB], 2);
			space_map_ref_generate_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_destroy(&reftree);
		}
		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
		if (!vdev_readable(vd))
			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
		else
			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
		mutex_exit(&vd->vdev_dtl_lock);

		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		/* account for child's outage in parent's missing map */
		int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
		if (t == DTL_SCRUB)
			continue;			/* leaf vdevs only */
		if (t == DTL_PARTIAL)
			minref = 1;			/* i.e. non-zero */
		else if (vd->vdev_nparity != 0)
			minref = vd->vdev_nparity + 1;	/* RAID-Z */
		else
			minref = vd->vdev_children;	/* any kind of mirror */
		space_map_ref_create(&reftree);
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			mutex_enter(&cvd->vdev_dtl_lock);
			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
			mutex_exit(&cvd->vdev_dtl_lock);
		}
		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
		space_map_ref_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
}
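
/*
 * Illustrative worked example, not part of the original file, for the
 * reference-tree excision in vdev_dtl_reassess() above. Suppose
 * DTL_MISSING covers txgs [10, 50), the scrub completed through
 * scrub_txg == 30, and DTL_SCRUB recorded [10, 20) as still unrepaired.
 * The reftree then holds +1 over [10, 50), -1 over [0, 30), and +2 over
 * [10, 20). Generating with minref == 1 keeps [10, 20) (refcnt 2) and
 * [30, 50) (refcnt 1) but drops [20, 30) (refcnt 0): exactly the old map
 * minus the region the scrub proved readable.
 */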
static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl_smo;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
		return (error);

	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(db->db_data, smo, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
	    NULL, SM_ALLOC, smo, mos);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}

void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl_smo;
	space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
	objset_t *mos = spa->spa_meta_objset;
	space_map_t smsync;
	kmutex_t smlock;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(mos, smo->smo_object, tx);
			ASSERT0(err);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	bzero(&smlock, sizeof (smlock));
	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_walk(sm, space_map_add, &smsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(smo, mos, tx);
	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
	space_map_vacate(&smsync, NULL, NULL);

	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

/*
 * Determine whether the specified vdev can be offlined/detached/removed
 * without losing data.
 */
boolean_t
vdev_dtl_required(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *tvd = vd->vdev_top;
	uint8_t cant_read = vd->vdev_cant_read;
	boolean_t required;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == spa->spa_root_vdev || vd == tvd)
		return (B_TRUE);

	/*
	 * Temporarily mark the device as unreadable, and then determine
	 * whether this results in any DTL outages in the top-level vdev.
	 * If not, we can safely offline/detach/remove the device.
	 */
	vd->vdev_cant_read = B_TRUE;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
	vd->vdev_cant_read = cant_read;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);

	if (!required && zio_injection_enabled)
		required = !!zio_handle_device_injection(vd, NULL, ECHILD);

	return (required);
}

/*
 * Determine if resilver is needed, and if so the txg range.
 */
boolean_t
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
{
	boolean_t needed = B_FALSE;
	uint64_t thismin = UINT64_MAX;
	uint64_t thismax = 0;

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
		    vdev_writeable(vd)) {
			space_seg_t *ss;

			ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
			thismin = ss->ss_start - 1;
			ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
			thismax = ss->ss_end;
			needed = B_TRUE;
		}
		mutex_exit(&vd->vdev_dtl_lock);
	} else {
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			uint64_t cmin, cmax;

			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
				thismin = MIN(thismin, cmin);
				thismax = MAX(thismax, cmax);
				needed = B_TRUE;
			}
		}
	}

	if (needed && minp) {
		*minp = thismin;
		*maxp = thismax;
	}
	return (needed);
}

void
vdev_load(vdev_t *vd)
{
	/*
	 * Recursively load all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_load(vd->vdev_child[c]);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top && !vd->vdev_ishole &&
	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	    vdev_metaslab_init(vd, 0) != 0))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
}

/*
 * The special vdev case is used for hot spares and l2cache devices. Its
 * sole purpose is to set the vdev state for the associated vdev. To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
1982 */ 1983int 1984vdev_validate_aux(vdev_t *vd) 1985{ 1986 nvlist_t *label; 1987 uint64_t guid, version; 1988 uint64_t state; 1989 1990 if (!vdev_readable(vd)) 1991 return (0); 1992 1993 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { 1994 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1995 VDEV_AUX_CORRUPT_DATA); 1996 return (-1); 1997 } 1998 1999 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 2000 !SPA_VERSION_IS_SUPPORTED(version) || 2001 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 2002 guid != vd->vdev_guid || 2003 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 2004 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2005 VDEV_AUX_CORRUPT_DATA); 2006 nvlist_free(label); 2007 return (-1); 2008 } 2009 2010 /* 2011 * We don't actually check the pool state here. If it's in fact in 2012 * use by another pool, we update this fact on the fly when requested. 2013 */ 2014 nvlist_free(label); 2015 return (0); 2016} 2017 2018void 2019vdev_remove(vdev_t *vd, uint64_t txg) 2020{ 2021 spa_t *spa = vd->vdev_spa; 2022 objset_t *mos = spa->spa_meta_objset; 2023 dmu_tx_t *tx; 2024 2025 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2026 2027 if (vd->vdev_dtl_smo.smo_object) { 2028 ASSERT0(vd->vdev_dtl_smo.smo_alloc); 2029 (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx); 2030 vd->vdev_dtl_smo.smo_object = 0; 2031 } 2032 2033 if (vd->vdev_ms != NULL) { 2034 for (int m = 0; m < vd->vdev_ms_count; m++) { 2035 metaslab_t *msp = vd->vdev_ms[m]; 2036 2037 if (msp == NULL || msp->ms_smo.smo_object == 0) 2038 continue; 2039 2040 ASSERT0(msp->ms_smo.smo_alloc); 2041 (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx); 2042 msp->ms_smo.smo_object = 0; 2043 } 2044 } 2045 2046 if (vd->vdev_ms_array) { 2047 (void) dmu_object_free(mos, vd->vdev_ms_array, tx); 2048 vd->vdev_ms_array = 0; 2049 vd->vdev_ms_shift = 0; 2050 } 2051 dmu_tx_commit(tx); 2052} 2053 2054void 2055vdev_sync_done(vdev_t *vd, uint64_t txg) 2056{ 2057 metaslab_t *msp; 2058 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 2059 2060 ASSERT(!vd->vdev_ishole); 2061 2062 while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 2063 metaslab_sync_done(msp, txg); 2064 2065 if (reassess) 2066 metaslab_sync_reassess(vd->vdev_mg); 2067} 2068 2069void 2070vdev_sync(vdev_t *vd, uint64_t txg) 2071{ 2072 spa_t *spa = vd->vdev_spa; 2073 vdev_t *lvd; 2074 metaslab_t *msp; 2075 dmu_tx_t *tx; 2076 2077 ASSERT(!vd->vdev_ishole); 2078 2079 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { 2080 ASSERT(vd == vd->vdev_top); 2081 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2082 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 2083 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 2084 ASSERT(vd->vdev_ms_array != 0); 2085 vdev_config_dirty(vd); 2086 dmu_tx_commit(tx); 2087 } 2088 2089 /* 2090 * Remove the metadata associated with this vdev once it's empty. 
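 * (That is, vs_alloc has dropped to zero and the vdev is marked
 * vdev_removing, as tested below.)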
2091 */ 2092 if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) 2093 vdev_remove(vd, txg); 2094 2095 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 2096 metaslab_sync(msp, txg); 2097 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 2098 } 2099 2100 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 2101 vdev_dtl_sync(lvd, txg); 2102 2103 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 2104} 2105 2106uint64_t 2107vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 2108{ 2109 return (vd->vdev_ops->vdev_op_asize(vd, psize)); 2110} 2111 2112/* 2113 * Mark the given vdev faulted. A faulted vdev behaves as if the device could 2114 * not be opened, and no I/O is attempted. 2115 */ 2116int 2117vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2118{ 2119 vdev_t *vd, *tvd; 2120 2121 spa_vdev_state_enter(spa, SCL_NONE); 2122 2123 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2124 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2125 2126 if (!vd->vdev_ops->vdev_op_leaf) 2127 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2128 2129 tvd = vd->vdev_top; 2130 2131 /* 2132 * We don't directly use the aux state here, but if we do a 2133 * vdev_reopen(), we need this value to be present to remember why we 2134 * were faulted. 2135 */ 2136 vd->vdev_label_aux = aux; 2137 2138 /* 2139 * Faulted state takes precedence over degraded. 2140 */ 2141 vd->vdev_delayed_close = B_FALSE; 2142 vd->vdev_faulted = 1ULL; 2143 vd->vdev_degraded = 0ULL; 2144 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 2145 2146 /* 2147 * If this device has the only valid copy of the data, then 2148 * back off and simply mark the vdev as degraded instead. 2149 */ 2150 if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 2151 vd->vdev_degraded = 1ULL; 2152 vd->vdev_faulted = 0ULL; 2153 2154 /* 2155 * If we reopen the device and it's not dead, only then do we 2156 * mark it degraded. 2157 */ 2158 vdev_reopen(tvd); 2159 2160 if (vdev_readable(vd)) 2161 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 2162 } 2163 2164 return (spa_vdev_state_exit(spa, vd, 0)); 2165} 2166 2167/* 2168 * Mark the given vdev degraded. A degraded vdev is purely an indication to the 2169 * user that something is wrong. The vdev continues to operate as normal as far 2170 * as I/O is concerned. 2171 */ 2172int 2173vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2174{ 2175 vdev_t *vd; 2176 2177 spa_vdev_state_enter(spa, SCL_NONE); 2178 2179 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2180 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2181 2182 if (!vd->vdev_ops->vdev_op_leaf) 2183 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2184 2185 /* 2186 * If the vdev is already faulted, then don't do anything. 2187 */ 2188 if (vd->vdev_faulted || vd->vdev_degraded) 2189 return (spa_vdev_state_exit(spa, NULL, 0)); 2190 2191 vd->vdev_degraded = 1ULL; 2192 if (!vdev_is_dead(vd)) 2193 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 2194 aux); 2195 2196 return (spa_vdev_state_exit(spa, vd, 0)); 2197} 2198 2199/* 2200 * Online the given vdev. 2201 * 2202 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached 2203 * spare device should be detached when the device finishes resilvering. 2204 * Second, the online should be treated like a 'test' online case, so no FMA 2205 * events are generated if the device fails to open. 
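 *
 * A hypothetical caller (sketch only, not code from this file):
 *
 *	vdev_state_t newstate;
 *
 *	if (vdev_online(spa, guid, ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
 *	    newstate != VDEV_STATE_HEALTHY)
 *		... the device opened but did not come back healthy ...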
2206 */ 2207int 2208vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) 2209{ 2210 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; 2211 2212 spa_vdev_state_enter(spa, SCL_NONE); 2213 2214 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2215 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2216 2217 if (!vd->vdev_ops->vdev_op_leaf) 2218 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2219 2220 tvd = vd->vdev_top; 2221 vd->vdev_offline = B_FALSE; 2222 vd->vdev_tmpoffline = B_FALSE; 2223 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 2224 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 2225 2226 /* XXX - L2ARC 1.0 does not support expansion */ 2227 if (!vd->vdev_aux) { 2228 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2229 pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); 2230 } 2231 2232 vdev_reopen(tvd); 2233 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 2234 2235 if (!vd->vdev_aux) { 2236 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2237 pvd->vdev_expanding = B_FALSE; 2238 } 2239 2240 if (newstate) 2241 *newstate = vd->vdev_state; 2242 if ((flags & ZFS_ONLINE_UNSPARE) && 2243 !vdev_is_dead(vd) && vd->vdev_parent && 2244 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2245 vd->vdev_parent->vdev_child[0] == vd) 2246 vd->vdev_unspare = B_TRUE; 2247 2248 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { 2249 2250 /* XXX - L2ARC 1.0 does not support expansion */ 2251 if (vd->vdev_aux) 2252 return (spa_vdev_state_exit(spa, vd, ENOTSUP)); 2253 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2254 } 2255 return (spa_vdev_state_exit(spa, vd, 0)); 2256} 2257 2258static int 2259vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) 2260{ 2261 vdev_t *vd, *tvd; 2262 int error = 0; 2263 uint64_t generation; 2264 metaslab_group_t *mg; 2265 2266top: 2267 spa_vdev_state_enter(spa, SCL_ALLOC); 2268 2269 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2270 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2271 2272 if (!vd->vdev_ops->vdev_op_leaf) 2273 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2274 2275 tvd = vd->vdev_top; 2276 mg = tvd->vdev_mg; 2277 generation = spa->spa_config_generation + 1; 2278 2279 /* 2280 * If the device isn't already offline, try to offline it. 2281 */ 2282 if (!vd->vdev_offline) { 2283 /* 2284 * If this device has the only valid copy of some data, 2285 * don't allow it to be offlined. Log devices are always 2286 * expendable. 2287 */ 2288 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2289 vdev_dtl_required(vd)) 2290 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2291 2292 /* 2293 * If the top-level is a slog and it has had allocations 2294 * then proceed. We check that the vdev's metaslab group 2295 * is not NULL since it's possible that we may have just 2296 * added this vdev but not yet initialized its metaslabs. 2297 */ 2298 if (tvd->vdev_islog && mg != NULL) { 2299 /* 2300 * Prevent any future allocations. 2301 */ 2302 metaslab_group_passivate(mg); 2303 (void) spa_vdev_state_exit(spa, vd, 0); 2304 2305 error = spa_offline_log(spa); 2306 2307 spa_vdev_state_enter(spa, SCL_ALLOC); 2308 2309 /* 2310 * Check to see if the config has changed. 
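 * If it has (or if offlining the log failed), reactivate the
 * metaslab group and, in the no-error case, retry from the top.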
2311 */ 2312 if (error || generation != spa->spa_config_generation) { 2313 metaslab_group_activate(mg); 2314 if (error) 2315 return (spa_vdev_state_exit(spa, 2316 vd, error)); 2317 (void) spa_vdev_state_exit(spa, vd, 0); 2318 goto top; 2319 } 2320 ASSERT0(tvd->vdev_stat.vs_alloc); 2321 } 2322 2323 /* 2324 * Offline this device and reopen its top-level vdev. 2325 * If the top-level vdev is a log device then just offline 2326 * it. Otherwise, if this action results in the top-level 2327 * vdev becoming unusable, undo it and fail the request. 2328 */ 2329 vd->vdev_offline = B_TRUE; 2330 vdev_reopen(tvd); 2331 2332 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2333 vdev_is_dead(tvd)) { 2334 vd->vdev_offline = B_FALSE; 2335 vdev_reopen(tvd); 2336 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2337 } 2338 2339 /* 2340 * Add the device back into the metaslab rotor so that 2341 * once we online the device it's open for business. 2342 */ 2343 if (tvd->vdev_islog && mg != NULL) 2344 metaslab_group_activate(mg); 2345 } 2346 2347 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 2348 2349 return (spa_vdev_state_exit(spa, vd, 0)); 2350} 2351 2352int 2353vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 2354{ 2355 int error; 2356 2357 mutex_enter(&spa->spa_vdev_top_lock); 2358 error = vdev_offline_locked(spa, guid, flags); 2359 mutex_exit(&spa->spa_vdev_top_lock); 2360 2361 return (error); 2362} 2363 2364/* 2365 * Clear the error counts associated with this vdev. Unlike vdev_online() and 2366 * vdev_offline(), we assume the spa config is locked. We also clear all 2367 * children. If 'vd' is NULL, then the user wants to clear all vdevs. 2368 */ 2369void 2370vdev_clear(spa_t *spa, vdev_t *vd) 2371{ 2372 vdev_t *rvd = spa->spa_root_vdev; 2373 2374 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2375 2376 if (vd == NULL) 2377 vd = rvd; 2378 2379 vd->vdev_stat.vs_read_errors = 0; 2380 vd->vdev_stat.vs_write_errors = 0; 2381 vd->vdev_stat.vs_checksum_errors = 0; 2382 2383 for (int c = 0; c < vd->vdev_children; c++) 2384 vdev_clear(spa, vd->vdev_child[c]); 2385 2386 /* 2387 * If we're in the FAULTED state or have experienced failed I/O, then 2388 * clear the persistent state and attempt to reopen the device. We 2389 * also mark the vdev config dirty, so that the new faulted state is 2390 * written out to disk. 2391 */ 2392 if (vd->vdev_faulted || vd->vdev_degraded || 2393 !vdev_readable(vd) || !vdev_writeable(vd)) { 2394 2395 /* 2396 * When reopening in response to a clear event, it may be due to 2397 * a fmadm repair request. In this case, if the device is 2398 * still broken, we still want to post the ereport again. 2399 */ 2400 vd->vdev_forcefault = B_TRUE; 2401 2402 vd->vdev_faulted = vd->vdev_degraded = 0ULL; 2403 vd->vdev_cant_read = B_FALSE; 2404 vd->vdev_cant_write = B_FALSE; 2405 2406 vdev_reopen(vd == rvd ? rvd : vd->vdev_top); 2407 2408 vd->vdev_forcefault = B_FALSE; 2409 2410 if (vd != rvd && vdev_writeable(vd->vdev_top)) 2411 vdev_state_dirty(vd->vdev_top); 2412 2413 if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) 2414 spa_async_request(spa, SPA_ASYNC_RESILVER); 2415 2416 spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); 2417 } 2418 2419 /* 2420 * When clearing a FMA-diagnosed fault, we always want to 2421 * unspare the device, as we assume that the original spare was 2422 * done in response to the FMA fault.
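 * (The test below matches a healthy leaf that is the first child of
 * a spare group, i.e. the original device the spare replaced.)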
2423 */ 2424 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && 2425 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2426 vd->vdev_parent->vdev_child[0] == vd) 2427 vd->vdev_unspare = B_TRUE; 2428} 2429 2430boolean_t 2431vdev_is_dead(vdev_t *vd) 2432{ 2433 /* 2434 * Holes and missing devices are always considered "dead". 2435 * This simplifies the code since we don't have to check for 2436 * these types of devices in the various code paths. 2437 * Instead we rely on the fact that we skip over dead devices 2438 * before issuing I/O to them. 2439 */ 2440 return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || 2441 vd->vdev_ops == &vdev_missing_ops); 2442} 2443 2444boolean_t 2445vdev_readable(vdev_t *vd) 2446{ 2447 return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 2448} 2449 2450boolean_t 2451vdev_writeable(vdev_t *vd) 2452{ 2453 return (!vdev_is_dead(vd) && !vd->vdev_cant_write); 2454} 2455 2456boolean_t 2457vdev_allocatable(vdev_t *vd) 2458{ 2459 uint64_t state = vd->vdev_state; 2460 2461 /* 2462 * We currently allow allocations from vdevs which may be in the 2463 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device 2464 * fails to reopen then we'll catch it later when we're holding 2465 * the proper locks. Note that we have to get the vdev state 2466 * in a local variable because although it changes atomically, 2467 * we're asking two separate questions about it. 2468 */ 2469 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 2470 !vd->vdev_cant_write && !vd->vdev_ishole); 2471} 2472 2473boolean_t 2474vdev_accessible(vdev_t *vd, zio_t *zio) 2475{ 2476 ASSERT(zio->io_vd == vd); 2477 2478 if (vdev_is_dead(vd) || vd->vdev_remove_wanted) 2479 return (B_FALSE); 2480 2481 if (zio->io_type == ZIO_TYPE_READ) 2482 return (!vd->vdev_cant_read); 2483 2484 if (zio->io_type == ZIO_TYPE_WRITE) 2485 return (!vd->vdev_cant_write); 2486 2487 return (B_TRUE); 2488} 2489 2490/* 2491 * Get statistics for the given vdev. 2492 */ 2493void 2494vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 2495{ 2496 vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 2497 2498 mutex_enter(&vd->vdev_stat_lock); 2499 bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 2500 vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 2501 vs->vs_state = vd->vdev_state; 2502 vs->vs_rsize = vdev_get_min_asize(vd); 2503 if (vd->vdev_ops->vdev_op_leaf) 2504 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 2505 vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; 2506 mutex_exit(&vd->vdev_stat_lock); 2507 2508 /* 2509 * If we're getting stats on the root vdev, aggregate the I/O counts 2510 * over all top-level vdevs (i.e. the direct children of the root). 
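 * We hold the root's vdev_stat_lock while folding each child's
 * vs_ops[] and vs_bytes[] counters into the caller's copy.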
2511 */ 2512 if (vd == rvd) { 2513 for (int c = 0; c < rvd->vdev_children; c++) { 2514 vdev_t *cvd = rvd->vdev_child[c]; 2515 vdev_stat_t *cvs = &cvd->vdev_stat; 2516 2517 mutex_enter(&vd->vdev_stat_lock); 2518 for (int t = 0; t < ZIO_TYPES; t++) { 2519 vs->vs_ops[t] += cvs->vs_ops[t]; 2520 vs->vs_bytes[t] += cvs->vs_bytes[t]; 2521 } 2522 cvs->vs_scan_removing = cvd->vdev_removing; 2523 mutex_exit(&vd->vdev_stat_lock); 2524 } 2525 } 2526} 2527 2528void 2529vdev_clear_stats(vdev_t *vd) 2530{ 2531 mutex_enter(&vd->vdev_stat_lock); 2532 vd->vdev_stat.vs_space = 0; 2533 vd->vdev_stat.vs_dspace = 0; 2534 vd->vdev_stat.vs_alloc = 0; 2535 mutex_exit(&vd->vdev_stat_lock); 2536} 2537 2538void 2539vdev_scan_stat_init(vdev_t *vd) 2540{ 2541 vdev_stat_t *vs = &vd->vdev_stat; 2542 2543 for (int c = 0; c < vd->vdev_children; c++) 2544 vdev_scan_stat_init(vd->vdev_child[c]); 2545 2546 mutex_enter(&vd->vdev_stat_lock); 2547 vs->vs_scan_processed = 0; 2548 mutex_exit(&vd->vdev_stat_lock); 2549} 2550 2551void 2552vdev_stat_update(zio_t *zio, uint64_t psize) 2553{ 2554 spa_t *spa = zio->io_spa; 2555 vdev_t *rvd = spa->spa_root_vdev; 2556 vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; 2557 vdev_t *pvd; 2558 uint64_t txg = zio->io_txg; 2559 vdev_stat_t *vs = &vd->vdev_stat; 2560 zio_type_t type = zio->io_type; 2561 int flags = zio->io_flags; 2562 2563 /* 2564 * If this i/o is a gang leader, it didn't do any actual work. 2565 */ 2566 if (zio->io_gang_tree) 2567 return; 2568 2569 if (zio->io_error == 0) { 2570 /* 2571 * If this is a root i/o, don't count it -- we've already 2572 * counted the top-level vdevs, and vdev_get_stats() will 2573 * aggregate them when asked. This reduces contention on 2574 * the root vdev_stat_lock and implicitly handles blocks 2575 * that compress away to holes, for which there is no i/o. 2576 * (Holes never create vdev children, so all the counters 2577 * remain zero, which is what we want.) 2578 * 2579 * Note: this only applies to successful i/o (io_error == 0) 2580 * because unlike i/o counts, errors are not additive. 2581 * When reading a ditto block, for example, failure of 2582 * one top-level vdev does not imply a root-level error. 2583 */ 2584 if (vd == rvd) 2585 return; 2586 2587 ASSERT(vd == zio->io_vd); 2588 2589 if (flags & ZIO_FLAG_IO_BYPASS) 2590 return; 2591 2592 mutex_enter(&vd->vdev_stat_lock); 2593 2594 if (flags & ZIO_FLAG_IO_REPAIR) { 2595 if (flags & ZIO_FLAG_SCAN_THREAD) { 2596 dsl_scan_phys_t *scn_phys = 2597 &spa->spa_dsl_pool->dp_scan->scn_phys; 2598 uint64_t *processed = &scn_phys->scn_processed; 2599 2600 /* XXX cleanup? */ 2601 if (vd->vdev_ops->vdev_op_leaf) 2602 atomic_add_64(processed, psize); 2603 vs->vs_scan_processed += psize; 2604 } 2605 2606 if (flags & ZIO_FLAG_SELF_HEAL) 2607 vs->vs_self_healed += psize; 2608 } 2609 2610 vs->vs_ops[type]++; 2611 vs->vs_bytes[type] += psize; 2612 2613 mutex_exit(&vd->vdev_stat_lock); 2614 return; 2615 } 2616 2617 if (flags & ZIO_FLAG_SPECULATIVE) 2618 return; 2619 2620 /* 2621 * If this is an I/O error that is going to be retried, then ignore the 2622 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as 2623 * hard errors, when in reality they can happen for any number of 2624 * innocuous reasons (bus resets, MPxIO link failure, etc). 2625 */ 2626 if (zio->io_error == EIO && 2627 !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 2628 return; 2629 2630 /* 2631 * Intent log writes won't propagate their error to the root 2632 * I/O, so don't mark these types of failures as pool-level 2633 * errors.
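 * (Such a zio reaches us with io_vd == NULL and
 * ZIO_FLAG_DONT_PROPAGATE set, which is what the test below keys on.)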
2634 */ 2635 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 2636 return; 2637 2638 mutex_enter(&vd->vdev_stat_lock); 2639 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { 2640 if (zio->io_error == ECKSUM) 2641 vs->vs_checksum_errors++; 2642 else 2643 vs->vs_read_errors++; 2644 } 2645 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) 2646 vs->vs_write_errors++; 2647 mutex_exit(&vd->vdev_stat_lock); 2648 2649 if (type == ZIO_TYPE_WRITE && txg != 0 && 2650 (!(flags & ZIO_FLAG_IO_REPAIR) || 2651 (flags & ZIO_FLAG_SCAN_THREAD) || 2652 spa->spa_claiming)) { 2653 /* 2654 * This is either a normal write (not a repair), or it's 2655 * a repair induced by the scrub thread, or it's a repair 2656 * made by zil_claim() during spa_load() in the first txg. 2657 * In the normal case, we commit the DTL change in the same 2658 * txg as the block was born. In the scrub-induced repair 2659 * case, we know that scrubs run in first-pass syncing context, 2660 * so we commit the DTL change in spa_syncing_txg(spa). 2661 * In the zil_claim() case, we commit in spa_first_txg(spa). 2662 * 2663 * We currently do not make DTL entries for failed spontaneous 2664 * self-healing writes triggered by normal (non-scrubbing) 2665 * reads, because we have no transactional context in which to 2666 * do so -- and it's not clear that it'd be desirable anyway. 2667 */ 2668 if (vd->vdev_ops->vdev_op_leaf) { 2669 uint64_t commit_txg = txg; 2670 if (flags & ZIO_FLAG_SCAN_THREAD) { 2671 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 2672 ASSERT(spa_sync_pass(spa) == 1); 2673 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); 2674 commit_txg = spa_syncing_txg(spa); 2675 } else if (spa->spa_claiming) { 2676 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 2677 commit_txg = spa_first_txg(spa); 2678 } 2679 ASSERT(commit_txg >= spa_syncing_txg(spa)); 2680 if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) 2681 return; 2682 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2683 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); 2684 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); 2685 } 2686 if (vd != rvd) 2687 vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); 2688 } 2689} 2690 2691/* 2692 * Update the in-core space usage stats for this vdev, its metaslab class, 2693 * and the root vdev. 2694 */ 2695void 2696vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, 2697 int64_t space_delta) 2698{ 2699 int64_t dspace_delta = space_delta; 2700 spa_t *spa = vd->vdev_spa; 2701 vdev_t *rvd = spa->spa_root_vdev; 2702 metaslab_group_t *mg = vd->vdev_mg; 2703 metaslab_class_t *mc = mg ? mg->mg_class : NULL; 2704 2705 ASSERT(vd == vd->vdev_top); 2706 2707 /* 2708 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion 2709 * factor. We must calculate this here and not at the root vdev 2710 * because the root vdev's psize-to-asize is simply the max of its 2711 * children's, thus not accurate enough for us.
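 * Concretely (illustrative): a delta of D bytes contributes
 * (D >> SPA_MINBLOCKSHIFT) * vdev_deflate_ratio to vs_dspace, as
 * computed below.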
2712 */ 2713 ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 2714 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); 2715 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 2716 vd->vdev_deflate_ratio; 2717 2718 mutex_enter(&vd->vdev_stat_lock); 2719 vd->vdev_stat.vs_alloc += alloc_delta; 2720 vd->vdev_stat.vs_space += space_delta; 2721 vd->vdev_stat.vs_dspace += dspace_delta; 2722 mutex_exit(&vd->vdev_stat_lock); 2723 2724 if (mc == spa_normal_class(spa)) { 2725 mutex_enter(&rvd->vdev_stat_lock); 2726 rvd->vdev_stat.vs_alloc += alloc_delta; 2727 rvd->vdev_stat.vs_space += space_delta; 2728 rvd->vdev_stat.vs_dspace += dspace_delta; 2729 mutex_exit(&rvd->vdev_stat_lock); 2730 } 2731 2732 if (mc != NULL) { 2733 ASSERT(rvd == vd->vdev_parent); 2734 ASSERT(vd->vdev_ms_count != 0); 2735 2736 metaslab_class_space_update(mc, 2737 alloc_delta, defer_delta, space_delta, dspace_delta); 2738 } 2739} 2740 2741/* 2742 * Mark a top-level vdev's config as dirty, placing it on the dirty list 2743 * so that it will be written out next time the vdev configuration is synced. 2744 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 2745 */ 2746void 2747vdev_config_dirty(vdev_t *vd) 2748{ 2749 spa_t *spa = vd->vdev_spa; 2750 vdev_t *rvd = spa->spa_root_vdev; 2751 int c; 2752 2753 ASSERT(spa_writeable(spa)); 2754 2755 /* 2756 * If this is an aux vdev (as with l2cache and spare devices), then we 2757 * update the vdev config manually and set the sync flag. 2758 */ 2759 if (vd->vdev_aux != NULL) { 2760 spa_aux_vdev_t *sav = vd->vdev_aux; 2761 nvlist_t **aux; 2762 uint_t naux; 2763 2764 for (c = 0; c < sav->sav_count; c++) { 2765 if (sav->sav_vdevs[c] == vd) 2766 break; 2767 } 2768 2769 if (c == sav->sav_count) { 2770 /* 2771 * We're being removed. There's nothing more to do. 2772 */ 2773 ASSERT(sav->sav_sync == B_TRUE); 2774 return; 2775 } 2776 2777 sav->sav_sync = B_TRUE; 2778 2779 if (nvlist_lookup_nvlist_array(sav->sav_config, 2780 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { 2781 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 2782 ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); 2783 } 2784 2785 ASSERT(c < naux); 2786 2787 /* 2788 * Setting the nvlist in the middle of the array is a little 2789 * sketchy, but it will work. 2790 */ 2791 nvlist_free(aux[c]); 2792 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); 2793 2794 return; 2795 } 2796 2797 /* 2798 * The dirty list is protected by the SCL_CONFIG lock. The caller 2799 * must either hold SCL_CONFIG as writer, or must be the sync thread 2800 * (which holds SCL_CONFIG as reader). There's only one sync thread, 2801 * so this is sufficient to ensure mutual exclusion.
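 * The ASSERT below encodes exactly that: either SCL_CONFIG is held
 * as writer, or we are the sync thread holding it as reader.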
2802 */ 2803 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 2804 (dsl_pool_sync_context(spa_get_dsl(spa)) && 2805 spa_config_held(spa, SCL_CONFIG, RW_READER))); 2806 2807 if (vd == rvd) { 2808 for (c = 0; c < rvd->vdev_children; c++) 2809 vdev_config_dirty(rvd->vdev_child[c]); 2810 } else { 2811 ASSERT(vd == vd->vdev_top); 2812 2813 if (!list_link_active(&vd->vdev_config_dirty_node) && 2814 !vd->vdev_ishole) 2815 list_insert_head(&spa->spa_config_dirty_list, vd); 2816 } 2817} 2818 2819void 2820vdev_config_clean(vdev_t *vd) 2821{ 2822 spa_t *spa = vd->vdev_spa; 2823 2824 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 2825 (dsl_pool_sync_context(spa_get_dsl(spa)) && 2826 spa_config_held(spa, SCL_CONFIG, RW_READER))); 2827 2828 ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 2829 list_remove(&spa->spa_config_dirty_list, vd); 2830} 2831 2832/* 2833 * Mark a top-level vdev's state as dirty, so that the next pass of 2834 * spa_sync() can convert this into vdev_config_dirty(). We distinguish 2835 * the state changes from larger config changes because they require 2836 * much less locking, and are often needed for administrative actions. 2837 */ 2838void 2839vdev_state_dirty(vdev_t *vd) 2840{ 2841 spa_t *spa = vd->vdev_spa; 2842 2843 ASSERT(spa_writeable(spa)); 2844 ASSERT(vd == vd->vdev_top); 2845 2846 /* 2847 * The state list is protected by the SCL_STATE lock. The caller 2848 * must either hold SCL_STATE as writer, or must be the sync thread 2849 * (which holds SCL_STATE as reader). There's only one sync thread, 2850 * so this is sufficient to ensure mutual exclusion. 2851 */ 2852 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 2853 (dsl_pool_sync_context(spa_get_dsl(spa)) && 2854 spa_config_held(spa, SCL_STATE, RW_READER))); 2855 2856 if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) 2857 list_insert_head(&spa->spa_state_dirty_list, vd); 2858} 2859 2860void 2861vdev_state_clean(vdev_t *vd) 2862{ 2863 spa_t *spa = vd->vdev_spa; 2864 2865 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 2866 (dsl_pool_sync_context(spa_get_dsl(spa)) && 2867 spa_config_held(spa, SCL_STATE, RW_READER))); 2868 2869 ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 2870 list_remove(&spa->spa_state_dirty_list, vd); 2871} 2872 2873/* 2874 * Propagate vdev state up from children to parent. 2875 */ 2876void 2877vdev_propagate_state(vdev_t *vd) 2878{ 2879 spa_t *spa = vd->vdev_spa; 2880 vdev_t *rvd = spa->spa_root_vdev; 2881 int degraded = 0, faulted = 0; 2882 int corrupted = 0; 2883 vdev_t *child; 2884 2885 if (vd->vdev_children > 0) { 2886 for (int c = 0; c < vd->vdev_children; c++) { 2887 child = vd->vdev_child[c]; 2888 2889 /* 2890 * Don't factor holes into the decision. 2891 */ 2892 if (child->vdev_ishole) 2893 continue; 2894 2895 if (!vdev_readable(child) || 2896 (!vdev_writeable(child) && spa_writeable(spa))) { 2897 /* 2898 * Root special: if there is a top-level log 2899 * device, treat the root vdev as if it were 2900 * degraded. 
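 * (Log devices are expendable, so losing one degrades the pool
 * rather than faulting it.)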
2901 */ 2902 if (child->vdev_islog && vd == rvd) 2903 degraded++; 2904 else 2905 faulted++; 2906 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 2907 degraded++; 2908 } 2909 2910 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 2911 corrupted++; 2912 } 2913 2914 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 2915 2916 /* 2917 * Root special: if there is a top-level vdev that cannot be 2918 * opened due to corrupted metadata, then propagate the root 2919 * vdev's aux state as 'corrupt' rather than 'insufficient 2920 * replicas'. 2921 */ 2922 if (corrupted && vd == rvd && 2923 rvd->vdev_state == VDEV_STATE_CANT_OPEN) 2924 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 2925 VDEV_AUX_CORRUPT_DATA); 2926 } 2927 2928 if (vd->vdev_parent) 2929 vdev_propagate_state(vd->vdev_parent); 2930} 2931 2932/* 2933 * Set a vdev's state. If this is during an open, we don't update the parent 2934 * state, because we're in the process of opening children depth-first. 2935 * Otherwise, we propagate the change to the parent. 2936 * 2937 * If this routine places a device in a faulted state, an appropriate ereport is 2938 * generated. 2939 */ 2940void 2941vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 2942{ 2943 uint64_t save_state; 2944 spa_t *spa = vd->vdev_spa; 2945 2946 if (state == vd->vdev_state) { 2947 vd->vdev_stat.vs_aux = aux; 2948 return; 2949 } 2950 2951 save_state = vd->vdev_state; 2952 2953 vd->vdev_state = state; 2954 vd->vdev_stat.vs_aux = aux; 2955 2956 /* 2957 * If we are setting the vdev state to anything but an open state, then 2958 * always close the underlying device unless the device has requested 2959 * a delayed close (i.e. we're about to remove or fault the device). 2960 * Otherwise, we keep accessible but invalid devices open forever. 2961 * We don't call vdev_close() itself, because that implies some extra 2962 * checks (offline, etc) that we don't want here. This is limited to 2963 * leaf devices, because otherwise closing the device will affect other 2964 * children. 2965 */ 2966 if (!vd->vdev_delayed_close && vdev_is_dead(vd) && 2967 vd->vdev_ops->vdev_op_leaf) 2968 vd->vdev_ops->vdev_op_close(vd); 2969 2970 /* 2971 * If we have brought this vdev back into service, we need 2972 * to notify fmd so that it can gracefully repair any outstanding 2973 * cases due to a missing device. We do this in all cases, even those 2974 * that probably don't correlate to a repaired fault. This is sure to 2975 * catch all cases, and we let the zfs-retire agent sort it out. If 2976 * this is a transient state it's OK, as the retire agent will 2977 * double-check the state of the vdev before repairing it. 2978 */ 2979 if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && 2980 vd->vdev_prevstate != state) 2981 zfs_post_state_change(spa, vd); 2982 2983 if (vd->vdev_removed && 2984 state == VDEV_STATE_CANT_OPEN && 2985 (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 2986 /* 2987 * If the previous state is set to VDEV_STATE_REMOVED, then this 2988 * device was previously marked removed and someone attempted to 2989 * reopen it. If this failed due to a nonexistent device, then 2990 * keep the device in the REMOVED state. We also let this be if 2991 * it is one of our special test online cases, which is only 2992 * attempting to online the device and shouldn't generate an FMA 2993 * fault. 
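 * (In either case we simply restore VDEV_STATE_REMOVED below instead
 * of reporting a new fault.)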
2994 */ 2995 vd->vdev_state = VDEV_STATE_REMOVED; 2996 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 2997 } else if (state == VDEV_STATE_REMOVED) { 2998 vd->vdev_removed = B_TRUE; 2999 } else if (state == VDEV_STATE_CANT_OPEN) { 3000 /* 3001 * If we fail to open a vdev during an import or recovery, we 3002 * mark it as "not available", which signifies that it was 3003 * never there to begin with. Failure to open such a device 3004 * is not considered an error. 3005 */ 3006 if ((spa_load_state(spa) == SPA_LOAD_IMPORT || 3007 spa_load_state(spa) == SPA_LOAD_RECOVER) && 3008 vd->vdev_ops->vdev_op_leaf) 3009 vd->vdev_not_present = 1; 3010 3011 /* 3012 * Post the appropriate ereport. If the 'prevstate' field is 3013 * set to something other than VDEV_STATE_UNKNOWN, it indicates 3014 * that this is part of a vdev_reopen(). In this case, we don't 3015 * want to post the ereport if the device was already in the 3016 * CANT_OPEN state beforehand. 3017 * 3018 * If the 'checkremove' flag is set, then this is an attempt to 3019 * online the device in response to an insertion event. If we 3020 * hit this case, then we have detected an insertion event for a 3021 * faulted or offline device that wasn't in the removed state. 3022 * In this scenario, we don't post an ereport because we are 3023 * about to replace the device, or attempt an online with 3024 * vdev_forcefault, which will generate the fault for us. 3025 */ 3026 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 3027 !vd->vdev_not_present && !vd->vdev_checkremove && 3028 vd != spa->spa_root_vdev) { 3029 const char *class; 3030 3031 switch (aux) { 3032 case VDEV_AUX_OPEN_FAILED: 3033 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 3034 break; 3035 case VDEV_AUX_CORRUPT_DATA: 3036 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 3037 break; 3038 case VDEV_AUX_NO_REPLICAS: 3039 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 3040 break; 3041 case VDEV_AUX_BAD_GUID_SUM: 3042 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 3043 break; 3044 case VDEV_AUX_TOO_SMALL: 3045 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 3046 break; 3047 case VDEV_AUX_BAD_LABEL: 3048 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 3049 break; 3050 default: 3051 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 3052 } 3053 3054 zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 3055 } 3056 3057 /* Erase any notion of persistent removed state */ 3058 vd->vdev_removed = B_FALSE; 3059 } else { 3060 vd->vdev_removed = B_FALSE; 3061 } 3062 3063 if (!isopen && vd->vdev_parent) 3064 vdev_propagate_state(vd->vdev_parent); 3065} 3066 3067/* 3068 * Check the vdev configuration to ensure that it's capable of supporting 3069 * a root pool. 3070 * 3071 * On Solaris, we do not support RAID-Z or partial configuration. In 3072 * addition, only a single top-level vdev is allowed and none of the 3073 * leaves can be wholedisks. 3074 * 3075 * For FreeBSD, we can boot from any configuration. There is a 3076 * limitation that the boot filesystem must be either uncompressed or 3077 * compressed with lzjb compression, but I'm not sure how to enforce 3078 * that here.
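 *
 * Summarizing the Solaris checks below (illustrative): reject a root
 * vdev with more than one child, any raidz or missing vdev, and any
 * whole-disk leaf; accept everything else.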
3079 */ 3080boolean_t 3081vdev_is_bootable(vdev_t *vd) 3082{ 3083#ifdef sun 3084 if (!vd->vdev_ops->vdev_op_leaf) { 3085 char *vdev_type = vd->vdev_ops->vdev_op_type; 3086 3087 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 3088 vd->vdev_children > 1) { 3089 return (B_FALSE); 3090 } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 3091 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 3092 return (B_FALSE); 3093 } 3094 } else if (vd->vdev_wholedisk == 1) { 3095 return (B_FALSE); 3096 } 3097 3098 for (int c = 0; c < vd->vdev_children; c++) { 3099 if (!vdev_is_bootable(vd->vdev_child[c])) 3100 return (B_FALSE); 3101 } 3102#endif /* sun */ 3103 return (B_TRUE); 3104} 3105 3106/* 3107 * Load the state from the original vdev tree (ovd) which 3108 * we've retrieved from the MOS config object. If the original 3109 * vdev was offline or faulted then we transfer that state to the 3110 * device in the current vdev tree (nvd). 3111 */ 3112void 3113vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3114{ 3115 spa_t *spa = nvd->vdev_spa; 3116 3117 ASSERT(nvd->vdev_top->vdev_islog); 3118 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3119 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3120 3121 for (int c = 0; c < nvd->vdev_children; c++) 3122 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3123 3124 if (nvd->vdev_ops->vdev_op_leaf) { 3125 /* 3126 * Restore the persistent vdev state 3127 */ 3128 nvd->vdev_offline = ovd->vdev_offline; 3129 nvd->vdev_faulted = ovd->vdev_faulted; 3130 nvd->vdev_degraded = ovd->vdev_degraded; 3131 nvd->vdev_removed = ovd->vdev_removed; 3132 } 3133} 3134 3135/* 3136 * Determine if a log device has valid content. If the vdev was 3137 * removed or faulted in the MOS config then we know that 3138 * the content on the log device has already been written to the pool. 3139 */ 3140boolean_t 3141vdev_log_state_valid(vdev_t *vd) 3142{ 3143 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3144 !vd->vdev_removed) 3145 return (B_TRUE); 3146 3147 for (int c = 0; c < vd->vdev_children; c++) 3148 if (vdev_log_state_valid(vd->vdev_child[c])) 3149 return (B_TRUE); 3150 3151 return (B_FALSE); 3152} 3153 3154/* 3155 * Expand a vdev if possible. 3156 */ 3157void 3158vdev_expand(vdev_t *vd, uint64_t txg) 3159{ 3160 ASSERT(vd->vdev_top == vd); 3161 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3162 3163 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { 3164 VERIFY(vdev_metaslab_init(vd, txg) == 0); 3165 vdev_config_dirty(vd); 3166 } 3167} 3168 3169/* 3170 * Split a vdev. 3171 */ 3172void 3173vdev_split(vdev_t *vd) 3174{ 3175 vdev_t *cvd, *pvd = vd->vdev_parent; 3176 3177 vdev_remove_child(pvd, vd); 3178 vdev_compact_children(pvd); 3179 3180 cvd = pvd->vdev_child[0]; 3181 if (pvd->vdev_children == 1) { 3182 vdev_remove_parent(cvd); 3183 cvd->vdev_splitting = B_TRUE; 3184 } 3185 vdev_propagate_state(cvd); 3186} 3187 3188void 3189vdev_deadman(vdev_t *vd) 3190{ 3191 for (int c = 0; c < vd->vdev_children; c++) { 3192 vdev_t *cvd = vd->vdev_child[c]; 3193 3194 vdev_deadman(cvd); 3195 } 3196 3197 if (vd->vdev_ops->vdev_op_leaf) { 3198 vdev_queue_t *vq = &vd->vdev_queue; 3199 3200 mutex_enter(&vq->vq_lock); 3201 if (avl_numnodes(&vq->vq_pending_tree) > 0) { 3202 spa_t *spa = vd->vdev_spa; 3203 zio_t *fio; 3204 uint64_t delta; 3205 3206 /* 3207 * Look at the head of all the pending queues; 3208 * if any I/O has been outstanding for longer than 3209 * the spa_deadman_synctime, we panic the system.
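 * The zfs_dbgmsg() below records the zio's timestamp, the computed
 * delta, and this queue's last I/O completion time before panicking.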
3210 */ 3211 fio = avl_first(&vq->vq_pending_tree); 3212 delta = gethrtime() - fio->io_timestamp; 3213 if (delta > spa_deadman_synctime(spa)) { 3214 zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " 3215 "delta %lluns, last io %lluns", 3216 fio->io_timestamp, delta, 3217 vq->vq_io_complete_ts); 3218 fm_panic("I/O to pool '%s' appears to be " 3219 "hung on vdev guid %llu at '%s'.", 3220 spa_name(spa), 3221 (long long unsigned int) vd->vdev_guid, 3222 vd->vdev_path); 3223 } 3224 } 3225 mutex_exit(&vq->vq_lock); 3226 } 3227} 3228
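/*
 * Illustrative sketch, not part of the original file: the recursive
 * traversal pattern used throughout this file (e.g. by vdev_load(),
 * vdev_scan_stat_init() and vdev_deadman()) -- descend into every
 * child first, then act on the current vdev. vdev_walk() and
 * vdev_walk_cb_t are assumptions for this example, not ZFS interfaces.
 */
#ifdef ZFS_VDEV_EXAMPLES
typedef void vdev_walk_cb_t(vdev_t *);

static void
vdev_walk(vdev_t *vd, vdev_walk_cb_t *func)
{
	/* Recurse into all children of an interior vdev. */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_walk(vd->vdev_child[c], func);

	/* Apply the callback to leaves only, as vdev_deadman() does. */
	if (vd->vdev_ops->vdev_op_leaf)
		func(vd);
}
#endif	/* ZFS_VDEV_EXAMPLES */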