vdev.c revision 253991
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2013 by Delphix. All rights reserved. 26 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27 */ 28 29#include <sys/zfs_context.h> 30#include <sys/fm/fs/zfs.h> 31#include <sys/spa.h> 32#include <sys/spa_impl.h> 33#include <sys/dmu.h> 34#include <sys/dmu_tx.h> 35#include <sys/vdev_impl.h> 36#include <sys/uberblock_impl.h> 37#include <sys/metaslab.h> 38#include <sys/metaslab_impl.h> 39#include <sys/space_map.h> 40#include <sys/zio.h> 41#include <sys/zap.h> 42#include <sys/fs/zfs.h> 43#include <sys/arc.h> 44#include <sys/zil.h> 45#include <sys/dsl_scan.h> 46#include <sys/trim_map.h> 47 48SYSCTL_DECL(_vfs_zfs); 49SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); 50 51/* 52 * Virtual device management. 
 */

/*
 * Ops vectors for every vdev type this build understands; vdev_getops()
 * scans this table linearly.  In the kernel, leaf disks are accessed
 * through GEOM; in userland they are accessed as plain disk devices.
 */
static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
#ifdef _KERNEL
	&vdev_geom_ops,
#else
	&vdev_disk_ops,
#endif
	&vdev_file_ops,
	&vdev_missing_ops,
	&vdev_hole_ops,
	NULL
};


/*
 * Given a vdev type, return the appropriate ops vector.
 * Returns NULL if the type string matches no known vdev type.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	/* Round up to the top-level vdev's allocation granularity. */
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;

	for (int c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}

/*
 * Get the minimum allocatable size.  We define the allocatable size as
 * the vdev's asize rounded to the nearest metaslab.  This allows us to
 * replace or attach devices which don't have the same physical size but
 * can still satisfy the same number of allocations.
 */
uint64_t
vdev_get_min_asize(vdev_t *vd)
{
	vdev_t *pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL (inactive spare or cache) or is the root,
	 * just return our own asize.
	 */
	if (pvd == NULL)
		return (vd->vdev_asize);

	/*
	 * The top-level vdev just returns the allocatable size rounded
	 * to the nearest metaslab.
	 */
	if (vd == vd->vdev_top)
		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));

	/*
	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
	 * so each child must provide at least 1/Nth of its asize.
	 */
	if (pvd->vdev_ops == &vdev_raidz_ops)
		return (pvd->vdev_min_asize / pvd->vdev_children);

	return (pvd->vdev_min_asize);
}

/*
 * Recompute vdev_min_asize for vd and, recursively, for all of its
 * descendants (e.g. after a device replace/attach changes sizes).
 */
void
vdev_set_min_asize(vdev_t *vd)
{
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_set_min_asize(vd->vdev_child[c]);
}

/*
 * Look up a top-level vdev by its index under the root vdev.
 * Returns NULL if the index is out of range.  The caller must hold
 * at least one spa config lock as reader.
 */
vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (vdev < rvd->vdev_children) {
		ASSERT(rvd->vdev_child[vdev] != NULL);
		return (rvd->vdev_child[vdev]);
	}

	return (NULL);
}

/*
 * Depth-first search of the subtree rooted at vd for the vdev with the
 * given guid; returns NULL if no such vdev exists in the subtree.
 */
vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (int c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

/*
 * Link cvd into pvd's child array at slot cvd->vdev_id, growing the
 * array if needed, and add cvd's guid sum to every ancestor's.
 * A NULL pvd only records the (lack of a) parent, e.g. for an
 * inactive spare or cache device.
 */
void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	/* The target slot must be beyond the current array or unoccupied. */
	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}

/*
 * Unlink cvd from pvd's child array, freeing the array entirely if no
 * children remain, and subtract cvd's guid sum from every ancestor's.
 */
void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	/* If this was the last child, release the child array itself. */
	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc;

	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/* Count the surviving children. */
	for (int c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	/* Repack, renumbering each child's vdev_id to its new slot. */
	for (int c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	/* The very first vdev allocated for a pool becomes the root vdev. */
	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_guid(NULL);
	}

	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	/* One in-core DTL space map per DTL type, all under vdev_dtl_lock. */
	for (int t = 0; t < DTL_TYPES; t++) {
		space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
		    &vd->vdev_dtl_lock);
	}
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (SET_ERROR(EINVAL));

	if ((ops = vdev_getops(type)) == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		/* On load, the label's id must agree with our position. */
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (SET_ERROR(EINVAL));

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (SET_ERROR(ENOTSUP));

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
		return (SET_ERROR(ENOTSUP));

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
				return (SET_ERROR(EINVAL));
			/*
			 * Previous versions could only support 1 or 2 parity
			 * device.
			 */
			if (nparity > 1 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ2)
				return (SET_ERROR(ENOTSUP));
			if (nparity > 2 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ3)
				return (SET_ERROR(ENOTSUP));
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
				return (SET_ERROR(EINVAL));
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	/*
	 * Duplicate the identity strings out of the nvlist; the vdev owns
	 * its own copies (freed in vdev_free()).
	 */
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
		vd->vdev_fru = spa_strdup(vd->vdev_fru);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * Retrieve the vdev creation time.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	    &vd->vdev_crtxg);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
		    &vd->vdev_removing);
	}

	/* Top-level vdevs (except during attach) get a metaslab group. */
	if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
		    alloctype == VDEV_ALLOC_ADD ||
		    alloctype == VDEV_ALLOC_SPLIT ||
		    alloctype == VDEV_ALLOC_ROOTPOOL);
		vd->vdev_mg = metaslab_group_create(islog ?
		    spa_log_class(spa) : spa_normal_class(spa), vd);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_smo.smo_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}

		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			uint64_t spare = 0;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
			    &spare) == 0 && spare)
				spa_spare_add(vd);
		}

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
		    &vd->vdev_resilvering);

		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context.  Local vdevs will
		 * remain in the faulted state.
		 */
		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);

			if (vd->vdev_faulted || vd->vdev_degraded) {
				char *aux;

				vd->vdev_label_aux =
				    VDEV_AUX_ERR_EXCEEDED;
				if (nvlist_lookup_string(nv,
				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
				    strcmp(aux, "external") == 0)
					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
			}
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}

/*
 * Free a vdev and (recursively) all of its children, tearing down the
 * resources set up by vdev_alloc_common().  The vdev must already be
 * closed-able, off the dirty lists, and have no allocated space.
 */
void
vdev_free(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	 * Free all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
	}

	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);
	if (vd->vdev_fru)
		spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	/* Unload and destroy the DTL space maps under their lock. */
	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		space_map_unload(&vd->vdev_dtl[t]);
		space_map_destroy(&vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Transfer top-level vdev state from svd to tvd.
655 */ 656static void 657vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 658{ 659 spa_t *spa = svd->vdev_spa; 660 metaslab_t *msp; 661 vdev_t *vd; 662 int t; 663 664 ASSERT(tvd == tvd->vdev_top); 665 666 tvd->vdev_ms_array = svd->vdev_ms_array; 667 tvd->vdev_ms_shift = svd->vdev_ms_shift; 668 tvd->vdev_ms_count = svd->vdev_ms_count; 669 670 svd->vdev_ms_array = 0; 671 svd->vdev_ms_shift = 0; 672 svd->vdev_ms_count = 0; 673 674 if (tvd->vdev_mg) 675 ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); 676 tvd->vdev_mg = svd->vdev_mg; 677 tvd->vdev_ms = svd->vdev_ms; 678 679 svd->vdev_mg = NULL; 680 svd->vdev_ms = NULL; 681 682 if (tvd->vdev_mg != NULL) 683 tvd->vdev_mg->mg_vd = tvd; 684 685 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 686 tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 687 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 688 689 svd->vdev_stat.vs_alloc = 0; 690 svd->vdev_stat.vs_space = 0; 691 svd->vdev_stat.vs_dspace = 0; 692 693 for (t = 0; t < TXG_SIZE; t++) { 694 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 695 (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 696 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 697 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 698 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 699 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 700 } 701 702 if (list_link_active(&svd->vdev_config_dirty_node)) { 703 vdev_config_clean(svd); 704 vdev_config_dirty(tvd); 705 } 706 707 if (list_link_active(&svd->vdev_state_dirty_node)) { 708 vdev_state_clean(svd); 709 vdev_state_dirty(tvd); 710 } 711 712 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 713 svd->vdev_deflate_ratio = 0; 714 715 tvd->vdev_islog = svd->vdev_islog; 716 svd->vdev_islog = 0; 717} 718 719static void 720vdev_top_update(vdev_t *tvd, vdev_t *vd) 721{ 722 if (vd == NULL) 723 return; 724 725 vd->vdev_top = tvd; 726 727 for (int c = 0; c < vd->vdev_children; c++) 728 vdev_top_update(tvd, vd->vdev_child[c]); 729} 
/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	/* The new interior vdev inherits the child's size and state. */
	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_max_asize = cvd->vdev_max_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	/* Splice mvd between pvd and cvd. */
	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);

	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
	 */
	if (mvd->vdev_top == mvd) {
		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
		cvd->vdev_orig_guid = cvd->vdev_guid;
		cvd->vdev_guid += guid_delta;
		cvd->vdev_guid_sum += guid_delta;
	}
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

/*
 * (Re)build the in-core metaslab array for a top-level vdev out to its
 * current asize.  With txg == 0 (pool load) each metaslab's space map
 * object is read from the MOS; otherwise new metaslabs are created in txg.
 */
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	/*
	 * This vdev is not being allocated from yet or is a hole.
	 */
	if (vd->vdev_ms_shift == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	/*
	 * Compute the raidz-deflation ratio.  Note, we hard-code
	 * in 128k (1 << 17) because it is the current "typical" blocksize.
	 * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
	 * or we will inconsistently account for existing bp's.
	 */
	vd->vdev_deflate_ratio = (1 << 17) /
	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);

	ASSERT(oldc <= newc);

	/* Grow the metaslab pointer array, preserving existing entries. */
	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		space_map_obj_t smo = { 0, 0, 0 };
		if (txg == 0) {
			uint64_t object = 0;
			/* Fetch the m'th space map object id from the MOS. */
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
			    DMU_READ_PREFETCH);
			if (error)
				return (error);
			if (object != 0) {
				dmu_buf_t *db;
				error = dmu_bonus_hold(mos, object, FTAG, &db);
				if (error)
					return (error);
				ASSERT3U(db->db_size, >=, sizeof (smo));
				bcopy(db->db_data, &smo, sizeof (smo));
				ASSERT3U(smo.smo_object, ==, object);
				dmu_buf_rele(db, FTAG);
			}
		}
		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
	}

	if (txg == 0)
		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

	/*
	 * If the vdev is being removed we don't activate
	 * the metaslabs since we want to ensure that no new
	 * allocations are performed on this device.
	 */
	if (oldc == 0 && !vd->vdev_removing)
		metaslab_group_activate(vd->vdev_mg);

	if (txg == 0)
		spa_config_exit(spa, SCL_ALLOC, FTAG);

	return (0);
}

/*
 * Tear down the in-core metaslab state built by vdev_metaslab_init().
 */
void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		metaslab_group_passivate(vd->vdev_mg);
		for (m = 0; m < count; m++)
			if (vd->vdev_ms[m] != NULL)
				metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}
}

/* Per-probe scratch state, shared by the probe zios via io_private. */
typedef struct vdev_probe_stats {
	boolean_t	vps_readable;
	boolean_t	vps_writeable;
	int		vps_flags;
} vdev_probe_stats_t;

/*
 * Completion callback for the probe zios issued by vdev_probe().
 * A successful read is turned into a write of the same data; the final
 * NULL zio evaluates the accumulated readable/writeable results and
 * propagates ENXIO to parents for which the vdev is inaccessible.
 */
static void
vdev_probe_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	vdev_probe_stats_t *vps = zio->io_private;

	ASSERT(vd->vdev_probe_zio != NULL);

	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_error == 0)
			vps->vps_readable = 1;
		if (zio->io_error == 0 && spa_writeable(spa)) {
			/* Write back the pad data we just read. */
			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
			    zio->io_offset, zio->io_size, zio->io_data,
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
		} else {
			zio_buf_free(zio->io_data, zio->io_size);
		}
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
		if (zio->io_error == 0)
			vps->vps_writeable = 1;
		zio_buf_free(zio->io_data, zio->io_size);
	} else if (zio->io_type == ZIO_TYPE_NULL) {
		zio_t *pio;

		vd->vdev_cant_read |= !vps->vps_readable;
		vd->vdev_cant_write |= !vps->vps_writeable;

		if (vdev_readable(vd) &&
		    (vdev_writeable(vd) || !spa_writeable(spa))) {
			zio->io_error = 0;
		} else {
			ASSERT(zio->io_error != 0);
			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
			    spa, vd, NULL, 0, 0);
			zio->io_error = SET_ERROR(ENXIO);
		}

		mutex_enter(&vd->vdev_probe_lock);
		ASSERT(vd->vdev_probe_zio == zio);
		vd->vdev_probe_zio = NULL;
		mutex_exit(&vd->vdev_probe_lock);

		/* Fail any parent zio for which the vdev is inaccessible. */
		while ((pio = zio_walk_parents(zio)) != NULL)
			if (!vdev_accessible(vd, pio))
				pio->io_error = SET_ERROR(ENXIO);

		kmem_free(vps, sizeof (*vps));
	}
}

/*
 * Determine whether this device is accessible.
 *
 * Read and write to several known locations: the pad regions of each
 * vdev label but the first, which we leave alone in case it contains
 * a VTOC.
 */
zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
{
	spa_t *spa = vd->vdev_spa;
	vdev_probe_stats_t *vps = NULL;
	zio_t *pio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/*
	 * Don't probe the probe.
	 */
	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
		return (NULL);

	/*
	 * To prevent 'probe storms' when a device fails, we create
	 * just one probe i/o at a time.  All zios that want to probe
	 * this vdev will become parents of the probe io.
	 */
	mutex_enter(&vd->vdev_probe_lock);

	if ((pio = vd->vdev_probe_zio) == NULL) {
		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
		    ZIO_FLAG_TRYHARD;

		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
			/*
			 * vdev_cant_read and vdev_cant_write can only
			 * transition from TRUE to FALSE when we have the
			 * SCL_ZIO lock as writer; otherwise they can only
			 * transition from FALSE to TRUE.  This ensures that
			 * any zio looking at these values can assume that
			 * failures persist for the life of the I/O.  That's
			 * important because when a device has intermittent
			 * connectivity problems, we want to ensure that
			 * they're ascribed to the device (ENXIO) and not
			 * the zio (EIO).
			 *
			 * Since we hold SCL_ZIO as writer here, clear both
			 * values so the probe can reevaluate from first
			 * principles.
			 */
			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
			vd->vdev_cant_read = B_FALSE;
			vd->vdev_cant_write = B_FALSE;
		}

		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
		    vdev_probe_done, vps,
		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

		/*
		 * We can't change the vdev state in this context, so we
		 * kick off an async task to do it on our behalf.
		 */
		if (zio != NULL) {
			vd->vdev_probe_wanted = B_TRUE;
			spa_async_request(spa, SPA_ASYNC_PROBE);
		}
	}

	if (zio != NULL)
		zio_add_child(zio, pio);

	mutex_exit(&vd->vdev_probe_lock);

	if (vps == NULL) {
		/* A probe was already in flight; we joined it as a child. */
		ASSERT(zio != NULL);
		return (NULL);
	}

	/* Probe the pad area of every label except the first (VTOC). */
	for (int l = 1; l < VDEV_LABELS; l++) {
		zio_nowait(zio_read_phys(pio, vd,
		    vdev_label_offset(vd->vdev_psize, l,
		    offsetof(vdev_label_t, vl_pad2)),
		    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
	}

	if (zio == NULL)
		return (pio);

	zio_nowait(pio);
	return (NULL);
}

/* Taskq callback: open one child vdev (see vdev_open_children()). */
static void
vdev_open_child(void *arg)
{
	vdev_t *vd = arg;

	vd->vdev_open_thread = curthread;
	vd->vdev_open_error = vdev_open(vd);
	vd->vdev_open_thread = NULL;
}

/*
 * Returns B_TRUE if vd or any of its descendants has a path under
 * ZVOL_DIR, i.e. is backed by a zvol.
 */
boolean_t
vdev_uses_zvols(vdev_t *vd)
{
	if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
	    strlen(ZVOL_DIR)) == 0)
		return (B_TRUE);
	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_uses_zvols(vd->vdev_child[c]))
			return (B_TRUE);
	return (B_FALSE);
}

void
vdev_open_children(vdev_t *vd)
{
	taskq_t *tq;
	int children = vd->vdev_children;

	/*
	 * in order to handle pools on top of zvols, do the opens
	 * in a single thread so that the same thread holds the
	 * spa_namespace_lock
	 *
	 * NOTE(review): the "B_TRUE ||" below deliberately short-circuits
	 * the zvol check, so the serial path is always taken and the
	 * taskq-based parallel open below is currently unreachable dead
	 * code.  Confirm before re-enabling the parallel path.
	 */
	if (B_TRUE || vdev_uses_zvols(vd)) {
		for (int c = 0; c
< children; c++) 1098 vd->vdev_child[c]->vdev_open_error = 1099 vdev_open(vd->vdev_child[c]); 1100 return; 1101 } 1102 tq = taskq_create("vdev_open", children, minclsyspri, 1103 children, children, TASKQ_PREPOPULATE); 1104 1105 for (int c = 0; c < children; c++) 1106 VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], 1107 TQ_SLEEP) != 0); 1108 1109 taskq_destroy(tq); 1110} 1111 1112/* 1113 * Prepare a virtual device for access. 1114 */ 1115int 1116vdev_open(vdev_t *vd) 1117{ 1118 spa_t *spa = vd->vdev_spa; 1119 int error; 1120 uint64_t osize = 0; 1121 uint64_t max_osize = 0; 1122 uint64_t asize, max_asize, psize; 1123 uint64_t ashift = 0; 1124 1125 ASSERT(vd->vdev_open_thread == curthread || 1126 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1127 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 1128 vd->vdev_state == VDEV_STATE_CANT_OPEN || 1129 vd->vdev_state == VDEV_STATE_OFFLINE); 1130 1131 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1132 vd->vdev_cant_read = B_FALSE; 1133 vd->vdev_cant_write = B_FALSE; 1134 vd->vdev_min_asize = vdev_get_min_asize(vd); 1135 1136 /* 1137 * If this vdev is not removed, check its fault status. If it's 1138 * faulted, bail out of the open. 1139 */ 1140 if (!vd->vdev_removed && vd->vdev_faulted) { 1141 ASSERT(vd->vdev_children == 0); 1142 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1143 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1144 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1145 vd->vdev_label_aux); 1146 return (SET_ERROR(ENXIO)); 1147 } else if (vd->vdev_offline) { 1148 ASSERT(vd->vdev_children == 0); 1149 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 1150 return (SET_ERROR(ENXIO)); 1151 } 1152 1153 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); 1154 1155 /* 1156 * Reset the vdev_reopening flag so that we actually close 1157 * the vdev on error. 
1158 */ 1159 vd->vdev_reopening = B_FALSE; 1160 if (zio_injection_enabled && error == 0) 1161 error = zio_handle_device_injection(vd, NULL, ENXIO); 1162 1163 if (error) { 1164 if (vd->vdev_removed && 1165 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 1166 vd->vdev_removed = B_FALSE; 1167 1168 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1169 vd->vdev_stat.vs_aux); 1170 return (error); 1171 } 1172 1173 vd->vdev_removed = B_FALSE; 1174 1175 /* 1176 * Recheck the faulted flag now that we have confirmed that 1177 * the vdev is accessible. If we're faulted, bail. 1178 */ 1179 if (vd->vdev_faulted) { 1180 ASSERT(vd->vdev_children == 0); 1181 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1182 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1183 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1184 vd->vdev_label_aux); 1185 return (SET_ERROR(ENXIO)); 1186 } 1187 1188 if (vd->vdev_degraded) { 1189 ASSERT(vd->vdev_children == 0); 1190 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1191 VDEV_AUX_ERR_EXCEEDED); 1192 } else { 1193 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); 1194 } 1195 1196 /* 1197 * For hole or missing vdevs we just return success. 
1198 */ 1199 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) 1200 return (0); 1201 1202 if (vd->vdev_ops->vdev_op_leaf) { 1203 vd->vdev_notrim = B_FALSE; 1204 trim_map_create(vd); 1205 } 1206 1207 for (int c = 0; c < vd->vdev_children; c++) { 1208 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 1209 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1210 VDEV_AUX_NONE); 1211 break; 1212 } 1213 } 1214 1215 osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 1216 max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); 1217 1218 if (vd->vdev_children == 0) { 1219 if (osize < SPA_MINDEVSIZE) { 1220 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1221 VDEV_AUX_TOO_SMALL); 1222 return (SET_ERROR(EOVERFLOW)); 1223 } 1224 psize = osize; 1225 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 1226 max_asize = max_osize - (VDEV_LABEL_START_SIZE + 1227 VDEV_LABEL_END_SIZE); 1228 } else { 1229 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 1230 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 1231 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1232 VDEV_AUX_TOO_SMALL); 1233 return (SET_ERROR(EOVERFLOW)); 1234 } 1235 psize = 0; 1236 asize = osize; 1237 max_asize = max_osize; 1238 } 1239 1240 vd->vdev_psize = psize; 1241 1242 /* 1243 * Make sure the allocatable size hasn't shrunk. 1244 */ 1245 if (asize < vd->vdev_min_asize) { 1246 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1247 VDEV_AUX_BAD_LABEL); 1248 return (SET_ERROR(EINVAL)); 1249 } 1250 1251 if (vd->vdev_asize == 0) { 1252 /* 1253 * This is the first-ever open, so use the computed values. 1254 * For testing purposes, a higher ashift can be requested. 1255 */ 1256 vd->vdev_asize = asize; 1257 vd->vdev_max_asize = max_asize; 1258 vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); 1259 } else { 1260 /* 1261 * Detect if the alignment requirement has increased. 1262 * We don't want to make the pool unavailable, just 1263 * issue a warning instead. 
1264 */ 1265 if (ashift > vd->vdev_top->vdev_ashift && 1266 vd->vdev_ops->vdev_op_leaf) { 1267 cmn_err(CE_WARN, 1268 "Disk, '%s', has a block alignment that is " 1269 "larger than the pool's alignment\n", 1270 vd->vdev_path); 1271 } 1272 vd->vdev_max_asize = max_asize; 1273 } 1274 1275 /* 1276 * If all children are healthy and the asize has increased, 1277 * then we've experienced dynamic LUN growth. If automatic 1278 * expansion is enabled then use the additional space. 1279 */ 1280 if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && 1281 (vd->vdev_expanding || spa->spa_autoexpand)) 1282 vd->vdev_asize = asize; 1283 1284 vdev_set_min_asize(vd); 1285 1286 /* 1287 * Ensure we can issue some IO before declaring the 1288 * vdev open for business. 1289 */ 1290 if (vd->vdev_ops->vdev_op_leaf && 1291 (error = zio_wait(vdev_probe(vd, NULL))) != 0) { 1292 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1293 VDEV_AUX_ERR_EXCEEDED); 1294 return (error); 1295 } 1296 1297 /* 1298 * If a leaf vdev has a DTL, and seems healthy, then kick off a 1299 * resilver. But don't do this if we are doing a reopen for a scrub, 1300 * since this would just restart the scrub we are already doing. 1301 */ 1302 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && 1303 vdev_resilver_needed(vd, NULL, NULL)) 1304 spa_async_request(spa, SPA_ASYNC_RESILVER); 1305 1306 return (0); 1307} 1308 1309/* 1310 * Called once the vdevs are all opened, this routine validates the label 1311 * contents. This needs to be done before vdev_load() so that we don't 1312 * inadvertently do repair I/Os to the wrong device. 1313 * 1314 * If 'strict' is false ignore the spa guid check. This is necessary because 1315 * if the machine crashed during a re-guid the new guid might have been written 1316 * to all of the vdev labels, but not the cached config. The strict check 1317 * will be performed when the pool is opened again using the mos config. 
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd, boolean_t strict)
{
	spa_t *spa = vd->vdev_spa;
	nvlist_t *label;
	uint64_t guid = 0, top_guid;
	uint64_t state;

	/* Validate the whole subtree first; any destroyed/exported child fails us. */
	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c], strict) != 0)
			return (SET_ERROR(EBADF));

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation.  Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
		uint64_t aux_guid = 0;
		nvlist_t *nvl;
		uint64_t txg = spa_last_synced_txg(spa) != 0 ?
		    spa_last_synced_txg(spa) : -1ULL;

		if ((label = vdev_label_read_config(vd, txg)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (0);
		}

		/*
		 * Determine if this vdev has been split off into another
		 * pool.  If so, then refuse to open it.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
		    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_SPLIT_POOL);
			nvlist_free(label);
			return (0);
		}

		if (strict && (nvlist_lookup_uint64(label,
		    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
		    guid != spa_guid(spa))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
		    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
		    &aux_guid) != 0)
			aux_guid = 0;

		/*
		 * If this vdev just became a top-level vdev because its
		 * sibling was detached, it will have adopted the parent's
		 * vdev guid -- but the label may or may not be on disk yet.
		 * Fortunately, either version of the label will have the
		 * same top guid, so if we're a top-level vdev, we can
		 * safely compare to that instead.
		 *
		 * If we split this vdev off instead, then we also check the
		 * original pool's guid.  We don't want to consider the vdev
		 * corrupt if it is partway through a split operation.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &guid) != 0 ||
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
		    &top_guid) != 0 ||
		    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		nvlist_free(label);

		/*
		 * If this is a verbatim import, no need to check the
		 * state of the pool.
		 */
		if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
		    spa_load_state(spa) == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)
			return (SET_ERROR(EBADF));

		/*
		 * If we were able to open and validate a vdev that was
		 * previously marked permanently unavailable, clear that state
		 * now.
		 */
		if (vd->vdev_not_present)
			vd->vdev_not_present = 0;
	}

	return (0);
}

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *pvd = vd->vdev_parent;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/*
	 * If our parent is reopening, then we are as well, unless we are
	 * going offline.
	 */
	if (pvd != NULL && pvd->vdev_reopening)
		vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);

	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	if (vd->vdev_ops->vdev_op_leaf)
		trim_map_destroy(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

/*
 * Recursively take a hold on every leaf vdev in the tree via its
 * vdev_op_hold op.  Only meaningful for root pools; a no-op if the pool
 * is still uninitialized.
 */
void
vdev_hold(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
		return;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_hold(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_hold(vd);
}

/*
 * Recursively release the holds taken by vdev_hold() on every leaf vdev.
 */
void
vdev_rele(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_rele(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_rele(vd);
}

/*
 * Reopen all interior vdevs and any unopened leaves.  We don't actually
 * reopen leaf vdevs which had previously been opened as they might deadlock
 * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
 * If the leaf has never been opened then open it, as usual.
 */
void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/* set the reopening flag unless we're taking the vdev offline */
	vd->vdev_reopening = !vd->vdev_offline;
	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		(void) vdev_validate_aux(vd);
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    vd->vdev_aux == &spa->spa_l2cache &&
		    !l2arc_vdev_present(vd))
			l2arc_add_vdev(spa, vd);
	} else {
		(void) vdev_validate(vd, B_TRUE);
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

/*
 * Open a newly created (or replacing) vdev tree and initialize its labels.
 * Unlike a normal open, every component must open successfully.
 */
int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

void
vdev_metaslab_set_size(vdev_t *vd)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev, but never let a metaslab
	 * shift fall below SPA_MAXBLOCKSHIFT.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
}

/*
 * Mark a top-level vdev's metaslab ('arg' with VDD_METASLAB) and/or DTL
 * ('arg' with VDD_DTL) as needing to be synced in transaction group 'txg',
 * and enter the vdev itself on the spa's per-txg dirty-vdev list.
 */
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(!vd->vdev_ishole);
	ASSERT(ISP2(flags));
	ASSERT(spa_writeable(vd->vdev_spa));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

/*
 * DTLs.
 *
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication.  There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in 'maxfaults' or more children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
 */

/*
 * Add the range [txg, txg + size) to the given DTL, if not already present.
 */
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	space_map_t *sm = &vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
	ASSERT(spa_writeable(vd->vdev_spa));

	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

/*
 * Return B_TRUE if the given DTL contains the range [txg, txg + size).
 */
boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	mutex_enter(sm->sm_lock);
	if (sm->sm_space != 0)
		dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}

/*
 * Return B_TRUE if the given DTL is empty.
 */
boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(sm->sm_lock);
	empty = (sm->sm_space == 0);
	mutex_exit(sm->sm_lock);

	return (empty);
}

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dtl_reassess(vd->vdev_child[c], txg,
		    scrub_txg, scrub_done);

	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

		mutex_enter(&vd->vdev_dtl_lock);
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started ||
		    (scn && scn->scn_phys.scn_errors == 0))) {
			/*
			 * We completed a scrub up to scrub_txg.  If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl.  Otherwise, leave the
			 * dtl as-is if there was an error.
			 *
			 * There's a little trick here: to excise the beginning
			 * of the DTL_MISSING map, we put it into a reference
			 * tree and then add a segment with refcnt -1 that
			 * covers the range [0, scrub_txg).  This means
			 * that each txg in that range has refcnt -1 or 0.
			 * We then add DTL_SCRUB with a refcnt of 2, so that
			 * entries in the range [0, scrub_txg) will have a
			 * positive refcnt -- either 1 or 2.  We then convert
			 * the reference tree into the new DTL_MISSING map.
			 */
			space_map_ref_create(&reftree);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_SCRUB], 2);
			space_map_ref_generate_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_destroy(&reftree);
		}
		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
		if (!vdev_readable(vd))
			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
		else
			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
		mutex_exit(&vd->vdev_dtl_lock);

		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	/* Interior vdev: regenerate each DTL from the children's maps. */
	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		/* account for child's outage in parent's missing map */
		int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
		if (t == DTL_SCRUB)
			continue;			/* leaf vdevs only */
		if (t == DTL_PARTIAL)
			minref = 1;			/* i.e. non-zero */
		else if (vd->vdev_nparity != 0)
			minref = vd->vdev_nparity + 1;	/* RAID-Z */
		else
			minref = vd->vdev_children;	/* any kind of mirror */
		space_map_ref_create(&reftree);
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			mutex_enter(&cvd->vdev_dtl_lock);
			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
			mutex_exit(&cvd->vdev_dtl_lock);
		}
		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
		space_map_ref_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
}

/*
 * Load a leaf vdev's DTL_MISSING map from its space map object in the MOS.
 */
static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl_smo;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
		return (error);

	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(db->db_data, smo, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
	    NULL, SM_ALLOC, smo, mos);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}

/*
 * Write a leaf vdev's DTL_MISSING map out to its space map object for 'txg'.
 * If the vdev has been detached, free the object instead.
 */
void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl_smo;
	space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
	objset_t *mos = spa->spa_meta_objset;
	space_map_t smsync;
	kmutex_t smlock;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(mos, smo->smo_object, tx);
			ASSERT0(err);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	/*
	 * Copy the DTL into a private space map under a private lock so the
	 * on-disk sync doesn't hold vdev_dtl_lock for its duration.
	 */
	bzero(&smlock, sizeof (smlock));
	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_walk(sm, space_map_add, &smsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(smo, mos, tx);
	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
	space_map_vacate(&smsync, NULL, NULL);

	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

/*
 * Determine whether the specified vdev can be offlined/detached/removed
 * without losing data.
 */
boolean_t
vdev_dtl_required(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *tvd = vd->vdev_top;
	uint8_t cant_read = vd->vdev_cant_read;
	boolean_t required;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == spa->spa_root_vdev || vd == tvd)
		return (B_TRUE);

	/*
	 * Temporarily mark the device as unreadable, and then determine
	 * whether this results in any DTL outages in the top-level vdev.
	 * If not, we can safely offline/detach/remove the device.
	 */
	vd->vdev_cant_read = B_TRUE;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
	vd->vdev_cant_read = cant_read;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);

	if (!required && zio_injection_enabled)
		required = !!zio_handle_device_injection(vd, NULL, ECHILD);

	return (required);
}

/*
 * Determine if resilver is needed, and if so the txg range.
 */
boolean_t
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
{
	boolean_t needed = B_FALSE;
	uint64_t thismin = UINT64_MAX;
	uint64_t thismax = 0;

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
		    vdev_writeable(vd)) {
			space_seg_t *ss;

			ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
			thismin = ss->ss_start - 1;
			ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
			thismax = ss->ss_end;
			needed = B_TRUE;
		}
		mutex_exit(&vd->vdev_dtl_lock);
	} else {
		/* Interior vdev: the needed range is the union of the children's. */
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			uint64_t cmin, cmax;

			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
				thismin = MIN(thismin, cmin);
				thismax = MAX(thismax, cmax);
				needed = B_TRUE;
			}
		}
	}

	if (needed && minp) {
		*minp = thismin;
		*maxp = thismax;
	}
	return (needed);
}

void
vdev_load(vdev_t *vd)
{
	/*
	 * Recursively load all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_load(vd->vdev_child[c]);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top && !vd->vdev_ishole &&
	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	    vdev_metaslab_init(vd, 0) != 0))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
}

/*
 * The special vdev case is used for hot spares and l2cache devices.  Its
 * sole purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
 */
int
vdev_validate_aux(vdev_t *vd)
{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if (!vdev_readable(vd))
		return (0);

	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    !SPA_VERSION_IS_SUPPORTED(version) ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (-1);
	}

	/*
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	nvlist_free(label);
	return (0);
}

/*
 * Free the on-disk objects (DTL space map, per-metaslab space maps, and
 * the metaslab array) belonging to this vdev, in transaction group 'txg'.
 * Called from vdev_sync() when an emptied vdev is being removed.
 */
void
vdev_remove(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	dmu_tx_t *tx;

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (vd->vdev_dtl_smo.smo_object) {
		ASSERT0(vd->vdev_dtl_smo.smo_alloc);
		(void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
		vd->vdev_dtl_smo.smo_object = 0;
	}

	if (vd->vdev_ms != NULL) {
		for (int m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			if (msp == NULL || msp->ms_smo.smo_object == 0)
				continue;

			ASSERT0(msp->ms_smo.smo_alloc);
			(void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
			msp->ms_smo.smo_object = 0;
		}
	}

	if (vd->vdev_ms_array) {
		(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
		vd->vdev_ms_array = 0;
		vd->vdev_ms_shift = 0;
	}
	dmu_tx_commit(tx);
}

/*
 * Complete the sync of 'txg': run metaslab_sync_done() on every metaslab
 * dirtied in that txg, then reassess the metaslab group if any were.
 */
void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;
	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));

	ASSERT(!vd->vdev_ishole);

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);

	if (reassess)
		metaslab_sync_reassess(vd->vdev_mg);
}

/*
 * Sync this vdev's dirty metaslabs and leaf DTLs for transaction group 'txg',
 * allocating the metaslab array object on first use.
 */
void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
		ASSERT(vd == vd->vdev_top);
		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);
		dmu_tx_commit(tx);
	}

	/*
	 * Remove the metadata associated
	 * with this vdev once it's empty.
	 */
	if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
		vdev_remove(vd, txg);

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
		metaslab_sync(msp, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
	}

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}

/*
 * Convert a physical size to an allocatable size via the vdev's asize op.
 */
uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

/*
 * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
 * not be opened, and no I/O is attempted.
 */
int
vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
	vdev_t *vd, *tvd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;

	/*
	 * We don't directly use the aux state here, but if we do a
	 * vdev_reopen(), we need this value to be present to remember why we
	 * were faulted.
	 */
	vd->vdev_label_aux = aux;

	/*
	 * Faulted state takes precedence over degraded.
	 */
	vd->vdev_delayed_close = B_FALSE;
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);

	/*
	 * If this device has the only valid copy of the data, then
	 * back off and simply mark the vdev as degraded instead.
	 */
	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
		vd->vdev_degraded = 1ULL;
		vd->vdev_faulted = 0ULL;

		/*
		 * If we reopen the device and it's not dead, only then do we
		 * mark it degraded.
		 */
		vdev_reopen(tvd);

		if (vdev_readable(vd))
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
	}

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
 * user that something is wrong.  The vdev continues to operate as normal as far
 * as I/O is concerned.
 */
int
vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
	vdev_t *vd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	/*
	 * If the vdev is already faulted, then don't do anything.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded)
		return (spa_vdev_state_exit(spa, NULL, 0));

	vd->vdev_degraded = 1ULL;
	if (!vdev_is_dead(vd))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
		    aux);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Online the given vdev.
 *
 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
 * spare device should be detached when the device finishes resilvering.
 * Second, the online should be treated like a 'test' online case, so no FMA
 * events are generated if the device fails to open.
 */
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;
	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);

	/* XXX - L2ARC 1.0 does not support expansion */
	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
	}

	vdev_reopen(tvd);
	/* checkremove/forcefault only influence the reopen just performed. */
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = B_FALSE;
	}

	if (newstate)
		*newstate = vd->vdev_state;
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {

		/* XXX - L2ARC 1.0 does not support expansion */
		if (vd->vdev_aux)
			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}
	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Worker for vdev_offline(); the caller must hold spa_vdev_top_lock (taken
 * by vdev_offline() below).  May drop and re-take the vdev state lock and
 * retry from 'top' if the config generation changes underneath us.
 */
static int
vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
	vdev_t *vd, *tvd;
	int error = 0;
	uint64_t generation;
	metaslab_group_t *mg;

top:
	spa_vdev_state_enter(spa, SCL_ALLOC);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;
	mg = tvd->vdev_mg;
	generation = spa->spa_config_generation + 1;

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device has the only valid copy of some data,
		 * don't allow it to be offlined.  Log devices are always
		 * expendable.
		 */
		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_dtl_required(vd))
			return (spa_vdev_state_exit(spa, NULL, EBUSY));

		/*
		 * If the top-level is a slog and it has had allocations
		 * then proceed.  We check that the vdev's metaslab group
		 * is not NULL since it's possible that we may have just
		 * added this vdev but not yet initialized its metaslabs.
		 */
		if (tvd->vdev_islog && mg != NULL) {
			/*
			 * Prevent any future allocations.
			 */
			metaslab_group_passivate(mg);
			(void) spa_vdev_state_exit(spa, vd, 0);

			error = spa_offline_log(spa);

			spa_vdev_state_enter(spa, SCL_ALLOC);

			/*
			 * Check to see if the config has changed.
			 */
			if (error || generation != spa->spa_config_generation) {
				metaslab_group_activate(mg);
				if (error)
					return (spa_vdev_state_exit(spa,
					    vd, error));
				(void) spa_vdev_state_exit(spa, vd, 0);
				goto top;
			}
			ASSERT0(tvd->vdev_stat.vs_alloc);
		}

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If the top-level vdev is a log device then just offline
		 * it. Otherwise, if this action results in the top-level
		 * vdev becoming unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(tvd);

		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_is_dead(tvd)) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(tvd);
			return (spa_vdev_state_exit(spa, NULL, EBUSY));
		}

		/*
		 * Add the device back into the metaslab rotor so that
		 * once we online the device it's open for business.
		 */
		if (tvd->vdev_islog && mg != NULL)
			metaslab_group_activate(mg);
	}

	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Take the given leaf vdev offline.  Serializes against other top-level
 * vdev operations via spa_vdev_top_lock; the real work happens in
 * vdev_offline_locked() above.
 */
int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
	int error;

	mutex_enter(&spa->spa_vdev_top_lock);
	error = vdev_offline_locked(spa, guid, flags);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked.  We also clear all
 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
 */
void
vdev_clear(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == NULL)
		vd = rvd;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c]);

	/* Clearing the root also clears the aux (l2cache/spare) vdevs. */
	if (vd == rvd) {
		for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
			vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);

		for (int c = 0; c < spa->spa_spares.sav_count; c++)
			vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
	}

	/*
	 * If we're in the FAULTED state or have experienced failed I/O, then
	 * clear the persistent state and attempt to reopen the device.
	 * We also mark the vdev config dirty, so that the new faulted state
	 * is written out to disk.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded ||
	    !vdev_readable(vd) || !vdev_writeable(vd)) {

		/*
		 * When reopening in response to a clear event, it may be due to
		 * a fmadm repair request.  In this case, if the device is
		 * still broken, we want to still post the ereport again.
		 */
		vd->vdev_forcefault = B_TRUE;

		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
		vd->vdev_cant_read = B_FALSE;
		vd->vdev_cant_write = B_FALSE;

		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);

		vd->vdev_forcefault = B_FALSE;

		if (vd != rvd && vdev_writeable(vd->vdev_top))
			vdev_state_dirty(vd->vdev_top);

		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
	}

	/*
	 * When clearing a FMA-diagnosed fault, we always want to
	 * unspare the device, as we assume that the original spare was
	 * done in response to the FMA fault.
	 */
	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;
}

boolean_t
vdev_is_dead(vdev_t *vd)
{
	/*
	 * Holes and missing devices are always considered "dead".
	 * This simplifies the code since we don't have to check for
	 * these types of devices in the various code paths.
	 * Instead we rely on the fact that we skip over dead devices
	 * before issuing I/O to them.
	 */
	return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
	    vd->vdev_ops == &vdev_missing_ops);
}

/*
 * A vdev is readable if it is not dead and reads are not known to fail.
 */
boolean_t
vdev_readable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
}

/*
 * A vdev is writeable if it is not dead and writes are not known to fail.
 */
boolean_t
vdev_writeable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
}

boolean_t
vdev_allocatable(vdev_t *vd)
{
	uint64_t state = vd->vdev_state;

	/*
	 * We currently allow allocations from vdevs which may be in the
	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
	 * fails to reopen then we'll catch it later when we're holding
	 * the proper locks.  Note that we have to get the vdev state
	 * in a local variable because although it changes atomically,
	 * we're asking two separate questions about it.
	 */
	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
	    !vd->vdev_cant_write && !vd->vdev_ishole);
}

/*
 * Decide whether 'vd' can service 'zio': the vdev must not be dead or
 * pending removal, and must be able to perform the zio's I/O type.
 */
boolean_t
vdev_accessible(vdev_t *vd, zio_t *zio)
{
	ASSERT(zio->io_vd == vd);

	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
		return (B_FALSE);

	if (zio->io_type == ZIO_TYPE_READ)
		return (!vd->vdev_cant_read);

	if (zio->io_type == ZIO_TYPE_WRITE)
		return (!vd->vdev_cant_write);

	return (B_TRUE);
}

/*
 * Get statistics for the given vdev.
2500 */ 2501void 2502vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 2503{ 2504 vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 2505 2506 mutex_enter(&vd->vdev_stat_lock); 2507 bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 2508 vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 2509 vs->vs_state = vd->vdev_state; 2510 vs->vs_rsize = vdev_get_min_asize(vd); 2511 if (vd->vdev_ops->vdev_op_leaf) 2512 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 2513 vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; 2514 mutex_exit(&vd->vdev_stat_lock); 2515 2516 /* 2517 * If we're getting stats on the root vdev, aggregate the I/O counts 2518 * over all top-level vdevs (i.e. the direct children of the root). 2519 */ 2520 if (vd == rvd) { 2521 for (int c = 0; c < rvd->vdev_children; c++) { 2522 vdev_t *cvd = rvd->vdev_child[c]; 2523 vdev_stat_t *cvs = &cvd->vdev_stat; 2524 2525 mutex_enter(&vd->vdev_stat_lock); 2526 for (int t = 0; t < ZIO_TYPES; t++) { 2527 vs->vs_ops[t] += cvs->vs_ops[t]; 2528 vs->vs_bytes[t] += cvs->vs_bytes[t]; 2529 } 2530 cvs->vs_scan_removing = cvd->vdev_removing; 2531 mutex_exit(&vd->vdev_stat_lock); 2532 } 2533 } 2534} 2535 2536void 2537vdev_clear_stats(vdev_t *vd) 2538{ 2539 mutex_enter(&vd->vdev_stat_lock); 2540 vd->vdev_stat.vs_space = 0; 2541 vd->vdev_stat.vs_dspace = 0; 2542 vd->vdev_stat.vs_alloc = 0; 2543 mutex_exit(&vd->vdev_stat_lock); 2544} 2545 2546void 2547vdev_scan_stat_init(vdev_t *vd) 2548{ 2549 vdev_stat_t *vs = &vd->vdev_stat; 2550 2551 for (int c = 0; c < vd->vdev_children; c++) 2552 vdev_scan_stat_init(vd->vdev_child[c]); 2553 2554 mutex_enter(&vd->vdev_stat_lock); 2555 vs->vs_scan_processed = 0; 2556 mutex_exit(&vd->vdev_stat_lock); 2557} 2558 2559void 2560vdev_stat_update(zio_t *zio, uint64_t psize) 2561{ 2562 spa_t *spa = zio->io_spa; 2563 vdev_t *rvd = spa->spa_root_vdev; 2564 vdev_t *vd = zio->io_vd ? 
	    zio->io_vd : rvd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	/*
	 * If this i/o is a gang leader, it didn't do any actual work.
	 */
	if (zio->io_gang_tree)
		return;

	if (zio->io_error == 0) {
		/*
		 * If this is a root i/o, don't count it -- we've already
		 * counted the top-level vdevs, and vdev_get_stats() will
		 * aggregate them when asked.  This reduces contention on
		 * the root vdev_stat_lock and implicitly handles blocks
		 * that compress away to holes, for which there is no i/o.
		 * (Holes never create vdev children, so all the counters
		 * remain zero, which is what we want.)
		 *
		 * Note: this only applies to successful i/o (io_error == 0)
		 * because unlike i/o counts, errors are not additive.
		 * When reading a ditto block, for example, failure of
		 * one top-level vdev does not imply a root-level error.
		 */
		if (vd == rvd)
			return;

		ASSERT(vd == zio->io_vd);

		if (flags & ZIO_FLAG_IO_BYPASS)
			return;

		mutex_enter(&vd->vdev_stat_lock);

		if (flags & ZIO_FLAG_IO_REPAIR) {
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				dsl_scan_phys_t *scn_phys =
				    &spa->spa_dsl_pool->dp_scan->scn_phys;
				uint64_t *processed = &scn_phys->scn_processed;

				/* XXX cleanup? */
				if (vd->vdev_ops->vdev_op_leaf)
					atomic_add_64(processed, psize);
				vs->vs_scan_processed += psize;
			}

			if (flags & ZIO_FLAG_SELF_HEAL)
				vs->vs_self_healed += psize;
		}

		vs->vs_ops[type]++;
		vs->vs_bytes[type] += psize;

		mutex_exit(&vd->vdev_stat_lock);
		return;
	}

	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	/*
	 * If this is an I/O error that is going to be retried, then ignore the
	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
	 * hard errors, when in reality they can happen for any number of
	 * innocuous reasons (bus resets, MPxIO link failure, etc).
	 *
	 * NOTE(review): the retried attempt carries ZIO_FLAG_IO_RETRY, so
	 * only the final (retry) attempt of an EIO is counted below.
	 */
	if (zio->io_error == EIO &&
	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
		return;

	/*
	 * Intent logs writes won't propagate their error to the root
	 * I/O so don't mark these types of failures as pool-level
	 * errors.
	 */
	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		return;

	mutex_enter(&vd->vdev_stat_lock);
	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
		if (zio->io_error == ECKSUM)
			vs->vs_checksum_errors++;
		else
			vs->vs_read_errors++;
	}
	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
		vs->vs_write_errors++;
	mutex_exit(&vd->vdev_stat_lock);

	if (type == ZIO_TYPE_WRITE && txg != 0 &&
	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
	    (flags & ZIO_FLAG_SCAN_THREAD) ||
	    spa->spa_claiming)) {
		/*
		 * This is either a normal write (not a repair), or it's
		 * a repair induced by the scrub thread, or it's a repair
		 * made by zil_claim() during spa_load() in the first txg.
		 * In the normal case, we commit the DTL change in the same
		 * txg as the block was born.  In the scrub-induced repair
		 * case, we know that scrubs run in first-pass syncing context,
		 * so we commit the DTL change in spa_syncing_txg(spa).
		 * In the zil_claim() case, we commit in spa_first_txg(spa).
		 *
		 * We currently do not make DTL entries for failed spontaneous
		 * self-healing writes triggered by normal (non-scrubbing)
		 * reads, because we have no transactional context in which to
		 * do so -- and it's not clear that it'd be desirable anyway.
		 */
		if (vd->vdev_ops->vdev_op_leaf) {
			uint64_t commit_txg = txg;
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				ASSERT(spa_sync_pass(spa) == 1);
				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
				commit_txg = spa_syncing_txg(spa);
			} else if (spa->spa_claiming) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				commit_txg = spa_first_txg(spa);
			}
			ASSERT(commit_txg >= spa_syncing_txg(spa));
			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
				return;
			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
		}
		if (vd != rvd)
			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
	}
}

/*
 * Update the in-core space usage stats for this vdev, its metaslab class,
 * and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
    int64_t space_delta)
{
	int64_t dspace_delta = space_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_group_t *mg = vd->vdev_mg;
	metaslab_class_t *mc = mg ? mg->mg_class : NULL;

	ASSERT(vd == vd->vdev_top);

	/*
	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
	 * factor.  We must calculate this here and not at the root vdev
	 * because the root vdev's psize-to-asize is simply the max of its
	 * childrens', thus not accurate enough for us.
	 */
	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
	    vd->vdev_deflate_ratio;

	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_alloc += alloc_delta;
	vd->vdev_stat.vs_space += space_delta;
	vd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&vd->vdev_stat_lock);

	/* Only normal-class space is reflected in the root vdev's totals. */
	if (mc == spa_normal_class(spa)) {
		mutex_enter(&rvd->vdev_stat_lock);
		rvd->vdev_stat.vs_alloc += alloc_delta;
		rvd->vdev_stat.vs_space += space_delta;
		rvd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&rvd->vdev_stat_lock);
	}

	if (mc != NULL) {
		ASSERT(rvd == vd->vdev_parent);
		ASSERT(vd->vdev_ms_count != 0);

		metaslab_class_space_update(mc,
		    alloc_delta, defer_delta, space_delta, dspace_delta);
	}
}

/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	ASSERT(spa_writeable(spa));

	/*
	 * If this is an aux vdev (as with l2cache and spare devices), then we
	 * update the vdev config manually and set the sync flag.
	 */
	if (vd->vdev_aux != NULL) {
		spa_aux_vdev_t *sav = vd->vdev_aux;
		nvlist_t **aux;
		uint_t naux;

		for (c = 0; c < sav->sav_count; c++) {
			if (sav->sav_vdevs[c] == vd)
				break;
		}

		if (c == sav->sav_count) {
			/*
			 * We're being removed.  There's nothing more to do.
			 */
			ASSERT(sav->sav_sync == B_TRUE);
			return;
		}

		sav->sav_sync = B_TRUE;

		/* The aux array is keyed as either l2cache or spares. */
		if (nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
		}

		ASSERT(c < naux);

		/*
		 * Setting the nvlist in the middle of the array is a little
		 * sketchy, but it will work.
		 */
		nvlist_free(aux[c]);
		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);

		return;
	}

	/*
	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
	 * must either hold SCL_CONFIG as writer, or must be the sync thread
	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!list_link_active(&vd->vdev_config_dirty_node) &&
		    !vd->vdev_ishole)
			list_insert_head(&spa->spa_config_dirty_list, vd);
	}
}

/*
 * Remove a top-level vdev from the config dirty list.  Caller must hold the
 * same locks as vdev_config_dirty() (see the comment there).
 */
void
vdev_config_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
	list_remove(&spa->spa_config_dirty_list, vd);
}

/*
 * Mark a top-level vdev's state as dirty, so that the next pass of
 * spa_sync() can convert this into vdev_config_dirty().
 * We distinguish
 * the state changes from larger config changes because they require
 * much less locking, and are often needed for administrative actions.
 */
void
vdev_state_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_writeable(spa));
	ASSERT(vd == vd->vdev_top);

	/*
	 * The state list is protected by the SCL_STATE lock.  The caller
	 * must either hold SCL_STATE as writer, or must be the sync thread
	 * (which holds SCL_STATE as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
		list_insert_head(&spa->spa_state_dirty_list, vd);
}

/*
 * Remove a top-level vdev from the state dirty list.  Caller must hold the
 * same locks as vdev_state_dirty() (see the comment there).
 */
void
vdev_state_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
	list_remove(&spa->spa_state_dirty_list, vd);
}

/*
 * Propagate vdev state up from children to parent.
 */
void
vdev_propagate_state(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		for (int c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];

			/*
			 * Don't factor holes into the decision.
			 */
			if (child->vdev_ishole)
				continue;

			if (!vdev_readable(child) ||
			    (!vdev_writeable(child) && spa_writeable(spa))) {
				/*
				 * Root special: if there is a top-level log
				 * device, treat the root vdev as if it were
				 * degraded.
				 */
				if (child->vdev_islog && vd == rvd)
					degraded++;
				else
					faulted++;
			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
				degraded++;
			}

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		/* Let the vdev type (mirror, raidz, ...) judge the counts. */
		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Set a vdev's state.  If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport is
 * generated.
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	if (state == vd->vdev_state) {
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device unless the device has requested
	 * a delayed close (i.e.
 we're about to remove or fault the device).
	 * Otherwise, we keep accessible but invalid devices open forever.
	 * We don't call vdev_close() itself, because that implies some extra
	 * checks (offline, etc) that we don't want here.  This is limited to
	 * leaf devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
	    vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	/*
	 * If we have brought this vdev back into service, we need
	 * to notify fmd so that it can gracefully repair any outstanding
	 * cases due to a missing device.  We do this in all cases, even those
	 * that probably don't correlate to a repaired fault.  This is sure to
	 * catch all cases, and we let the zfs-retire agent sort it out.  If
	 * this is a transient state it's OK, as the retire agent will
	 * double-check the state of the vdev before repairing it.
	 */
	if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
	    vd->vdev_prevstate != state)
		zfs_post_state_change(spa, vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We also let this be if
		 * it is one of our special test online cases, which is only
		 * attempting to online the device and shouldn't generate an FMA
		 * fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import or recovery, we
		 * mark it as "not available", which signifies that it was
		 * never there to begin with.  Failure to open such a device
		 * is not considered an error.
		 */
		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event.  If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {
			const char *class;

			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	if (!isopen && vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Check the vdev configuration to ensure that it's capable of supporting
 * a root pool.
 *
 * On Solaris, we do not support RAID-Z or partial configuration.  In
 * addition, only a single top-level vdev is allowed and none of the
 * leaves can be wholedisks.
 *
 * For FreeBSD, we can boot from any configuration.  There is a
 * limitation that the boot filesystem must be either uncompressed or
 * compressed with lzjb compression but I'm not sure how to enforce
 * that here.
3087 */ 3088boolean_t 3089vdev_is_bootable(vdev_t *vd) 3090{ 3091#ifdef sun 3092 if (!vd->vdev_ops->vdev_op_leaf) { 3093 char *vdev_type = vd->vdev_ops->vdev_op_type; 3094 3095 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 3096 vd->vdev_children > 1) { 3097 return (B_FALSE); 3098 } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 3099 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 3100 return (B_FALSE); 3101 } 3102 } else if (vd->vdev_wholedisk == 1) { 3103 return (B_FALSE); 3104 } 3105 3106 for (int c = 0; c < vd->vdev_children; c++) { 3107 if (!vdev_is_bootable(vd->vdev_child[c])) 3108 return (B_FALSE); 3109 } 3110#endif /* sun */ 3111 return (B_TRUE); 3112} 3113 3114/* 3115 * Load the state from the original vdev tree (ovd) which 3116 * we've retrieved from the MOS config object. If the original 3117 * vdev was offline or faulted then we transfer that state to the 3118 * device in the current vdev tree (nvd). 3119 */ 3120void 3121vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3122{ 3123 spa_t *spa = nvd->vdev_spa; 3124 3125 ASSERT(nvd->vdev_top->vdev_islog); 3126 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3127 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3128 3129 for (int c = 0; c < nvd->vdev_children; c++) 3130 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3131 3132 if (nvd->vdev_ops->vdev_op_leaf) { 3133 /* 3134 * Restore the persistent vdev state 3135 */ 3136 nvd->vdev_offline = ovd->vdev_offline; 3137 nvd->vdev_faulted = ovd->vdev_faulted; 3138 nvd->vdev_degraded = ovd->vdev_degraded; 3139 nvd->vdev_removed = ovd->vdev_removed; 3140 } 3141} 3142 3143/* 3144 * Determine if a log device has valid content. If the vdev was 3145 * removed or faulted in the MOS config then we know that 3146 * the content on the log device has already been written to the pool. 
3147 */ 3148boolean_t 3149vdev_log_state_valid(vdev_t *vd) 3150{ 3151 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3152 !vd->vdev_removed) 3153 return (B_TRUE); 3154 3155 for (int c = 0; c < vd->vdev_children; c++) 3156 if (vdev_log_state_valid(vd->vdev_child[c])) 3157 return (B_TRUE); 3158 3159 return (B_FALSE); 3160} 3161 3162/* 3163 * Expand a vdev if possible. 3164 */ 3165void 3166vdev_expand(vdev_t *vd, uint64_t txg) 3167{ 3168 ASSERT(vd->vdev_top == vd); 3169 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3170 3171 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { 3172 VERIFY(vdev_metaslab_init(vd, txg) == 0); 3173 vdev_config_dirty(vd); 3174 } 3175} 3176 3177/* 3178 * Split a vdev. 3179 */ 3180void 3181vdev_split(vdev_t *vd) 3182{ 3183 vdev_t *cvd, *pvd = vd->vdev_parent; 3184 3185 vdev_remove_child(pvd, vd); 3186 vdev_compact_children(pvd); 3187 3188 cvd = pvd->vdev_child[0]; 3189 if (pvd->vdev_children == 1) { 3190 vdev_remove_parent(cvd); 3191 cvd->vdev_splitting = B_TRUE; 3192 } 3193 vdev_propagate_state(cvd); 3194} 3195 3196void 3197vdev_deadman(vdev_t *vd) 3198{ 3199 for (int c = 0; c < vd->vdev_children; c++) { 3200 vdev_t *cvd = vd->vdev_child[c]; 3201 3202 vdev_deadman(cvd); 3203 } 3204 3205 if (vd->vdev_ops->vdev_op_leaf) { 3206 vdev_queue_t *vq = &vd->vdev_queue; 3207 3208 mutex_enter(&vq->vq_lock); 3209 if (avl_numnodes(&vq->vq_pending_tree) > 0) { 3210 spa_t *spa = vd->vdev_spa; 3211 zio_t *fio; 3212 uint64_t delta; 3213 3214 /* 3215 * Look at the head of all the pending queues, 3216 * if any I/O has been outstanding for longer than 3217 * the spa_deadman_synctime we panic the system. 
3218 */ 3219 fio = avl_first(&vq->vq_pending_tree); 3220 delta = gethrtime() - fio->io_timestamp; 3221 if (delta > spa_deadman_synctime(spa)) { 3222 zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " 3223 "delta %lluns, last io %lluns", 3224 fio->io_timestamp, delta, 3225 vq->vq_io_complete_ts); 3226 fm_panic("I/O to pool '%s' appears to be " 3227 "hung on vdev guid %llu at '%s'.", 3228 spa_name(spa), 3229 (long long unsigned int) vd->vdev_guid, 3230 vd->vdev_path); 3231 } 3232 } 3233 mutex_exit(&vq->vq_lock); 3234 } 3235} 3236