/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright 2017 Joyent, Inc.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/bpobj.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_dir.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/space_reftree.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/trim_map.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");

/*
 * Virtual device management.
 */

/*
 * The limit for ZFS to automatically increase a top-level vdev's ashift
 * from logical ashift to physical ashift.
 *
 * Example: one or more 512B emulation child vdevs
 *		child->vdev_ashift = 9 (512 bytes)
 *		child->vdev_physical_ashift = 12 (4096 bytes)
 *		zfs_max_auto_ashift = 11 (2048 bytes)
 *		zfs_min_auto_ashift = 9 (512 bytes)
 *
 * On pool creation or the addition of a new top-level vdev, ZFS will
 * increase the ashift of the top-level vdev to 11 (2048 bytes), as
 * limited by zfs_max_auto_ashift.
 *
 * Example: one or more 512B emulation child vdevs
 *		child->vdev_ashift = 9 (512 bytes)
 *		child->vdev_physical_ashift = 12 (4096 bytes)
 *		zfs_max_auto_ashift = 13 (8192 bytes)
 *		zfs_min_auto_ashift = 9 (512 bytes)
 *
 * On pool creation or the addition of a new top-level vdev, ZFS will
 * increase the ashift of the top-level vdev to 12 (4096 bytes) to match
 * the max vdev_physical_ashift.
 *
 * Example: one or more 512B native child vdevs
 *		child->vdev_ashift = 9 (512 bytes)
 *		child->vdev_physical_ashift = 9 (512 bytes)
 *		zfs_max_auto_ashift = 13 (8192 bytes)
 *		zfs_min_auto_ashift = 12 (4096 bytes)
 *
 * On pool creation or the addition of a new top-level vdev, ZFS will
 * increase the ashift of the top-level vdev to 12 (4096 bytes) to match
 * zfs_min_auto_ashift.
 */
static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT;

static int
sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_max_auto_ashift;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift)
		return (EINVAL);

	zfs_max_auto_ashift = val;

	return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_max_auto_ashift, "QU",
    "Max ashift used when optimizing for logical -> physical sector size on "
    "new top-level vdevs.");

static int
sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_min_auto_ashift;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift)
		return (EINVAL);

	zfs_min_auto_ashift = val;

	return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_min_auto_ashift, "QU",
    "Min ashift used when creating new top-level vdevs.");

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
#ifdef _KERNEL
	&vdev_geom_ops,
#else
	&vdev_disk_ops,
#endif
	&vdev_file_ops,
	&vdev_missing_ops,
	&vdev_hole_ops,
	&vdev_indirect_ops,
	NULL
};

/*
 * When a vdev is added, it will be divided into approximately (but no
 * more than) this number of metaslabs.
 */
int metaslabs_per_vdev = 200;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN,
    &metaslabs_per_vdev, 0,
    "When a vdev is added, how many metaslabs the vdev should be divided into");

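/*
 * Example (illustrative, not part of the original source): the ashift
 * limits declared above are exposed on FreeBSD as the sysctls
 * vfs.zfs.min_auto_ashift and vfs.zfs.max_auto_ashift, so a pool
 * intended for 4KiB-sector disks can be forced to ashift >= 12 before
 * creation with:
 *
 *	sysctl vfs.zfs.min_auto_ashift=12
 *	zpool create tank da0 da1
 *
 * The handlers reject values outside [SPA_MINASHIFT, SPA_MAXASHIFT],
 * or values that would invert min > max, with EINVAL.
 */
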
/*PRINTFLIKE2*/
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
{
	va_list adx;
	char buf[256];

	va_start(adx, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
	va_end(adx);

	if (vd->vdev_path != NULL) {
		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
		    vd->vdev_path, buf);
	} else {
		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
		    vd->vdev_ops->vdev_op_type,
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)vd->vdev_guid, buf);
	}
}

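/*
 * Usage sketch (illustrative, not part of the original source):
 *
 *	vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error);
 *
 * The formatted message lands in the zfs_dbgmsg() buffer, prefixed with
 * the vdev path when one is known and with the vdev type, id, and guid
 * otherwise.
 */
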
/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;

	for (int c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}

/*
 * Get the minimum allocatable size.  We define the allocatable size as
 * the vdev's asize rounded to the nearest metaslab.  This allows us to
 * replace or attach devices which don't have the same physical size but
 * can still satisfy the same number of allocations.
 */
uint64_t
vdev_get_min_asize(vdev_t *vd)
{
	vdev_t *pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL (inactive spare or cache) or is the root,
	 * just return our own asize.
	 */
	if (pvd == NULL)
		return (vd->vdev_asize);

	/*
	 * The top-level vdev just returns the allocatable size rounded
	 * to the nearest metaslab.
	 */
	if (vd == vd->vdev_top)
		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));

	/*
	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
	 * so each child must provide at least 1/Nth of its asize.
	 */
	if (pvd->vdev_ops == &vdev_raidz_ops)
		return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
		    pvd->vdev_children);

	return (pvd->vdev_min_asize);
}

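/*
 * Worked example (illustrative, not part of the original source): for a
 * 6-child raidz vdev whose min_asize is 599 units, the ceiling division
 * above yields (599 + 6 - 1) / 6 = 100, so each child must supply at
 * least 100 units rather than the 99 a truncating divide would allow.
 */
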
void
vdev_set_min_asize(vdev_t *vd)
{
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_set_min_asize(vd->vdev_child[c]);
}

vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (vdev < rvd->vdev_children) {
		ASSERT(rvd->vdev_child[vdev] != NULL);
		return (rvd->vdev_child[vdev]);
	}

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (int c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

static int
vdev_count_leaves_impl(vdev_t *vd)
{
	int n = 0;

	if (vd->vdev_ops->vdev_op_leaf)
		return (1);

	for (int c = 0; c < vd->vdev_children; c++)
		n += vdev_count_leaves_impl(vd->vdev_child[c]);

	return (n);
}

int
vdev_count_leaves(spa_t *spa)
{
	return (vdev_count_leaves_impl(spa->spa_root_vdev));
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;
	spa_t *spa = cvd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}

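/*
 * Note (illustrative, not part of the original source): the child array
 * above is zero-filled and indexed by vdev_id, so adding a child with
 * id 3 to a parent that previously had two children leaves
 * vdev_child[2] == NULL.  vdev_compact_children() below squeezes such
 * holes back out.
 */
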
void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc;

	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	for (int c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (int c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;
	vdev_indirect_config_t *vic;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
	vic = &vd->vdev_indirect_config;

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_guid(NULL);
	}

	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);
	vic->vic_prev_indirect_vdev = UINT64_MAX;

	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
	vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
	for (int t = 0; t < DTL_TYPES; t++) {
		vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
	}
	txg_list_create(&vd->vdev_ms_list, spa,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list, spa,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}

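/*
 * Example (illustrative, not part of the original source): callers that
 * interpose vdevs, e.g. vdev_add_parent() below, pass a guid of 0 here
 * and rely on the spa_generate_guid() paths above to mint a fresh
 * pool-unique guid for the new vdev.
 */
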
/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;
	vdev_indirect_config_t *vic;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (SET_ERROR(EINVAL));

	if ((ops = vdev_getops(type)) == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (SET_ERROR(EINVAL));

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (SET_ERROR(ENOTSUP));

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
		return (SET_ERROR(ENOTSUP));

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
				return (SET_ERROR(EINVAL));
			/*
			 * Previous versions could only support 1 or 2 parity
			 * devices.
			 */
			if (nparity > 1 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ2)
				return (SET_ERROR(ENOTSUP));
			if (nparity > 2 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ3)
				return (SET_ERROR(ENOTSUP));
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
				return (SET_ERROR(EINVAL));
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);
	vic = &vd->vdev_indirect_config;

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
		vd->vdev_fru = spa_strdup(vd->vdev_fru);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	ASSERT0(vic->vic_mapping_object);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
	    &vic->vic_mapping_object);
	ASSERT0(vic->vic_births_object);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
	    &vic->vic_births_object);
	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
	    &vic->vic_prev_indirect_vdev);

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * Retrieve the vdev creation time.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	    &vd->vdev_crtxg);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
		    &vd->vdev_removing);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
		    &vd->vdev_top_zap);
	} else {
		ASSERT0(vd->vdev_top_zap);
	}

	if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
		    alloctype == VDEV_ALLOC_ADD ||
		    alloctype == VDEV_ALLOC_SPLIT ||
		    alloctype == VDEV_ALLOC_ROOTPOOL);
		vd->vdev_mg = metaslab_group_create(islog ?
		    spa_log_class(spa) : spa_normal_class(spa), vd);
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv,
		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
	} else {
		ASSERT0(vd->vdev_leaf_zap);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */

	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}

		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			uint64_t spare = 0;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
			    &spare) == 0 && spare)
				spa_spare_add(vd);
		}

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
		    &vd->vdev_resilver_txg);

		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context.  Local vdevs will
		 * remain in the faulted state.
		 */
		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);

			if (vd->vdev_faulted || vd->vdev_degraded) {
				char *aux;

				vd->vdev_label_aux =
				    VDEV_AUX_ERR_EXCEEDED;
				if (nvlist_lookup_string(nv,
				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
				    strcmp(aux, "external") == 0)
					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
			}
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}

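/*
 * Example (illustrative, not part of the original source): a minimal
 * leaf-vdev config nvlist consumed by vdev_alloc() above carries pairs
 * such as
 *
 *	type="disk" id=0 guid=<guid> path="/dev/da0" ashift=12
 *
 * (the ZPOOL_CONFIG_* string keys); which pairs must be present depends
 * on the alloctype, as checked at the top of the function.
 */
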
void
vdev_free(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	 * Free all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
	}

	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);
	if (vd->vdev_fru)
		spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_close(vd->vdev_dtl_sm);
	for (int t = 0; t < DTL_TYPES; t++) {
		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
		range_tree_destroy(vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	EQUIV(vd->vdev_indirect_births != NULL,
	    vd->vdev_indirect_mapping != NULL);
	if (vd->vdev_indirect_births != NULL) {
		vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
		vdev_indirect_births_close(vd->vdev_indirect_births);
	}

	if (vd->vdev_obsolete_sm != NULL) {
		ASSERT(vd->vdev_removing ||
		    vd->vdev_ops == &vdev_indirect_ops);
		space_map_close(vd->vdev_obsolete_sm);
		vd->vdev_obsolete_sm = NULL;
	}
	range_tree_destroy(vd->vdev_obsolete_segments);
	rw_destroy(&vd->vdev_indirect_rwlock);
	mutex_destroy(&vd->vdev_obsolete_lock);

	mutex_destroy(&vd->vdev_queue_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

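/*
 * Note (illustrative, not part of the original source): vdev_free()
 * recurses over vdev_child[] before tearing itself down, so freeing the
 * root of any subtree releases the entire subtree; the guid-sum
 * assertion above holds because each freed child detaches itself via
 * vdev_remove_child(), which subtracts its guid sum from the parent.
 */
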
/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;
	tvd->vdev_top_zap = svd->vdev_top_zap;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;
	svd->vdev_top_zap = 0;

	if (tvd->vdev_mg)
		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_max_asize = cvd->vdev_max_asize;
	mvd->vdev_psize = cvd->vdev_psize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

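/*
 * Example (illustrative, not part of the original source): during
 * "zpool attach", a replacing (or mirror) vdev is interposed above the
 * existing leaf via vdev_add_parent(); once the resilver completes and
 * a single child remains, the interposition is undone by
 * vdev_remove_parent() below.
 */
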
/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;
	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);

	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
	 */
	if (mvd->vdev_top == mvd) {
		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
		cvd->vdev_orig_guid = cvd->vdev_guid;
		cvd->vdev_guid += guid_delta;
		cvd->vdev_guid_sum += guid_delta;
	}
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	/*
	 * This vdev is not being allocated from yet or is a hole.
	 */
	if (vd->vdev_ms_shift == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	ASSERT(oldc <= newc);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		uint64_t object = 0;

		/*
		 * vdev_ms_array may be 0 if we are creating the "fake"
		 * metaslabs for an indirect vdev for zdb's leak detection.
		 * See zdb_leak_init().
		 */
		if (txg == 0 && vd->vdev_ms_array != 0) {
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
			    DMU_READ_PREFETCH);
| 196/* 197 * Given a vdev type, return the appropriate ops vector. 198 */ 199static vdev_ops_t * 200vdev_getops(const char *type) 201{ 202 vdev_ops_t *ops, **opspp; 203 204 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 205 if (strcmp(ops->vdev_op_type, type) == 0) 206 break; 207 208 return (ops); 209} 210 211/* 212 * Default asize function: return the MAX of psize with the asize of 213 * all children. This is what's used by anything other than RAID-Z. 214 */ 215uint64_t 216vdev_default_asize(vdev_t *vd, uint64_t psize) 217{ 218 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 219 uint64_t csize; 220 221 for (int c = 0; c < vd->vdev_children; c++) { 222 csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 223 asize = MAX(asize, csize); 224 } 225 226 return (asize); 227} 228 229/* 230 * Get the minimum allocatable size. We define the allocatable size as 231 * the vdev's asize rounded to the nearest metaslab. This allows us to 232 * replace or attach devices which don't have the same physical size but 233 * can still satisfy the same number of allocations. 234 */ 235uint64_t 236vdev_get_min_asize(vdev_t *vd) 237{ 238 vdev_t *pvd = vd->vdev_parent; 239 240 /* 241 * If our parent is NULL (inactive spare or cache) or is the root, 242 * just return our own asize. 243 */ 244 if (pvd == NULL) 245 return (vd->vdev_asize); 246 247 /* 248 * The top-level vdev just returns the allocatable size rounded 249 * to the nearest metaslab. 250 */ 251 if (vd == vd->vdev_top) 252 return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); 253 254 /* 255 * The allocatable space for a raidz vdev is N * sizeof(smallest child), 256 * so each child must provide at least 1/Nth of its asize. 257 */ 258 if (pvd->vdev_ops == &vdev_raidz_ops) 259 return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / 260 pvd->vdev_children); 261 262 return (pvd->vdev_min_asize); 263} 264 265void 266vdev_set_min_asize(vdev_t *vd) 267{ 268 vd->vdev_min_asize = vdev_get_min_asize(vd); 269 270 for (int c = 0; c < vd->vdev_children; c++) 271 vdev_set_min_asize(vd->vdev_child[c]); 272} 273 274vdev_t * 275vdev_lookup_top(spa_t *spa, uint64_t vdev) 276{ 277 vdev_t *rvd = spa->spa_root_vdev; 278 279 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 280 281 if (vdev < rvd->vdev_children) { 282 ASSERT(rvd->vdev_child[vdev] != NULL); 283 return (rvd->vdev_child[vdev]); 284 } 285 286 return (NULL); 287} 288 289vdev_t * 290vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 291{ 292 vdev_t *mvd; 293 294 if (vd->vdev_guid == guid) 295 return (vd); 296 297 for (int c = 0; c < vd->vdev_children; c++) 298 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 299 NULL) 300 return (mvd); 301 302 return (NULL); 303} 304 305static int 306vdev_count_leaves_impl(vdev_t *vd) 307{ 308 int n = 0; 309 310 if (vd->vdev_ops->vdev_op_leaf) 311 return (1); 312 313 for (int c = 0; c < vd->vdev_children; c++) 314 n += vdev_count_leaves_impl(vd->vdev_child[c]); 315 316 return (n); 317} 318 319int 320vdev_count_leaves(spa_t *spa) 321{ 322 return (vdev_count_leaves_impl(spa->spa_root_vdev)); 323} 324 325void 326vdev_add_child(vdev_t *pvd, vdev_t *cvd) 327{ 328 size_t oldsize, newsize; 329 uint64_t id = cvd->vdev_id; 330 vdev_t **newchild; 331 spa_t *spa = cvd->vdev_spa; 332 333 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 334 ASSERT(cvd->vdev_parent == NULL); 335 336 cvd->vdev_parent = pvd; 337 338 if (pvd == NULL) 339 return; 340 341 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 342 343 
oldsize = pvd->vdev_children * sizeof (vdev_t *); 344 pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 345 newsize = pvd->vdev_children * sizeof (vdev_t *); 346 347 newchild = kmem_zalloc(newsize, KM_SLEEP); 348 if (pvd->vdev_child != NULL) { 349 bcopy(pvd->vdev_child, newchild, oldsize); 350 kmem_free(pvd->vdev_child, oldsize); 351 } 352 353 pvd->vdev_child = newchild; 354 pvd->vdev_child[id] = cvd; 355 356 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 357 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 358 359 /* 360 * Walk up all ancestors to update guid sum. 361 */ 362 for (; pvd != NULL; pvd = pvd->vdev_parent) 363 pvd->vdev_guid_sum += cvd->vdev_guid_sum; 364} 365 366void 367vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 368{ 369 int c; 370 uint_t id = cvd->vdev_id; 371 372 ASSERT(cvd->vdev_parent == pvd); 373 374 if (pvd == NULL) 375 return; 376 377 ASSERT(id < pvd->vdev_children); 378 ASSERT(pvd->vdev_child[id] == cvd); 379 380 pvd->vdev_child[id] = NULL; 381 cvd->vdev_parent = NULL; 382 383 for (c = 0; c < pvd->vdev_children; c++) 384 if (pvd->vdev_child[c]) 385 break; 386 387 if (c == pvd->vdev_children) { 388 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 389 pvd->vdev_child = NULL; 390 pvd->vdev_children = 0; 391 } 392 393 /* 394 * Walk up all ancestors to update guid sum. 395 */ 396 for (; pvd != NULL; pvd = pvd->vdev_parent) 397 pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 398} 399 400/* 401 * Remove any holes in the child array. 402 */ 403void 404vdev_compact_children(vdev_t *pvd) 405{ 406 vdev_t **newchild, *cvd; 407 int oldc = pvd->vdev_children; 408 int newc; 409 410 ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 411 412 for (int c = newc = 0; c < oldc; c++) 413 if (pvd->vdev_child[c]) 414 newc++; 415 416 newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 417 418 for (int c = newc = 0; c < oldc; c++) { 419 if ((cvd = pvd->vdev_child[c]) != NULL) { 420 newchild[newc] = cvd; 421 cvd->vdev_id = newc++; 422 } 423 } 424 425 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 426 pvd->vdev_child = newchild; 427 pvd->vdev_children = newc; 428} 429 430/* 431 * Allocate and minimally initialize a vdev_t. 432 */ 433vdev_t * 434vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 435{ 436 vdev_t *vd; 437 vdev_indirect_config_t *vic; 438 439 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 440 vic = &vd->vdev_indirect_config; 441 442 if (spa->spa_root_vdev == NULL) { 443 ASSERT(ops == &vdev_root_ops); 444 spa->spa_root_vdev = vd; 445 spa->spa_load_guid = spa_generate_guid(NULL); 446 } 447 448 if (guid == 0 && ops != &vdev_hole_ops) { 449 if (spa->spa_root_vdev == vd) { 450 /* 451 * The root vdev's guid will also be the pool guid, 452 * which must be unique among all pools. 453 */ 454 guid = spa_generate_guid(NULL); 455 } else { 456 /* 457 * Any other vdev's guid must be unique within the pool. 
458 */ 459 guid = spa_generate_guid(spa); 460 } 461 ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 462 } 463 464 vd->vdev_spa = spa; 465 vd->vdev_id = id; 466 vd->vdev_guid = guid; 467 vd->vdev_guid_sum = guid; 468 vd->vdev_ops = ops; 469 vd->vdev_state = VDEV_STATE_CLOSED; 470 vd->vdev_ishole = (ops == &vdev_hole_ops); 471 vic->vic_prev_indirect_vdev = UINT64_MAX; 472 473 rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); 474 mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); 475 vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); 476 477 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 478 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 479 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); 480 mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); 481 for (int t = 0; t < DTL_TYPES; t++) { 482 vd->vdev_dtl[t] = range_tree_create(NULL, NULL); 483 } 484 txg_list_create(&vd->vdev_ms_list, spa, 485 offsetof(struct metaslab, ms_txg_node)); 486 txg_list_create(&vd->vdev_dtl_list, spa, 487 offsetof(struct vdev, vdev_dtl_node)); 488 vd->vdev_stat.vs_timestamp = gethrtime(); 489 vdev_queue_init(vd); 490 vdev_cache_init(vd); 491 492 return (vd); 493} 494 495/* 496 * Allocate a new vdev. The 'alloctype' is used to control whether we are 497 * creating a new vdev or loading an existing one - the behavior is slightly 498 * different for each case. 499 */ 500int 501vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 502 int alloctype) 503{ 504 vdev_ops_t *ops; 505 char *type; 506 uint64_t guid = 0, islog, nparity; 507 vdev_t *vd; 508 vdev_indirect_config_t *vic; 509 510 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 511 512 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 513 return (SET_ERROR(EINVAL)); 514 515 if ((ops = vdev_getops(type)) == NULL) 516 return (SET_ERROR(EINVAL)); 517 518 /* 519 * If this is a load, get the vdev guid from the nvlist. 520 * Otherwise, vdev_alloc_common() will generate one for us. 521 */ 522 if (alloctype == VDEV_ALLOC_LOAD) { 523 uint64_t label_id; 524 525 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 526 label_id != id) 527 return (SET_ERROR(EINVAL)); 528 529 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 530 return (SET_ERROR(EINVAL)); 531 } else if (alloctype == VDEV_ALLOC_SPARE) { 532 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 533 return (SET_ERROR(EINVAL)); 534 } else if (alloctype == VDEV_ALLOC_L2CACHE) { 535 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 536 return (SET_ERROR(EINVAL)); 537 } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { 538 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 539 return (SET_ERROR(EINVAL)); 540 } 541 542 /* 543 * The first allocated vdev must be of type 'root'. 544 */ 545 if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 546 return (SET_ERROR(EINVAL)); 547 548 /* 549 * Determine whether we're a log vdev. 550 */ 551 islog = 0; 552 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 553 if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 554 return (SET_ERROR(ENOTSUP)); 555 556 if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) 557 return (SET_ERROR(ENOTSUP)); 558 559 /* 560 * Set the nparity property for RAID-Z vdevs. 
561 */ 562 nparity = -1ULL; 563 if (ops == &vdev_raidz_ops) { 564 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 565 &nparity) == 0) { 566 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 567 return (SET_ERROR(EINVAL)); 568 /* 569 * Previous versions could only support 1 or 2 parity 570 * device. 571 */ 572 if (nparity > 1 && 573 spa_version(spa) < SPA_VERSION_RAIDZ2) 574 return (SET_ERROR(ENOTSUP)); 575 if (nparity > 2 && 576 spa_version(spa) < SPA_VERSION_RAIDZ3) 577 return (SET_ERROR(ENOTSUP)); 578 } else { 579 /* 580 * We require the parity to be specified for SPAs that 581 * support multiple parity levels. 582 */ 583 if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 584 return (SET_ERROR(EINVAL)); 585 /* 586 * Otherwise, we default to 1 parity device for RAID-Z. 587 */ 588 nparity = 1; 589 } 590 } else { 591 nparity = 0; 592 } 593 ASSERT(nparity != -1ULL); 594 595 vd = vdev_alloc_common(spa, id, guid, ops); 596 vic = &vd->vdev_indirect_config; 597 598 vd->vdev_islog = islog; 599 vd->vdev_nparity = nparity; 600 601 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 602 vd->vdev_path = spa_strdup(vd->vdev_path); 603 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 604 vd->vdev_devid = spa_strdup(vd->vdev_devid); 605 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 606 &vd->vdev_physpath) == 0) 607 vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 608 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) 609 vd->vdev_fru = spa_strdup(vd->vdev_fru); 610 611 /* 612 * Set the whole_disk property. If it's not specified, leave the value 613 * as -1. 614 */ 615 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 616 &vd->vdev_wholedisk) != 0) 617 vd->vdev_wholedisk = -1ULL; 618 619 ASSERT0(vic->vic_mapping_object); 620 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, 621 &vic->vic_mapping_object); 622 ASSERT0(vic->vic_births_object); 623 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, 624 &vic->vic_births_object); 625 ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); 626 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, 627 &vic->vic_prev_indirect_vdev); 628 629 /* 630 * Look for the 'not present' flag. This will only be set if the device 631 * was not present at the time of import. 632 */ 633 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 634 &vd->vdev_not_present); 635 636 /* 637 * Get the alignment requirement. 638 */ 639 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 640 641 /* 642 * Retrieve the vdev creation time. 643 */ 644 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 645 &vd->vdev_crtxg); 646 647 /* 648 * If we're a top-level vdev, try to load the allocation parameters. 
649 */ 650 if (parent && !parent->vdev_parent && 651 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { 652 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 653 &vd->vdev_ms_array); 654 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 655 &vd->vdev_ms_shift); 656 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 657 &vd->vdev_asize); 658 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, 659 &vd->vdev_removing); 660 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, 661 &vd->vdev_top_zap); 662 } else { 663 ASSERT0(vd->vdev_top_zap); 664 } 665 666 if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { 667 ASSERT(alloctype == VDEV_ALLOC_LOAD || 668 alloctype == VDEV_ALLOC_ADD || 669 alloctype == VDEV_ALLOC_SPLIT || 670 alloctype == VDEV_ALLOC_ROOTPOOL); 671 vd->vdev_mg = metaslab_group_create(islog ? 672 spa_log_class(spa) : spa_normal_class(spa), vd); 673 } 674 675 if (vd->vdev_ops->vdev_op_leaf && 676 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { 677 (void) nvlist_lookup_uint64(nv, 678 ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); 679 } else { 680 ASSERT0(vd->vdev_leaf_zap); 681 } 682 683 /* 684 * If we're a leaf vdev, try to load the DTL object and other state. 685 */ 686 687 if (vd->vdev_ops->vdev_op_leaf && 688 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || 689 alloctype == VDEV_ALLOC_ROOTPOOL)) { 690 if (alloctype == VDEV_ALLOC_LOAD) { 691 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 692 &vd->vdev_dtl_object); 693 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 694 &vd->vdev_unspare); 695 } 696 697 if (alloctype == VDEV_ALLOC_ROOTPOOL) { 698 uint64_t spare = 0; 699 700 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 701 &spare) == 0 && spare) 702 spa_spare_add(vd); 703 } 704 705 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 706 &vd->vdev_offline); 707 708 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, 709 &vd->vdev_resilver_txg); 710 711 /* 712 * When importing a pool, we want to ignore the persistent fault 713 * state, as the diagnosis made on another system may not be 714 * valid in the current context. Local vdevs will 715 * remain in the faulted state. 716 */ 717 if (spa_load_state(spa) == SPA_LOAD_OPEN) { 718 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 719 &vd->vdev_faulted); 720 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 721 &vd->vdev_degraded); 722 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 723 &vd->vdev_removed); 724 725 if (vd->vdev_faulted || vd->vdev_degraded) { 726 char *aux; 727 728 vd->vdev_label_aux = 729 VDEV_AUX_ERR_EXCEEDED; 730 if (nvlist_lookup_string(nv, 731 ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && 732 strcmp(aux, "external") == 0) 733 vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 734 } 735 } 736 } 737 738 /* 739 * Add ourselves to the parent's list of children. 740 */ 741 vdev_add_child(parent, vd); 742 743 *vdp = vd; 744 745 return (0); 746} 747 748void 749vdev_free(vdev_t *vd) 750{ 751 spa_t *spa = vd->vdev_spa; 752 753 /* 754 * vdev_free() implies closing the vdev first. This is simpler than 755 * trying to ensure complicated semantics for all callers. 756 */ 757 vdev_close(vd); 758 759 ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 760 ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 761 762 /* 763 * Free all children. 
764 */ 765 for (int c = 0; c < vd->vdev_children; c++) 766 vdev_free(vd->vdev_child[c]); 767 768 ASSERT(vd->vdev_child == NULL); 769 ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 770 771 /* 772 * Discard allocation state. 773 */ 774 if (vd->vdev_mg != NULL) { 775 vdev_metaslab_fini(vd); 776 metaslab_group_destroy(vd->vdev_mg); 777 } 778 779 ASSERT0(vd->vdev_stat.vs_space); 780 ASSERT0(vd->vdev_stat.vs_dspace); 781 ASSERT0(vd->vdev_stat.vs_alloc); 782 783 /* 784 * Remove this vdev from its parent's child list. 785 */ 786 vdev_remove_child(vd->vdev_parent, vd); 787 788 ASSERT(vd->vdev_parent == NULL); 789 790 /* 791 * Clean up vdev structure. 792 */ 793 vdev_queue_fini(vd); 794 vdev_cache_fini(vd); 795 796 if (vd->vdev_path) 797 spa_strfree(vd->vdev_path); 798 if (vd->vdev_devid) 799 spa_strfree(vd->vdev_devid); 800 if (vd->vdev_physpath) 801 spa_strfree(vd->vdev_physpath); 802 if (vd->vdev_fru) 803 spa_strfree(vd->vdev_fru); 804 805 if (vd->vdev_isspare) 806 spa_spare_remove(vd); 807 if (vd->vdev_isl2cache) 808 spa_l2cache_remove(vd); 809 810 txg_list_destroy(&vd->vdev_ms_list); 811 txg_list_destroy(&vd->vdev_dtl_list); 812 813 mutex_enter(&vd->vdev_dtl_lock); 814 space_map_close(vd->vdev_dtl_sm); 815 for (int t = 0; t < DTL_TYPES; t++) { 816 range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); 817 range_tree_destroy(vd->vdev_dtl[t]); 818 } 819 mutex_exit(&vd->vdev_dtl_lock); 820 821 EQUIV(vd->vdev_indirect_births != NULL, 822 vd->vdev_indirect_mapping != NULL); 823 if (vd->vdev_indirect_births != NULL) { 824 vdev_indirect_mapping_close(vd->vdev_indirect_mapping); 825 vdev_indirect_births_close(vd->vdev_indirect_births); 826 } 827 828 if (vd->vdev_obsolete_sm != NULL) { 829 ASSERT(vd->vdev_removing || 830 vd->vdev_ops == &vdev_indirect_ops); 831 space_map_close(vd->vdev_obsolete_sm); 832 vd->vdev_obsolete_sm = NULL; 833 } 834 range_tree_destroy(vd->vdev_obsolete_segments); 835 rw_destroy(&vd->vdev_indirect_rwlock); 836 mutex_destroy(&vd->vdev_obsolete_lock); 837 838 mutex_destroy(&vd->vdev_queue_lock); 839 mutex_destroy(&vd->vdev_dtl_lock); 840 mutex_destroy(&vd->vdev_stat_lock); 841 mutex_destroy(&vd->vdev_probe_lock); 842 843 if (vd == spa->spa_root_vdev) 844 spa->spa_root_vdev = NULL; 845 846 kmem_free(vd, sizeof (vdev_t)); 847} 848 849/* 850 * Transfer top-level vdev state from svd to tvd. 
851 */ 852static void 853vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 854{ 855 spa_t *spa = svd->vdev_spa; 856 metaslab_t *msp; 857 vdev_t *vd; 858 int t; 859 860 ASSERT(tvd == tvd->vdev_top); 861 862 tvd->vdev_ms_array = svd->vdev_ms_array; 863 tvd->vdev_ms_shift = svd->vdev_ms_shift; 864 tvd->vdev_ms_count = svd->vdev_ms_count; 865 tvd->vdev_top_zap = svd->vdev_top_zap; 866 867 svd->vdev_ms_array = 0; 868 svd->vdev_ms_shift = 0; 869 svd->vdev_ms_count = 0; 870 svd->vdev_top_zap = 0; 871 872 if (tvd->vdev_mg) 873 ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); 874 tvd->vdev_mg = svd->vdev_mg; 875 tvd->vdev_ms = svd->vdev_ms; 876 877 svd->vdev_mg = NULL; 878 svd->vdev_ms = NULL; 879 880 if (tvd->vdev_mg != NULL) 881 tvd->vdev_mg->mg_vd = tvd; 882 883 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 884 tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 885 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 886 887 svd->vdev_stat.vs_alloc = 0; 888 svd->vdev_stat.vs_space = 0; 889 svd->vdev_stat.vs_dspace = 0; 890 891 for (t = 0; t < TXG_SIZE; t++) { 892 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 893 (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 894 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 895 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 896 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 897 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 898 } 899 900 if (list_link_active(&svd->vdev_config_dirty_node)) { 901 vdev_config_clean(svd); 902 vdev_config_dirty(tvd); 903 } 904 905 if (list_link_active(&svd->vdev_state_dirty_node)) { 906 vdev_state_clean(svd); 907 vdev_state_dirty(tvd); 908 } 909 910 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 911 svd->vdev_deflate_ratio = 0; 912 913 tvd->vdev_islog = svd->vdev_islog; 914 svd->vdev_islog = 0; 915} 916 917static void 918vdev_top_update(vdev_t *tvd, vdev_t *vd) 919{ 920 if (vd == NULL) 921 return; 922 923 vd->vdev_top = tvd; 924 925 for (int c = 0; c < vd->vdev_children; c++) 926 vdev_top_update(tvd, vd->vdev_child[c]); 927} 928 929/* 930 * Add a mirror/replacing vdev above an existing vdev. 931 */ 932vdev_t * 933vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 934{ 935 spa_t *spa = cvd->vdev_spa; 936 vdev_t *pvd = cvd->vdev_parent; 937 vdev_t *mvd; 938 939 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 940 941 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 942 943 mvd->vdev_asize = cvd->vdev_asize; 944 mvd->vdev_min_asize = cvd->vdev_min_asize; 945 mvd->vdev_max_asize = cvd->vdev_max_asize; 946 mvd->vdev_psize = cvd->vdev_psize; 947 mvd->vdev_ashift = cvd->vdev_ashift; 948 mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; 949 mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; 950 mvd->vdev_state = cvd->vdev_state; 951 mvd->vdev_crtxg = cvd->vdev_crtxg; 952 953 vdev_remove_child(pvd, cvd); 954 vdev_add_child(pvd, mvd); 955 cvd->vdev_id = mvd->vdev_children; 956 vdev_add_child(mvd, cvd); 957 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 958 959 if (mvd == mvd->vdev_top) 960 vdev_top_transfer(cvd, mvd); 961 962 return (mvd); 963} 964 965/* 966 * Remove a 1-way mirror/replacing vdev from the tree. 
967 */ 968void 969vdev_remove_parent(vdev_t *cvd) 970{ 971 vdev_t *mvd = cvd->vdev_parent; 972 vdev_t *pvd = mvd->vdev_parent; 973 974 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 975 976 ASSERT(mvd->vdev_children == 1); 977 ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 978 mvd->vdev_ops == &vdev_replacing_ops || 979 mvd->vdev_ops == &vdev_spare_ops); 980 cvd->vdev_ashift = mvd->vdev_ashift; 981 cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; 982 cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; 983 984 vdev_remove_child(mvd, cvd); 985 vdev_remove_child(pvd, mvd); 986 987 /* 988 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. 989 * Otherwise, we could have detached an offline device, and when we 990 * go to import the pool we'll think we have two top-level vdevs, 991 * instead of a different version of the same top-level vdev. 992 */ 993 if (mvd->vdev_top == mvd) { 994 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; 995 cvd->vdev_orig_guid = cvd->vdev_guid; 996 cvd->vdev_guid += guid_delta; 997 cvd->vdev_guid_sum += guid_delta; 998 } 999 cvd->vdev_id = mvd->vdev_id; 1000 vdev_add_child(pvd, cvd); 1001 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 1002 1003 if (cvd == cvd->vdev_top) 1004 vdev_top_transfer(mvd, cvd); 1005 1006 ASSERT(mvd->vdev_children == 0); 1007 vdev_free(mvd); 1008} 1009 1010int 1011vdev_metaslab_init(vdev_t *vd, uint64_t txg) 1012{ 1013 spa_t *spa = vd->vdev_spa; 1014 objset_t *mos = spa->spa_meta_objset; 1015 uint64_t m; 1016 uint64_t oldc = vd->vdev_ms_count; 1017 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 1018 metaslab_t **mspp; 1019 int error; 1020 1021 ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1022 1023 /* 1024 * This vdev is not being allocated from yet or is a hole. 1025 */ 1026 if (vd->vdev_ms_shift == 0) 1027 return (0); 1028 1029 ASSERT(!vd->vdev_ishole); 1030 1031 ASSERT(oldc <= newc); 1032 1033 mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 1034 1035 if (oldc != 0) { 1036 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); 1037 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 1038 } 1039 1040 vd->vdev_ms = mspp; 1041 vd->vdev_ms_count = newc; 1042 1043 for (m = oldc; m < newc; m++) { 1044 uint64_t object = 0; 1045 1046 /* 1047 * vdev_ms_array may be 0 if we are creating the "fake" 1048 * metaslabs for an indirect vdev for zdb's leak detection. 1049 * See zdb_leak_init(). 1050 */ 1051 if (txg == 0 && vd->vdev_ms_array != 0) { 1052 error = dmu_read(mos, vd->vdev_ms_array, 1053 m * sizeof (uint64_t), sizeof (uint64_t), &object, 1054 DMU_READ_PREFETCH);
1055 if (error != 0) { 1056 vdev_dbgmsg(vd, "unable to read the metaslab " 1057 "array [error=%d]", error);
1058 return (error);
1059 }
1060 } 1061 1062 error = metaslab_init(vd->vdev_mg, m, object, txg, 1063 &(vd->vdev_ms[m]));
1064 if (error != 0) { 1065 vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", 1066 error);
1067 return (error);
1068 }
1069 } 1070 1071 if (txg == 0) 1072 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); 1073 1074 /* 1075 * If the vdev is being removed we don't activate 1076 * the metaslabs since we want to ensure that no new 1077 * allocations are performed on this device. 1078 */ 1079 if (oldc == 0 && !vd->vdev_removing) 1080 metaslab_group_activate(vd->vdev_mg); 1081 1082 if (txg == 0) 1083 spa_config_exit(spa, SCL_ALLOC, FTAG); 1084 1085 return (0); 1086 } 1087 1088 void 1089 vdev_metaslab_fini(vdev_t *vd) 1090 { 1091 if (vd->vdev_ms != NULL) { 1092 uint64_t count = vd->vdev_ms_count; 1093 1094 metaslab_group_passivate(vd->vdev_mg); 1095 for (uint64_t m = 0; m < count; m++) { 1096 metaslab_t *msp = vd->vdev_ms[m]; 1097 1098 if (msp != NULL) 1099 metaslab_fini(msp); 1100 } 1101 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 1102 vd->vdev_ms = NULL; 1103 1104 vd->vdev_ms_count = 0; 1105 } 1106 ASSERT0(vd->vdev_ms_count); 1107 } 1108 1109 typedef struct vdev_probe_stats { 1110 boolean_t vps_readable; 1111 boolean_t vps_writeable; 1112 int vps_flags; 1113 } vdev_probe_stats_t; 1114 1115 static void 1116 vdev_probe_done(zio_t *zio) 1117 { 1118 spa_t *spa = zio->io_spa; 1119 vdev_t *vd = zio->io_vd; 1120 vdev_probe_stats_t *vps = zio->io_private; 1121 1122 ASSERT(vd->vdev_probe_zio != NULL); 1123 1124 if (zio->io_type == ZIO_TYPE_READ) { 1125 if (zio->io_error == 0) 1126 vps->vps_readable = 1; 1127 if (zio->io_error == 0 && spa_writeable(spa)) { 1128 zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, 1129 zio->io_offset, zio->io_size, zio->io_abd, 1130 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1131 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); 1132 } else { 1133 abd_free(zio->io_abd); 1134 } 1135 } else if (zio->io_type == ZIO_TYPE_WRITE) { 1136 if (zio->io_error == 0) 1137 vps->vps_writeable = 1; 1138 abd_free(zio->io_abd); 1139 } else if (zio->io_type == ZIO_TYPE_NULL) { 1140 zio_t *pio; 1141 1142 vd->vdev_cant_read |= !vps->vps_readable; 1143 vd->vdev_cant_write |= !vps->vps_writeable; 1144 1145 if (vdev_readable(vd) && 1146 (vdev_writeable(vd) || !spa_writeable(spa))) { 1147 zio->io_error = 0; 1148 } else { 1149 ASSERT(zio->io_error != 0);
1150 vdev_dbgmsg(vd, "failed probe");
1151 zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 1152 spa, vd, NULL, 0, 0); 1153 zio->io_error = SET_ERROR(ENXIO); 1154 } 1155 1156 mutex_enter(&vd->vdev_probe_lock); 1157 ASSERT(vd->vdev_probe_zio == zio); 1158 vd->vdev_probe_zio = NULL; 1159 mutex_exit(&vd->vdev_probe_lock); 1160 1161 zio_link_t *zl = NULL; 1162 while ((pio = zio_walk_parents(zio, &zl)) != NULL) 1163 if (!vdev_accessible(vd, pio)) 1164 pio->io_error = SET_ERROR(ENXIO); 1165 1166 kmem_free(vps, sizeof (*vps)); 1167 } 1168 } 1169 1170 /* 1171 * Determine whether this device is accessible. 1172 * 1173 * Read and write to several known locations: the pad regions of each 1174 * vdev label but the first, which we leave alone in case it contains 1175 * a VTOC. 1176 */ 1177 zio_t * 1178 vdev_probe(vdev_t *vd, zio_t *zio) 1179 { 1180 spa_t *spa = vd->vdev_spa; 1181 vdev_probe_stats_t *vps = NULL; 1182 zio_t *pio; 1183 1184 ASSERT(vd->vdev_ops->vdev_op_leaf); 1185 1186 /* 1187 * Don't probe the probe. 1188 */ 1189 if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) 1190 return (NULL); 1191 1192 /* 1193 * To prevent 'probe storms' when a device fails, we create 1194 * just one probe i/o at a time. All zios that want to probe 1195 * this vdev will become parents of the probe io. 1196 */ 1197 mutex_enter(&vd->vdev_probe_lock); 1198 1199 if ((pio = vd->vdev_probe_zio) == NULL) { 1200 vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); 1201 1202 vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | 1203 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | 1204 ZIO_FLAG_TRYHARD; 1205 1206 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { 1207 /* 1208 * vdev_cant_read and vdev_cant_write can only 1209 * transition from TRUE to FALSE when we have the 1210 * SCL_ZIO lock as writer; otherwise they can only 1211 * transition from FALSE to TRUE. This ensures that 1212 * any zio looking at these values can assume that 1213 * failures persist for the life of the I/O. That's 1214 * important because when a device has intermittent 1215 * connectivity problems, we want to ensure that 1216 * they're ascribed to the device (ENXIO) and not 1217 * the zio (EIO). 1218 * 1219 * Since we hold SCL_ZIO as writer here, clear both 1220 * values so the probe can reevaluate from first 1221 * principles. 1222 */ 1223 vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; 1224 vd->vdev_cant_read = B_FALSE; 1225 vd->vdev_cant_write = B_FALSE; 1226 } 1227 1228 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, 1229 vdev_probe_done, vps, 1230 vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); 1231 1232 /* 1233 * We can't change the vdev state in this context, so we 1234 * kick off an async task to do it on our behalf.
1235 */ 1236 if (zio != NULL) { 1237 vd->vdev_probe_wanted = B_TRUE; 1238 spa_async_request(spa, SPA_ASYNC_PROBE); 1239 } 1240 } 1241 1242 if (zio != NULL) 1243 zio_add_child(zio, pio); 1244 1245 mutex_exit(&vd->vdev_probe_lock); 1246 1247 if (vps == NULL) { 1248 ASSERT(zio != NULL); 1249 return (NULL); 1250 } 1251 1252 for (int l = 1; l < VDEV_LABELS; l++) { 1253 zio_nowait(zio_read_phys(pio, vd, 1254 vdev_label_offset(vd->vdev_psize, l, 1255 offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, 1256 abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), 1257 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1258 ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); 1259 } 1260 1261 if (zio == NULL) 1262 return (pio); 1263 1264 zio_nowait(pio); 1265 return (NULL); 1266 } 1267 1268 static void 1269 vdev_open_child(void *arg) 1270 { 1271 vdev_t *vd = arg; 1272 1273 vd->vdev_open_thread = curthread; 1274 vd->vdev_open_error = vdev_open(vd); 1275 vd->vdev_open_thread = NULL; 1276 } 1277 1278 boolean_t 1279 vdev_uses_zvols(vdev_t *vd) 1280 { 1281 if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, 1282 strlen(ZVOL_DIR)) == 0) 1283 return (B_TRUE); 1284 for (int c = 0; c < vd->vdev_children; c++) 1285 if (vdev_uses_zvols(vd->vdev_child[c])) 1286 return (B_TRUE); 1287 return (B_FALSE); 1288 } 1289 1290 void 1291 vdev_open_children(vdev_t *vd) 1292 { 1293 taskq_t *tq; 1294 int children = vd->vdev_children; 1295 1296 /* 1297 * In order to handle pools on top of zvols, do the opens 1298 * in a single thread so that the same thread holds the 1299 * spa_namespace_lock. 1300 */ 1301 if (B_TRUE || vdev_uses_zvols(vd)) { 1302 for (int c = 0; c < children; c++) 1303 vd->vdev_child[c]->vdev_open_error = 1304 vdev_open(vd->vdev_child[c]); 1305 return; 1306 } 1307 tq = taskq_create("vdev_open", children, minclsyspri, 1308 children, children, TASKQ_PREPOPULATE); 1309 1310 for (int c = 0; c < children; c++) 1311 VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], 1312 TQ_SLEEP) != 0); 1313 1314 taskq_destroy(tq); 1315 } 1316 1317 /* 1318 * Compute the raidz-deflation ratio. Note, we hard-code 1319 * in 128k (1 << 17) because it is the "typical" blocksize. 1320 * Even though SPA_MAXBLOCKSIZE changed, this algorithm cannot change, 1321 * otherwise it would inconsistently account for existing bp's. 1322 */ 1323 static void 1324 vdev_set_deflate_ratio(vdev_t *vd) 1325 { 1326 if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { 1327 vd->vdev_deflate_ratio = (1 << 17) / 1328 (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); 1329 } 1330 } 1331 1332 /* 1333 * Prepare a virtual device for access. 1334 */ 1335 int 1336 vdev_open(vdev_t *vd) 1337 { 1338 spa_t *spa = vd->vdev_spa; 1339 int error; 1340 uint64_t osize = 0; 1341 uint64_t max_osize = 0; 1342 uint64_t asize, max_asize, psize; 1343 uint64_t logical_ashift = 0; 1344 uint64_t physical_ashift = 0; 1345 1346 ASSERT(vd->vdev_open_thread == curthread || 1347 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1348 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 1349 vd->vdev_state == VDEV_STATE_CANT_OPEN || 1350 vd->vdev_state == VDEV_STATE_OFFLINE); 1351 1352 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1353 vd->vdev_cant_read = B_FALSE; 1354 vd->vdev_cant_write = B_FALSE; 1355 vd->vdev_notrim = B_FALSE; 1356 vd->vdev_min_asize = vdev_get_min_asize(vd); 1357 1358 /* 1359 * If this vdev is not removed, check its fault status. If it's 1360 * faulted, bail out of the open.
1361 */ 1362 if (!vd->vdev_removed && vd->vdev_faulted) { 1363 ASSERT(vd->vdev_children == 0); 1364 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1365 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1366 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1367 vd->vdev_label_aux); 1368 return (SET_ERROR(ENXIO)); 1369 } else if (vd->vdev_offline) { 1370 ASSERT(vd->vdev_children == 0); 1371 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 1372 return (SET_ERROR(ENXIO)); 1373 } 1374 1375 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, 1376 &logical_ashift, &physical_ashift); 1377 1378 /* 1379 * Reset the vdev_reopening flag so that we actually close 1380 * the vdev on error. 1381 */ 1382 vd->vdev_reopening = B_FALSE; 1383 if (zio_injection_enabled && error == 0) 1384 error = zio_handle_device_injection(vd, NULL, ENXIO); 1385 1386 if (error) { 1387 if (vd->vdev_removed && 1388 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 1389 vd->vdev_removed = B_FALSE; 1390 1391 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1392 vd->vdev_stat.vs_aux); 1393 return (error); 1394 } 1395 1396 vd->vdev_removed = B_FALSE; 1397 1398 /* 1399 * Recheck the faulted flag now that we have confirmed that 1400 * the vdev is accessible. If we're faulted, bail. 1401 */ 1402 if (vd->vdev_faulted) { 1403 ASSERT(vd->vdev_children == 0); 1404 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1405 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1406 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1407 vd->vdev_label_aux); 1408 return (SET_ERROR(ENXIO)); 1409 } 1410 1411 if (vd->vdev_degraded) { 1412 ASSERT(vd->vdev_children == 0); 1413 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1414 VDEV_AUX_ERR_EXCEEDED); 1415 } else { 1416 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); 1417 } 1418 1419 /* 1420 * For hole or missing vdevs we just return success. 1421 */ 1422 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) 1423 return (0); 1424 1425 if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf) 1426 trim_map_create(vd); 1427 1428 for (int c = 0; c < vd->vdev_children; c++) { 1429 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 1430 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1431 VDEV_AUX_NONE); 1432 break; 1433 } 1434 } 1435 1436 osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 1437 max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); 1438 1439 if (vd->vdev_children == 0) { 1440 if (osize < SPA_MINDEVSIZE) { 1441 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1442 VDEV_AUX_TOO_SMALL); 1443 return (SET_ERROR(EOVERFLOW)); 1444 } 1445 psize = osize; 1446 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 1447 max_asize = max_osize - (VDEV_LABEL_START_SIZE + 1448 VDEV_LABEL_END_SIZE); 1449 } else { 1450 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 1451 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 1452 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1453 VDEV_AUX_TOO_SMALL); 1454 return (SET_ERROR(EOVERFLOW)); 1455 } 1456 psize = 0; 1457 asize = osize; 1458 max_asize = max_osize; 1459 } 1460 1461 vd->vdev_psize = psize; 1462 1463 /* 1464 * Make sure the allocatable size hasn't shrunk too much.
1465 */ 1466 if (asize < vd->vdev_min_asize) { 1467 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1468 VDEV_AUX_BAD_LABEL); 1469 return (SET_ERROR(EINVAL)); 1470 } 1471 1472 vd->vdev_physical_ashift = 1473 MAX(physical_ashift, vd->vdev_physical_ashift); 1474 vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); 1475 vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); 1476 1477 if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { 1478 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1479 VDEV_AUX_ASHIFT_TOO_BIG); 1480 return (EINVAL); 1481 } 1482 1483 if (vd->vdev_asize == 0) { 1484 /* 1485 * This is the first-ever open, so use the computed values. 1486 * For testing purposes, a higher ashift can be requested. 1487 */ 1488 vd->vdev_asize = asize; 1489 vd->vdev_max_asize = max_asize; 1490 } else { 1491 /* 1492 * Make sure the alignment requirement hasn't increased. 1493 */ 1494 if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && 1495 vd->vdev_ops->vdev_op_leaf) { 1496 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1497 VDEV_AUX_BAD_LABEL); 1498 return (EINVAL); 1499 } 1500 vd->vdev_max_asize = max_asize; 1501 } 1502 1503 /* 1504 * If all children are healthy we update asize if either: 1505 * The asize has increased, due to a device expansion caused by dynamic 1506 * LUN growth or vdev replacement, and automatic expansion is enabled; 1507 * making the additional space available. 1508 * 1509 * The asize has decreased, due to a device shrink usually caused by a 1510 * vdev replace with a smaller device. This ensures that calculations 1511 * based on max_asize and asize, e.g. esize, are always valid. It's safe 1512 * to do this as we've already validated that asize is greater than 1513 * vdev_min_asize. 1514 */ 1515 if (vd->vdev_state == VDEV_STATE_HEALTHY && 1516 ((asize > vd->vdev_asize && 1517 (vd->vdev_expanding || spa->spa_autoexpand)) || 1518 (asize < vd->vdev_asize))) 1519 vd->vdev_asize = asize; 1520 1521 vdev_set_min_asize(vd); 1522 1523 /* 1524 * Ensure we can issue some I/O before declaring the 1525 * vdev open for business. 1526 */ 1527 if (vd->vdev_ops->vdev_op_leaf && 1528 (error = zio_wait(vdev_probe(vd, NULL))) != 0) { 1529 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1530 VDEV_AUX_ERR_EXCEEDED); 1531 return (error); 1532 } 1533 1542 /* 1543 * Track the min and max ashift values for normal data devices. 1544 */ 1545 if (vd->vdev_top == vd && vd->vdev_ashift != 0 && 1546 !vd->vdev_islog && vd->vdev_aux == NULL) { 1547 if (vd->vdev_ashift > spa->spa_max_ashift) 1548 spa->spa_max_ashift = vd->vdev_ashift; 1549 if (vd->vdev_ashift < spa->spa_min_ashift) 1550 spa->spa_min_ashift = vd->vdev_ashift; 1551 } 1552 1553 /* 1554 * If a leaf vdev has a DTL, and seems healthy, then kick off a 1555 * resilver. But don't do this if we are doing a reopen for a scrub, 1556 * since this would just restart the scrub we are already doing. 1557 */ 1558 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && 1559 vdev_resilver_needed(vd, NULL, NULL)) 1560 spa_async_request(spa, SPA_ASYNC_RESILVER); 1561 1562 return (0); 1563 } 1564 1565 /* 1566 * Called once the vdevs are all opened, this routine validates the label 1567 * contents.
This needs to be done before vdev_load() so that we don't 1568 * inadvertently do repair I/Os to the wrong device. 1569 * 1570 * If 'strict' is false, ignore the spa guid check. This is necessary because 1571 * if the machine crashed during a re-guid the new guid might have been written 1572 * to all of the vdev labels, but not the cached config. The strict check 1573 * will be performed when the pool is opened again using the mos config. 1574 * 1575 * This function will only return failure if one of the vdevs indicates that it 1576 * has since been destroyed or exported. This is only possible if 1577 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 1578 * will be updated but the function will return 0. 1579 */ 1580 int 1581 vdev_validate(vdev_t *vd, boolean_t strict) 1582 { 1583 spa_t *spa = vd->vdev_spa; 1584 nvlist_t *label; 1585 uint64_t guid = 0, top_guid; 1586 uint64_t state; 1587 1588 for (int c = 0; c < vd->vdev_children; c++) 1589 if (vdev_validate(vd->vdev_child[c], strict) != 0) 1590 return (SET_ERROR(EBADF)); 1591 1592 /* 1593 * If the device has already failed, or was marked offline, don't do 1594 * any further validation. Otherwise, label I/O will fail and we will 1595 * overwrite the previous state. 1596 */ 1597 if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { 1598 uint64_t aux_guid = 0; 1599 nvlist_t *nvl; 1600 uint64_t txg = spa_last_synced_txg(spa) != 0 ? 1601 spa_last_synced_txg(spa) : -1ULL; 1602 1603 if ((label = vdev_label_read_config(vd, txg)) == NULL) { 1604 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1605 VDEV_AUX_BAD_LABEL);
1606 vdev_dbgmsg(vd, "vdev_validate: failed reading config");
1607 return (0); 1608 } 1609 1610 /* 1611 * Determine if this vdev has been split off into another 1612 * pool. If so, then refuse to open it. 1613 */ 1614 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, 1615 &aux_guid) == 0 && aux_guid == spa_guid(spa)) { 1616 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1617 VDEV_AUX_SPLIT_POOL); 1618 nvlist_free(label);
1619 vdev_dbgmsg(vd, "vdev_validate: vdev split into other " 1620 "pool");
1621 return (0); 1622 } 1623 1624 if (strict && (nvlist_lookup_uint64(label, 1625 ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || 1626 guid != spa_guid(spa))) { 1627 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1628 VDEV_AUX_CORRUPT_DATA); 1629 nvlist_free(label);
1630 vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid " 1631 "doesn't match config (%llu != %llu)", 1632 (u_longlong_t)guid, 1633 (u_longlong_t)spa_guid(spa));
1634 return (0); 1635 } 1636 1637 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) 1638 != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, 1639 &aux_guid) != 0) 1640 aux_guid = 0; 1641 1642 /* 1643 * If this vdev just became a top-level vdev because its 1644 * sibling was detached, it will have adopted the parent's 1645 * vdev guid -- but the label may or may not be on disk yet. 1646 * Fortunately, either version of the label will have the 1647 * same top guid, so if we're a top-level vdev, we can 1648 * safely compare to that instead. 1649 * 1650 * If we split this vdev off instead, then we also check the 1651 * original pool's guid. We don't want to consider the vdev 1652 * corrupt if it is partway through a split operation. 1653 */ 1654 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, 1655 &guid) != 0 || 1656 nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, 1657 &top_guid) != 0 || 1658 ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && 1659 (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { 1660 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1661 VDEV_AUX_CORRUPT_DATA); 1662 nvlist_free(label);
1663 vdev_dbgmsg(vd, "vdev_validate: config guid doesn't " 1664 "match label guid (%llu != %llu)", 1665 (u_longlong_t)vd->vdev_guid, (u_longlong_t)guid);
1666 return (0); 1667 } 1668 1669 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 1670 &state) != 0) { 1671 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1672 VDEV_AUX_CORRUPT_DATA); 1673 nvlist_free(label);
1674 vdev_dbgmsg(vd, "vdev_validate: '%s' missing", 1675 ZPOOL_CONFIG_POOL_STATE);
1676 return (0); 1677 } 1678 1679 nvlist_free(label); 1680 1681 /* 1682 * If this is a verbatim import, no need to check the 1683 * state of the pool. 1684 */ 1685 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && 1686 spa_load_state(spa) == SPA_LOAD_OPEN &&
1687 state != POOL_STATE_ACTIVE) { 1688 vdev_dbgmsg(vd, "vdev_validate: invalid pool state " 1689 "(%llu) for spa %s", (u_longlong_t)state, 1690 spa->spa_name);
1691 return (SET_ERROR(EBADF));
1692 }
1650 1651 /* 1652 * If we were able to open and validate a vdev that was 1653 * previously marked permanently unavailable, clear that state 1654 * now. 1655 */ 1656 if (vd->vdev_not_present) 1657 vd->vdev_not_present = 0; 1658 } 1659 1660 return (0); 1661} 1662 1663/* 1664 * Close a virtual device. 1665 */ 1666void 1667vdev_close(vdev_t *vd) 1668{ 1669 spa_t *spa = vd->vdev_spa; 1670 vdev_t *pvd = vd->vdev_parent; 1671 1672 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1673 1674 /* 1675 * If our parent is reopening, then we are as well, unless we are 1676 * going offline. 1677 */ 1678 if (pvd != NULL && pvd->vdev_reopening) 1679 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); 1680 1681 vd->vdev_ops->vdev_op_close(vd); 1682 1683 vdev_cache_purge(vd); 1684 1685 if (vd->vdev_ops->vdev_op_leaf) 1686 trim_map_destroy(vd); 1687 1688 /* 1689 * We record the previous state before we close it, so that if we are 1690 * doing a reopen(), we don't generate FMA ereports if we notice that 1691 * it's still faulted. 1692 */ 1693 vd->vdev_prevstate = vd->vdev_state; 1694 1695 if (vd->vdev_offline) 1696 vd->vdev_state = VDEV_STATE_OFFLINE; 1697 else 1698 vd->vdev_state = VDEV_STATE_CLOSED; 1699 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1700} 1701 1702void 1703vdev_hold(vdev_t *vd) 1704{ 1705 spa_t *spa = vd->vdev_spa; 1706 1707 ASSERT(spa_is_root(spa)); 1708 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 1709 return; 1710 1711 for (int c = 0; c < vd->vdev_children; c++) 1712 vdev_hold(vd->vdev_child[c]); 1713 1714 if (vd->vdev_ops->vdev_op_leaf) 1715 vd->vdev_ops->vdev_op_hold(vd); 1716} 1717 1718void 1719vdev_rele(vdev_t *vd) 1720{ 1721 spa_t *spa = vd->vdev_spa; 1722 1723 ASSERT(spa_is_root(spa)); 1724 for (int c = 0; c < vd->vdev_children; c++) 1725 vdev_rele(vd->vdev_child[c]); 1726 1727 if (vd->vdev_ops->vdev_op_leaf) 1728 vd->vdev_ops->vdev_op_rele(vd); 1729} 1730 1731/* 1732 * Reopen all interior vdevs and any unopened leaves. We don't actually 1733 * reopen leaf vdevs which had previously been opened as they might deadlock 1734 * on the spa_config_lock. Instead we only obtain the leaf's physical size. 1735 * If the leaf has never been opened then open it, as usual. 1736 */ 1737void 1738vdev_reopen(vdev_t *vd) 1739{ 1740 spa_t *spa = vd->vdev_spa; 1741 1742 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1743 1744 /* set the reopening flag unless we're taking the vdev offline */ 1745 vd->vdev_reopening = !vd->vdev_offline; 1746 vdev_close(vd); 1747 (void) vdev_open(vd); 1748 1749 /* 1750 * Call vdev_validate() here to make sure we have the same device. 1751 * Otherwise, a device with an invalid label could be successfully 1752 * opened in response to vdev_reopen(). 1753 */ 1754 if (vd->vdev_aux) { 1755 (void) vdev_validate_aux(vd); 1756 if (vdev_readable(vd) && vdev_writeable(vd) && 1757 vd->vdev_aux == &spa->spa_l2cache && 1758 !l2arc_vdev_present(vd)) 1759 l2arc_add_vdev(spa, vd); 1760 } else { 1761 (void) vdev_validate(vd, B_TRUE); 1762 } 1763 1764 /* 1765 * Reassess parent vdev's health. 1766 */ 1767 vdev_propagate_state(vd); 1768} 1769 1770int 1771vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) 1772{ 1773 int error; 1774 1775 /* 1776 * Normally, partial opens (e.g. of a mirror) are allowed. 1777 * For a create, however, we want to fail the request if 1778 * there are any components we can't open. 
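 * (For example, a "zpool create" that references an unopenable disk
 * should fail the whole create rather than produce a pool that starts
 * out degraded.)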
1779 */ 1780 error = vdev_open(vd); 1781 1782 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { 1783 vdev_close(vd); 1784 return (error ? error : ENXIO); 1785 } 1786 1787 /* 1788 * Recursively load DTLs and initialize all labels. 1789 */ 1790 if ((error = vdev_dtl_load(vd)) != 0 || 1791 (error = vdev_label_init(vd, txg, isreplacing ? 1792 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1793 vdev_close(vd); 1794 return (error); 1795 } 1796 1797 return (0); 1798} 1799 1800void 1801vdev_metaslab_set_size(vdev_t *vd) 1802{ 1803 /* 1804 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. 1805 */ 1806 vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); 1807 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1808} 1809 1810/* 1811 * Maximize performance by inflating the configured ashift for top level 1812 * vdevs to be as close to the physical ashift as possible while maintaining 1813 * administrator defined limits and ensuring it doesn't go below the 1814 * logical ashift. 1815 */ 1816void 1817vdev_ashift_optimize(vdev_t *vd) 1818{ 1819 if (vd == vd->vdev_top) { 1820 if (vd->vdev_ashift < vd->vdev_physical_ashift) { 1821 vd->vdev_ashift = MIN( 1822 MAX(zfs_max_auto_ashift, vd->vdev_ashift), 1823 MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); 1824 } else { 1825 /* 1826 * Unusual case where logical ashift > physical ashift 1827 * so we can't cap the calculated ashift based on max 1828 * ashift as that would cause failures. 1829 * We still check if we need to increase it to match 1830 * the min ashift. 1831 */ 1832 vd->vdev_ashift = MAX(zfs_min_auto_ashift, 1833 vd->vdev_ashift); 1834 } 1835 } 1836} 1837 1838void 1839vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) 1840{ 1841 ASSERT(vd == vd->vdev_top); 1842 /* indirect vdevs don't have metaslabs or dtls */ 1843 ASSERT(vdev_is_concrete(vd) || flags == 0); 1844 ASSERT(ISP2(flags)); 1845 ASSERT(spa_writeable(vd->vdev_spa)); 1846 1847 if (flags & VDD_METASLAB) 1848 (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 1849 1850 if (flags & VDD_DTL) 1851 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 1852 1853 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 1854} 1855 1856void 1857vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) 1858{ 1859 for (int c = 0; c < vd->vdev_children; c++) 1860 vdev_dirty_leaves(vd->vdev_child[c], flags, txg); 1861 1862 if (vd->vdev_ops->vdev_op_leaf) 1863 vdev_dirty(vd->vdev_top, flags, vd, txg); 1864} 1865 1866/* 1867 * DTLs. 1868 * 1869 * A vdev's DTL (dirty time log) is the set of transaction groups for which 1870 * the vdev has less than perfect replication. There are four kinds of DTL: 1871 * 1872 * DTL_MISSING: txgs for which the vdev has no valid copies of the data 1873 * 1874 * DTL_PARTIAL: txgs for which data is available, but not fully replicated 1875 * 1876 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon 1877 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of 1878 * txgs that was scrubbed. 1879 * 1880 * DTL_OUTAGE: txgs which cannot currently be read, whether due to 1881 * persistent errors or just some device being offline. 1882 * Unlike the other three, the DTL_OUTAGE map is not generally 1883 * maintained; it's only computed when needed, typically to 1884 * determine whether a device can be detached. 1885 * 1886 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device 1887 * either has the data or it doesn't. 
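 *
 * For example, if a leaf device was unreachable while txgs 100-150 were
 * being written, its DTL_MISSING (and hence DTL_PARTIAL) contains the
 * single range [100, 151), and a subsequent resilver has to repair
 * exactly those txgs.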
1888 * 1889 * For interior vdevs such as mirror and RAID-Z the picture is more complex. 1890 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because 1891 * if any child is less than fully replicated, then so is its parent. 1892 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, 1893 * comprising only those txgs which appear in more than 'maxfaults' children; 1894 * those are the txgs we don't have enough replication to read. For example, 1895 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); 1896 * thus, its DTL_MISSING consists of the set of txgs that appear in more than 1897 * two child DTL_MISSING maps. 1898 * 1899 * It should be clear from the above that to compute the DTLs and outage maps 1900 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. 1901 * Therefore, that is all we keep on disk. When loading the pool, or after 1902 * a configuration change, we generate all other DTLs from first principles. 1903 */ 1904 void 1905 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1906 { 1907 range_tree_t *rt = vd->vdev_dtl[t]; 1908 1909 ASSERT(t < DTL_TYPES); 1910 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1911 ASSERT(spa_writeable(vd->vdev_spa)); 1912 1913 mutex_enter(&vd->vdev_dtl_lock); 1914 if (!range_tree_contains(rt, txg, size)) 1915 range_tree_add(rt, txg, size); 1916 mutex_exit(&vd->vdev_dtl_lock); 1917 } 1918 1919 boolean_t 1920 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1921 { 1922 range_tree_t *rt = vd->vdev_dtl[t]; 1923 boolean_t dirty = B_FALSE; 1924 1925 ASSERT(t < DTL_TYPES); 1926 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1927 1928 /* 1929 * While we are loading the pool, the DTLs have not been loaded yet. 1930 * Ignore the DTLs and try all devices. This avoids a recursive 1931 * mutex enter on the vdev_dtl_lock, and also makes us try hard 1932 * when loading the pool (relying on the checksum to ensure that 1933 * we get the right data -- note that while loading, we are 1934 * only reading the MOS, which is always checksummed). 1935 */ 1936 if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) 1937 return (B_FALSE); 1938 1939 mutex_enter(&vd->vdev_dtl_lock); 1940 if (range_tree_space(rt) != 0) 1941 dirty = range_tree_contains(rt, txg, size); 1942 mutex_exit(&vd->vdev_dtl_lock); 1943 1944 return (dirty); 1945 } 1946 1947 boolean_t 1948 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) 1949 { 1950 range_tree_t *rt = vd->vdev_dtl[t]; 1951 boolean_t empty; 1952 1953 mutex_enter(&vd->vdev_dtl_lock); 1954 empty = (range_tree_space(rt) == 0); 1955 mutex_exit(&vd->vdev_dtl_lock); 1956 1957 return (empty); 1958 } 1959 1960 /* 1961 * Returns the lowest txg in the DTL range. 1962 */ 1963 static uint64_t 1964 vdev_dtl_min(vdev_t *vd) 1965 { 1966 range_seg_t *rs; 1967 1968 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 1969 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 1970 ASSERT0(vd->vdev_children); 1971 1972 rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); 1973 return (rs->rs_start - 1); 1974 } 1975 1976 /* 1977 * Returns the highest txg in the DTL.
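 * Note the asymmetry with vdev_dtl_min() above: for a DTL_MISSING of
 * [100, 151), vdev_dtl_min() returns 99 (rs_start - 1) while
 * vdev_dtl_max() returns 151 (rs_end), which fits the half-open
 * (scn_min_txg, scn_max_txg] ranges used by the scan code below.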
1978 */ 1979 static uint64_t 1980 vdev_dtl_max(vdev_t *vd) 1981 { 1982 range_seg_t *rs; 1983 1984 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 1985 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 1986 ASSERT0(vd->vdev_children); 1987 1988 rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); 1989 return (rs->rs_end); 1990 } 1991 1992 /* 1993 * Determine if a resilvering vdev should remove any DTL entries from 1994 * its range. If the vdev was resilvering for the entire duration of the 1995 * scan then it should excise that range from its DTLs. Otherwise, this 1996 * vdev is considered partially resilvered and should leave its DTL 1997 * entries intact. The comment in vdev_dtl_reassess() describes how we 1998 * excise the DTLs. 1999 */ 2000 static boolean_t 2001 vdev_dtl_should_excise(vdev_t *vd) 2002 { 2003 spa_t *spa = vd->vdev_spa; 2004 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 2005 2006 ASSERT0(scn->scn_phys.scn_errors); 2007 ASSERT0(vd->vdev_children); 2008 2009 if (vd->vdev_state < VDEV_STATE_DEGRADED) 2010 return (B_FALSE); 2011 2012 if (vd->vdev_resilver_txg == 0 || 2013 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) 2014 return (B_TRUE); 2015 2016 /* 2017 * When a resilver is initiated the scan will assign the scn_max_txg 2018 * value to the highest txg value that exists across all DTLs. If this 2019 * device's max DTL is not part of this scan (i.e. it is not in 2020 * the range (scn_min_txg, scn_max_txg]) then it is not eligible 2021 * for excision. 2022 */ 2023 if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { 2024 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); 2025 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); 2026 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); 2027 return (B_TRUE); 2028 } 2029 return (B_FALSE); 2030 } 2031 2032 /* 2033 * Reassess DTLs after a config change or scrub completion. 2034 */ 2035 void 2036 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 2037 { 2038 spa_t *spa = vd->vdev_spa; 2039 avl_tree_t reftree; 2040 int minref; 2041 2042 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 2043 2044 for (int c = 0; c < vd->vdev_children; c++) 2045 vdev_dtl_reassess(vd->vdev_child[c], txg, 2046 scrub_txg, scrub_done); 2047 2048 if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) 2049 return; 2050 2051 if (vd->vdev_ops->vdev_op_leaf) { 2052 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 2053 2054 mutex_enter(&vd->vdev_dtl_lock); 2055 2056 /* 2057 * If we've completed a scan cleanly then determine 2058 * if this vdev should remove any DTLs. We only want to 2059 * excise regions on vdevs that were available during 2060 * the entire duration of this scan. 2061 */ 2062 if (scrub_txg != 0 && 2063 (spa->spa_scrub_started || 2064 (scn != NULL && scn->scn_phys.scn_errors == 0)) && 2065 vdev_dtl_should_excise(vd)) { 2066 /* 2067 * We completed a scrub up to scrub_txg. If we 2068 * did it without rebooting, then the scrub dtl 2069 * will be valid, so excise the old region and 2070 * fold in the scrub dtl. Otherwise, leave the 2071 * dtl as-is if there was an error. 2072 * 2073 * There's a little trick here: to excise the beginning 2074 * of the DTL_MISSING map, we put it into a reference 2075 * tree and then add a segment with refcnt -1 that 2076 * covers the range [0, scrub_txg). This means 2077 * that each txg in that range has refcnt -1 or 0.
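 * (Concretely: with DTL_MISSING = [100, 300) and scrub_txg = 200, txgs
 * 0-99 end up with refcnt -1, txgs 100-199 with refcnt 0, and txgs
 * 200-299 with refcnt 1 at this point.)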
2078 * We then add DTL_SCRUB with a refcnt of 2, so that 2079 * entries in the range [0, scrub_txg) will have a 2080 * positive refcnt -- either 1 or 2. We then convert 2081 * the reference tree into the new DTL_MISSING map. 2082 */ 2083 space_reftree_create(&reftree); 2084 space_reftree_add_map(&reftree, 2085 vd->vdev_dtl[DTL_MISSING], 1); 2086 space_reftree_add_seg(&reftree, 0, scrub_txg, -1); 2087 space_reftree_add_map(&reftree, 2088 vd->vdev_dtl[DTL_SCRUB], 2); 2089 space_reftree_generate_map(&reftree, 2090 vd->vdev_dtl[DTL_MISSING], 1); 2091 space_reftree_destroy(&reftree); 2092 } 2093 range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); 2094 range_tree_walk(vd->vdev_dtl[DTL_MISSING], 2095 range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); 2096 if (scrub_done) 2097 range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); 2098 range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); 2099 if (!vdev_readable(vd)) 2100 range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); 2101 else 2102 range_tree_walk(vd->vdev_dtl[DTL_MISSING], 2103 range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); 2104 2105 /* 2106 * If the vdev was resilvering and no longer has any 2107 * DTLs then reset its resilvering flag and dirty 2108 * the top level so that we persist the change. 2109 */ 2110 if (vd->vdev_resilver_txg != 0 && 2111 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && 2112 range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) { 2113 vd->vdev_resilver_txg = 0; 2114 vdev_config_dirty(vd->vdev_top); 2115 } 2116 2117 mutex_exit(&vd->vdev_dtl_lock); 2118 2119 if (txg != 0) 2120 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 2121 return; 2122 } 2123 2124 mutex_enter(&vd->vdev_dtl_lock); 2125 for (int t = 0; t < DTL_TYPES; t++) { 2126 /* account for child's outage in parent's missing map */ 2127 int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; 2128 if (t == DTL_SCRUB) 2129 continue; /* leaf vdevs only */ 2130 if (t == DTL_PARTIAL) 2131 minref = 1; /* i.e. non-zero */ 2132 else if (vd->vdev_nparity != 0) 2133 minref = vd->vdev_nparity + 1; /* RAID-Z */ 2134 else 2135 minref = vd->vdev_children; /* any kind of mirror */ 2136 space_reftree_create(&reftree); 2137 for (int c = 0; c < vd->vdev_children; c++) { 2138 vdev_t *cvd = vd->vdev_child[c]; 2139 mutex_enter(&cvd->vdev_dtl_lock); 2140 space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); 2141 mutex_exit(&cvd->vdev_dtl_lock); 2142 } 2143 space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); 2144 space_reftree_destroy(&reftree); 2145 } 2146 mutex_exit(&vd->vdev_dtl_lock); 2147} 2148 2149int 2150vdev_dtl_load(vdev_t *vd) 2151{ 2152 spa_t *spa = vd->vdev_spa; 2153 objset_t *mos = spa->spa_meta_objset; 2154 int error = 0; 2155 2156 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { 2157 ASSERT(vdev_is_concrete(vd)); 2158 2159 error = space_map_open(&vd->vdev_dtl_sm, mos, 2160 vd->vdev_dtl_object, 0, -1ULL, 0); 2161 if (error) 2162 return (error); 2163 ASSERT(vd->vdev_dtl_sm != NULL); 2164 2165 mutex_enter(&vd->vdev_dtl_lock); 2166 2167 /* 2168 * Now that we've opened the space_map we need to update 2169 * the in-core DTL. 
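 * (space_map_update() refreshes the in-core allocated-space accounting
 * from what was last synced to disk, before space_map_load() below
 * replays the map's entries into the DTL_MISSING range tree.)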
2170 */ 2171 space_map_update(vd->vdev_dtl_sm); 2172 2173 error = space_map_load(vd->vdev_dtl_sm, 2174 vd->vdev_dtl[DTL_MISSING], SM_ALLOC); 2175 mutex_exit(&vd->vdev_dtl_lock); 2176 2177 return (error); 2178 } 2179 2180 for (int c = 0; c < vd->vdev_children; c++) { 2181 error = vdev_dtl_load(vd->vdev_child[c]); 2182 if (error != 0) 2183 break; 2184 } 2185 2186 return (error); 2187} 2188 2189void 2190vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) 2191{ 2192 spa_t *spa = vd->vdev_spa; 2193 2194 VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); 2195 VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 2196 zapobj, tx)); 2197} 2198 2199uint64_t 2200vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) 2201{ 2202 spa_t *spa = vd->vdev_spa; 2203 uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, 2204 DMU_OT_NONE, 0, tx); 2205 2206 ASSERT(zap != 0); 2207 VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 2208 zap, tx)); 2209 2210 return (zap); 2211} 2212 2213void 2214vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) 2215{ 2216 if (vd->vdev_ops != &vdev_hole_ops && 2217 vd->vdev_ops != &vdev_missing_ops && 2218 vd->vdev_ops != &vdev_root_ops && 2219 !vd->vdev_top->vdev_removing) { 2220 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { 2221 vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); 2222 } 2223 if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { 2224 vd->vdev_top_zap = vdev_create_link_zap(vd, tx); 2225 } 2226 } 2227 for (uint64_t i = 0; i < vd->vdev_children; i++) { 2228 vdev_construct_zaps(vd->vdev_child[i], tx); 2229 } 2230} 2231 2232void 2233vdev_dtl_sync(vdev_t *vd, uint64_t txg) 2234{ 2235 spa_t *spa = vd->vdev_spa; 2236 range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; 2237 objset_t *mos = spa->spa_meta_objset; 2238 range_tree_t *rtsync; 2239 dmu_tx_t *tx; 2240 uint64_t object = space_map_object(vd->vdev_dtl_sm); 2241 2242 ASSERT(vdev_is_concrete(vd)); 2243 ASSERT(vd->vdev_ops->vdev_op_leaf); 2244 2245 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2246 2247 if (vd->vdev_detached || vd->vdev_top->vdev_removing) { 2248 mutex_enter(&vd->vdev_dtl_lock); 2249 space_map_free(vd->vdev_dtl_sm, tx); 2250 space_map_close(vd->vdev_dtl_sm); 2251 vd->vdev_dtl_sm = NULL; 2252 mutex_exit(&vd->vdev_dtl_lock); 2253 2254 /* 2255 * We only destroy the leaf ZAP for detached leaves or for 2256 * removed log devices. Removed data devices handle leaf ZAP 2257 * cleanup later, once cancellation is no longer possible. 2258 */ 2259 if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || 2260 vd->vdev_top->vdev_islog)) { 2261 vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); 2262 vd->vdev_leaf_zap = 0; 2263 } 2264 2265 dmu_tx_commit(tx); 2266 return; 2267 } 2268 2269 if (vd->vdev_dtl_sm == NULL) { 2270 uint64_t new_object; 2271 2272 new_object = space_map_alloc(mos, tx); 2273 VERIFY3U(new_object, !=, 0); 2274 2275 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 2276 0, -1ULL, 0)); 2277 ASSERT(vd->vdev_dtl_sm != NULL); 2278 } 2279 2280 rtsync = range_tree_create(NULL, NULL); 2281 2282 mutex_enter(&vd->vdev_dtl_lock); 2283 range_tree_walk(rt, range_tree_add, rtsync); 2284 mutex_exit(&vd->vdev_dtl_lock); 2285 2286 space_map_truncate(vd->vdev_dtl_sm, tx); 2287 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); 2288 range_tree_vacate(rtsync, NULL, NULL); 2289 2290 range_tree_destroy(rtsync); 2291 2292 /* 2293 * If the object for the space map has changed then dirty 2294 * the top level so that we update the config. 
2295 */ 2296 if (object != space_map_object(vd->vdev_dtl_sm)) {
		/*
		 * If we were able to open and validate a vdev that was
		 * previously marked permanently unavailable, clear that state
		 * now.
		 */
		if (vd->vdev_not_present)
			vd->vdev_not_present = 0;
	}

	return (0);
}

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *pvd = vd->vdev_parent;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/*
	 * If our parent is reopening, then we are as well, unless we are
	 * going offline.
	 */
	if (pvd != NULL && pvd->vdev_reopening)
		vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);

	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	if (vd->vdev_ops->vdev_op_leaf)
		trim_map_destroy(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

void
vdev_hold(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
		return;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_hold(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_hold(vd);
}

void
vdev_rele(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_rele(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_rele(vd);
}

/*
 * Reopen all interior vdevs and any unopened leaves.  We don't actually
 * reopen leaf vdevs which had previously been opened as they might deadlock
 * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
 * If the leaf has never been opened then open it, as usual.
 */
void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/* set the reopening flag unless we're taking the vdev offline */
	vd->vdev_reopening = !vd->vdev_offline;
	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		(void) vdev_validate_aux(vd);
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    vd->vdev_aux == &spa->spa_l2cache &&
		    !l2arc_vdev_present(vd))
			l2arc_add_vdev(spa, vd);
	} else {
		(void) vdev_validate(vd, B_TRUE);
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively load DTLs and initialize all labels.
	 */
	if ((error = vdev_dtl_load(vd)) != 0 ||
	    (error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

void
vdev_metaslab_set_size(vdev_t *vd)
{
	/*
	 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
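	 * For example, a 1 TiB vdev gives asize / 200 of roughly 5.1 GiB;
	 * highbit64() rounds that up to the next power of two, so we end
	 * up with 8 GiB metaslabs (ms_shift = 33) and 128 of them --
	 * approximately, but never more than, metaslabs_per_vdev.  The
	 * MAX() below keeps small vdevs from using metaslabs smaller than
	 * SPA_MAXBLOCKSHIFT.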
	 */
	vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
}

/*
 * Maximize performance by inflating the configured ashift for top level
 * vdevs to be as close to the physical ashift as possible while maintaining
 * administrator defined limits and ensuring it doesn't go below the
 * logical ashift.
 */
void
vdev_ashift_optimize(vdev_t *vd)
{
	if (vd == vd->vdev_top) {
		if (vd->vdev_ashift < vd->vdev_physical_ashift) {
			vd->vdev_ashift = MIN(
			    MAX(zfs_max_auto_ashift, vd->vdev_ashift),
			    MAX(zfs_min_auto_ashift,
			    vd->vdev_physical_ashift));
		} else {
			/*
			 * Unusual case where logical ashift > physical ashift
			 * so we can't cap the calculated ashift based on max
			 * ashift as that would cause failures.
			 * We still check if we need to increase it to match
			 * the min ashift.
			 */
			vd->vdev_ashift = MAX(zfs_min_auto_ashift,
			    vd->vdev_ashift);
		}
	}
}

void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	/* indirect vdevs don't have metaslabs or dtls */
	ASSERT(vdev_is_concrete(vd) || flags == 0);
	ASSERT(ISP2(flags));
	ASSERT(spa_writeable(vd->vdev_spa));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

void
vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);

	if (vd->vdev_ops->vdev_op_leaf)
		vdev_dirty(vd->vdev_top, flags, vd, txg);
}

/*
 * DTLs.
 *
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication.  There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in more than 'maxfaults' children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
 */
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	range_tree_t *rt = vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
	ASSERT(spa_writeable(vd->vdev_spa));

	mutex_enter(&vd->vdev_dtl_lock);
	if (!range_tree_contains(rt, txg, size))
		range_tree_add(rt, txg, size);
	mutex_exit(&vd->vdev_dtl_lock);
}

boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	/*
	 * While we are loading the pool, the DTLs have not been loaded yet.
	 * Ignore the DTLs and try all devices.  This avoids a recursive
	 * mutex enter on the vdev_dtl_lock, and also makes us try hard
	 * when loading the pool (relying on the checksum to ensure that
	 * we get the right data -- note that while loading, we are only
	 * reading the MOS, which is always checksummed).
	 */
	if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
		return (B_FALSE);

	mutex_enter(&vd->vdev_dtl_lock);
	if (range_tree_space(rt) != 0)
		dirty = range_tree_contains(rt, txg, size);
	mutex_exit(&vd->vdev_dtl_lock);

	return (dirty);
}

boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(&vd->vdev_dtl_lock);
	empty = (range_tree_space(rt) == 0);
	mutex_exit(&vd->vdev_dtl_lock);

	return (empty);
}

/*
 * Returns the lowest txg in the DTL range.
 */
static uint64_t
vdev_dtl_min(vdev_t *vd)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	return (rs->rs_start - 1);
}

/*
 * Returns the highest txg in the DTL.
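 * Note the asymmetry with vdev_dtl_min(): DTL segments are half-open
 * [txg, txg + size) ranges, so rs_end is already one past the highest
 * missing txg, while vdev_dtl_min() subtracts one from rs_start.  The
 * two return values thus bracket the missing txgs exclusively on both
 * sides.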
 */
static uint64_t
vdev_dtl_max(vdev_t *vd)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	return (rs->rs_end);
}

/*
 * Determine if a resilvering vdev should remove any DTL entries from
 * its range.  If the vdev was resilvering for the entire duration of the
 * scan then it should excise that range from its DTLs.  Otherwise, this
 * vdev is considered partially resilvered and should leave its DTL
 * entries intact.  The comment in vdev_dtl_reassess() describes how we
 * excise the DTLs.
 */
static boolean_t
vdev_dtl_should_excise(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

	ASSERT0(scn->scn_phys.scn_errors);
	ASSERT0(vd->vdev_children);

	if (vd->vdev_state < VDEV_STATE_DEGRADED)
		return (B_FALSE);

	if (vd->vdev_resilver_txg == 0 ||
	    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
		return (B_TRUE);

	/*
	 * When a resilver is initiated the scan will assign the scn_max_txg
	 * value to the highest txg value that exists in all DTLs.  If this
	 * device's max DTL is not part of this scan (i.e. it is not in
	 * the range (scn_min_txg, scn_max_txg]) then it is not eligible
	 * for excision.
	 */
	if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
		ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
		ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
		ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dtl_reassess(vd->vdev_child[c], txg,
		    scrub_txg, scrub_done);

	if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

		mutex_enter(&vd->vdev_dtl_lock);

		/*
		 * If we've completed a scan cleanly then determine
		 * if this vdev should remove any DTLs.  We only want to
		 * excise regions on vdevs that were available during
		 * the entire duration of this scan.
		 */
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started ||
		    (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
		    vdev_dtl_should_excise(vd)) {
			/*
			 * We completed a scrub up to scrub_txg.  If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl.  Otherwise, leave the
			 * dtl as-is if there was an error.
			 *
			 * There's a little trick here: to excise the beginning
			 * of the DTL_MISSING map, we put it into a reference
			 * tree and then add a segment with refcnt -1 that
			 * covers the range [0, scrub_txg).  This means
			 * that each txg in that range has refcnt -1 or 0.
			 * We then add DTL_SCRUB with a refcnt of 2, so that
			 * entries in the range [0, scrub_txg) will have a
			 * positive refcnt -- either 1 or 2.  We then convert
			 * the reference tree into the new DTL_MISSING map.
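			 *
			 * For example, suppose scrub_txg = 100 and txg 37 is
			 * in DTL_MISSING.  If the scrub repaired it (so it is
			 * not in DTL_SCRUB), its refcnt is 1 - 1 = 0 and it
			 * is excised; if the scrub could not repair it (so it
			 * is also in DTL_SCRUB), its refcnt is 1 - 1 + 2 = 2
			 * and it survives.  Txgs at or above scrub_txg keep
			 * refcnt 1 and are carried over unchanged.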
			 */
			space_reftree_create(&reftree);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_SCRUB], 2);
			space_reftree_generate_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_destroy(&reftree);
		}
		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
		if (scrub_done)
			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
		if (!vdev_readable(vd))
			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
		else
			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);

		/*
		 * If the vdev was resilvering and no longer has any
		 * DTLs then reset its resilvering flag and dirty
		 * the top level so that we persist the change.
		 */
		if (vd->vdev_resilver_txg != 0 &&
		    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
		    range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
			vd->vdev_resilver_txg = 0;
			vdev_config_dirty(vd->vdev_top);
		}

		mutex_exit(&vd->vdev_dtl_lock);

		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		/* account for child's outage in parent's missing map */
		int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
		if (t == DTL_SCRUB)
			continue;			/* leaf vdevs only */
		if (t == DTL_PARTIAL)
			minref = 1;			/* i.e. non-zero */
		else if (vd->vdev_nparity != 0)
			minref = vd->vdev_nparity + 1;	/* RAID-Z */
		else
			minref = vd->vdev_children;	/* any kind of mirror */
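		/*
		 * With these minref values, a txg enters vd's combined map
		 * only when enough children report it: any one child for
		 * DTL_PARTIAL, every child for a mirror, or nparity + 1
		 * children for RAID-Z -- the point at which the data can
		 * no longer be reconstructed.
		 */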
		space_reftree_create(&reftree);
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			mutex_enter(&cvd->vdev_dtl_lock);
			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
			mutex_exit(&cvd->vdev_dtl_lock);
		}
		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
		space_reftree_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
}

int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	int error = 0;

	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
		ASSERT(vdev_is_concrete(vd));

		error = space_map_open(&vd->vdev_dtl_sm, mos,
		    vd->vdev_dtl_object, 0, -1ULL, 0);
		if (error)
			return (error);
		ASSERT(vd->vdev_dtl_sm != NULL);

		mutex_enter(&vd->vdev_dtl_lock);

		/*
		 * Now that we've opened the space_map we need to update
		 * the in-core DTL.
		 */
		space_map_update(vd->vdev_dtl_sm);

		error = space_map_load(vd->vdev_dtl_sm,
		    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
		mutex_exit(&vd->vdev_dtl_lock);

		return (error);
	}

	for (int c = 0; c < vd->vdev_children; c++) {
		error = vdev_dtl_load(vd->vdev_child[c]);
		if (error != 0)
			break;
	}

	return (error);
}

void
vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;

	VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
	VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
	    zapobj, tx));
}

uint64_t
vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
	    DMU_OT_NONE, 0, tx);

	ASSERT(zap != 0);
	VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
	    zap, tx));

	return (zap);
}

void
vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
{
	if (vd->vdev_ops != &vdev_hole_ops &&
	    vd->vdev_ops != &vdev_missing_ops &&
	    vd->vdev_ops != &vdev_root_ops &&
	    !vd->vdev_top->vdev_removing) {
		if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
			vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
		}
		if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
			vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
		}
	}
	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_construct_zaps(vd->vdev_child[i], tx);
	}
}

void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
	objset_t *mos = spa->spa_meta_objset;
	range_tree_t *rtsync;
	dmu_tx_t *tx;
	uint64_t object = space_map_object(vd->vdev_dtl_sm);

	ASSERT(vdev_is_concrete(vd));
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_free(vd->vdev_dtl_sm, tx);
		space_map_close(vd->vdev_dtl_sm);
		vd->vdev_dtl_sm = NULL;
		mutex_exit(&vd->vdev_dtl_lock);

		/*
		 * We only destroy the leaf ZAP for detached leaves or for
		 * removed log devices.  Removed data devices handle leaf ZAP
		 * cleanup later, once cancellation is no longer possible.
		 */
		if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
		    vd->vdev_top->vdev_islog)) {
			vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
			vd->vdev_leaf_zap = 0;
		}

		dmu_tx_commit(tx);
		return;
	}

	if (vd->vdev_dtl_sm == NULL) {
		uint64_t new_object;

		new_object = space_map_alloc(mos, tx);
		VERIFY3U(new_object, !=, 0);

		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
		    0, -1ULL, 0));
		ASSERT(vd->vdev_dtl_sm != NULL);
	}

	rtsync = range_tree_create(NULL, NULL);

	mutex_enter(&vd->vdev_dtl_lock);
	range_tree_walk(rt, range_tree_add, rtsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(vd->vdev_dtl_sm, tx);
	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
	range_tree_vacate(rtsync, NULL, NULL);

	range_tree_destroy(rtsync);

	/*
	 * If the object for the space map has changed then dirty
	 * the top level so that we update the config.
	 */
	if (object != space_map_object(vd->vdev_dtl_sm)) {
		vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
		    "new object %llu", (u_longlong_t)txg, spa_name(spa),
		    (u_longlong_t)object,
		    (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
		vdev_config_dirty(vd->vdev_top);
	}

	dmu_tx_commit(tx);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_update(vd->vdev_dtl_sm);
	mutex_exit(&vd->vdev_dtl_lock);
}

/*
 * Determine whether the specified vdev can be offlined/detached/removed
 * without losing data.
 */
boolean_t
vdev_dtl_required(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *tvd = vd->vdev_top;
	uint8_t cant_read = vd->vdev_cant_read;
	boolean_t required;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == spa->spa_root_vdev || vd == tvd)
		return (B_TRUE);

	/*
	 * Temporarily mark the device as unreadable, and then determine
	 * whether this results in any DTL outages in the top-level vdev.
	 * If not, we can safely offline/detach/remove the device.
	 */
	vd->vdev_cant_read = B_TRUE;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
	vd->vdev_cant_read = cant_read;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);

	if (!required && zio_injection_enabled)
		required = !!zio_handle_device_injection(vd, NULL, ECHILD);

	return (required);
}

/*
 * Determine if resilver is needed, and if so the txg range.
 */
boolean_t
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
{
	boolean_t needed = B_FALSE;
	uint64_t thismin = UINT64_MAX;
	uint64_t thismax = 0;

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
		    vdev_writeable(vd)) {

			thismin = vdev_dtl_min(vd);
			thismax = vdev_dtl_max(vd);
			needed = B_TRUE;
		}
		mutex_exit(&vd->vdev_dtl_lock);
	} else {
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			uint64_t cmin, cmax;

			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
				thismin = MIN(thismin, cmin);
				thismax = MAX(thismax, cmax);
				needed = B_TRUE;
			}
		}
	}

	if (needed && minp) {
		*minp = thismin;
		*maxp = thismax;
	}
	return (needed);
}

int
vdev_load(vdev_t *vd)
{
	int error = 0;

	/*
	 * Recursively load all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {
		error = vdev_load(vd->vdev_child[c]);
		if (error != 0) {
			return (error);
		}
	}

	vdev_set_deflate_ratio(vd);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			vdev_dbgmsg(vd, "vdev_load: invalid size. "
			    "ashift=%llu, asize=%llu",
			    (u_longlong_t)vd->vdev_ashift,
			    (u_longlong_t)vd->vdev_asize);
			return (SET_ERROR(ENXIO));
		} else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
			vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
			    "[error=%d]", error);
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (error);
		}
	}

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
		    "[error=%d]", error);
		return (error);
	}

	uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
	if (obsolete_sm_object != 0) {
		objset_t *mos = vd->vdev_spa->spa_meta_objset;
		ASSERT(vd->vdev_asize != 0);
		ASSERT(vd->vdev_obsolete_sm == NULL);

		if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
		    obsolete_sm_object, 0, vd->vdev_asize, 0))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
			    "obsolete spacemap (obj %llu) [error=%d]",
			    (u_longlong_t)obsolete_sm_object, error);
			return (error);
		}
		space_map_update(vd->vdev_obsolete_sm);
	}

	return (0);
}

/*
 * The special vdev case is used for hot spares and l2cache devices.  Its
 * sole purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
 */
int
vdev_validate_aux(vdev_t *vd)
{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if (!vdev_readable(vd))
		return (0);

	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    !SPA_VERSION_IS_SUPPORTED(version) ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (-1);
	}

	/*
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	nvlist_free(label);
	return (0);
}

/*
 * Free the objects used to store this vdev's spacemaps, and the array
 * that points to them.
 */
void
vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
{
	if (vd->vdev_ms_array == 0)
		return;

	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
	size_t array_bytes = array_count * sizeof (uint64_t);
	uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);

	VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
	    array_bytes, smobj_array, 0));

	for (uint64_t i = 0; i < array_count; i++) {
		uint64_t smobj = smobj_array[i];
		if (smobj == 0)
			continue;

		space_map_free_obj(mos, smobj, tx);
	}

	kmem_free(smobj_array, array_bytes);
	VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
	vd->vdev_ms_array = 0;
}

static void
vdev_remove_empty(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	dmu_tx_t *tx;

	ASSERT(vd == vd->vdev_top);
	ASSERT3U(txg, ==, spa_syncing_txg(spa));

	if (vd->vdev_ms != NULL) {
		metaslab_group_t *mg = vd->vdev_mg;

		metaslab_group_histogram_verify(mg);
		metaslab_class_histogram_verify(mg->mg_class);

		for (int m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			if (msp == NULL || msp->ms_sm == NULL)
				continue;

			mutex_enter(&msp->ms_lock);
			/*
			 * If the metaslab was not loaded when the vdev
			 * was removed then the histogram accounting may
			 * not be accurate.  Update the histogram information
			 * here so that we ensure that the metaslab group
			 * and metaslab class are up-to-date.
			 */
			metaslab_group_histogram_remove(mg, msp);

			VERIFY0(space_map_allocated(msp->ms_sm));
			space_map_close(msp->ms_sm);
			msp->ms_sm = NULL;
			mutex_exit(&msp->ms_lock);
		}

		metaslab_group_histogram_verify(mg);
		metaslab_class_histogram_verify(mg->mg_class);
		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			ASSERT0(mg->mg_histogram[i]);
	}

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
	vdev_destroy_spacemaps(vd, tx);

	if (vd->vdev_islog && vd->vdev_top_zap != 0) {
		vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
		vd->vdev_top_zap = 0;
	}
	dmu_tx_commit(tx);
}

void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;
	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));

	ASSERT(vdev_is_concrete(vd));

	while ((msp = txg_list_remove(&vd->vdev_ms_list,
	    TXG_CLEAN(txg))) != NULL)
		metaslab_sync_done(msp, txg);

	if (reassess)
		metaslab_sync_reassess(vd->vdev_mg);
}

void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	dmu_tx_t *tx;

	if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
		dmu_tx_t *tx;

		ASSERT(vd->vdev_removing ||
		    vd->vdev_ops == &vdev_indirect_ops);

		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vdev_indirect_sync_obsolete(vd, tx);
		dmu_tx_commit(tx);

		/*
		 * If the vdev is indirect, it can't have dirty
		 * metaslabs or DTLs.
		 */
		if (vd->vdev_ops == &vdev_indirect_ops) {
			ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
			ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
			return;
		}
	}

	ASSERT(vdev_is_concrete(vd));

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
	    !vd->vdev_removing) {
		ASSERT(vd == vd->vdev_top);
		ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);
		dmu_tx_commit(tx);
	}

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
		metaslab_sync(msp, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
	}

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	/*
	 * Remove the metadata associated with this vdev once it's empty.
	 * Note that this is typically used for log/cache device removal;
	 * we don't empty toplevel vdevs when removing them.  But if
	 * a toplevel happens to be emptied, this is not harmful.
	 */
	if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) {
		vdev_remove_empty(vd, txg);
	}

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}

uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

/*
 * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
 * not be opened, and no I/O is attempted.
 */
int
vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
	vdev_t *vd, *tvd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;

	/*
	 * We don't directly use the aux state here, but if we do a
	 * vdev_reopen(), we need this value to be present to remember why we
	 * were faulted.
	 */
	vd->vdev_label_aux = aux;

	/*
	 * Faulted state takes precedence over degraded.
	 */
	vd->vdev_delayed_close = B_FALSE;
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);

	/*
	 * If this device has the only valid copy of the data, then
	 * back off and simply mark the vdev as degraded instead.
	 */
	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
		vd->vdev_degraded = 1ULL;
		vd->vdev_faulted = 0ULL;

		/*
		 * If we reopen the device and it's not dead, only then do we
		 * mark it degraded.
		 */
		vdev_reopen(tvd);

		if (vdev_readable(vd))
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
	}

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
 * user that something is wrong.  The vdev continues to operate as normal as far
 * as I/O is concerned.
 */
int
vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
	vdev_t *vd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	/*
	 * If the vdev is already faulted, then don't do anything.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded)
		return (spa_vdev_state_exit(spa, NULL, 0));

	vd->vdev_degraded = 1ULL;
	if (!vdev_is_dead(vd))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
		    aux);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Online the given vdev.
 *
 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
 * spare device should be detached when the device finishes resilvering.
 * Second, the online should be treated like a 'test' online case, so no FMA
 * events are generated if the device fails to open.
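 *
 * If 'ZFS_ONLINE_EXPAND' is set (or the pool's autoexpand property is on),
 * a config update is requested afterwards so that any space the device
 * gained while offline can be picked up.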
 */
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
	boolean_t wasoffline;
	vdev_state_t oldstate;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
	oldstate = vd->vdev_state;

	tvd = vd->vdev_top;
	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);

	/* XXX - L2ARC 1.0 does not support expansion */
	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
	}

	vdev_reopen(tvd);
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = B_FALSE;
	}

	if (newstate)
		*newstate = vd->vdev_state;
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {

		/* XXX - L2ARC 1.0 does not support expansion */
		if (vd->vdev_aux)
			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	if (wasoffline ||
	    (oldstate < VDEV_STATE_DEGRADED &&
	    vd->vdev_state >= VDEV_STATE_DEGRADED))
		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);

	return (spa_vdev_state_exit(spa, vd, 0));
}

static int
vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
	vdev_t *vd, *tvd;
	int error = 0;
	uint64_t generation;
	metaslab_group_t *mg;

top:
	spa_vdev_state_enter(spa, SCL_ALLOC);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;
	mg = tvd->vdev_mg;
	generation = spa->spa_config_generation + 1;

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device has the only valid copy of some data,
		 * don't allow it to be offlined.  Log devices are always
		 * expendable.
		 */
		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_dtl_required(vd))
			return (spa_vdev_state_exit(spa, NULL, EBUSY));

		/*
		 * If the top-level is a slog and it has had allocations
		 * then proceed.  We check that the vdev's metaslab group
		 * is not NULL since it's possible that we may have just
		 * added this vdev but not yet initialized its metaslabs.
		 */
		if (tvd->vdev_islog && mg != NULL) {
			/*
			 * Prevent any future allocations.
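			 * If spa_reset_logs() fails, or the config changes
			 * while the state lock is dropped, the group is
			 * reactivated below before we fail or retry the
			 * offline.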
			 */
			metaslab_group_passivate(mg);
			(void) spa_vdev_state_exit(spa, vd, 0);

			error = spa_reset_logs(spa);

			spa_vdev_state_enter(spa, SCL_ALLOC);

			/*
			 * Check to see if the config has changed.
			 */
			if (error || generation != spa->spa_config_generation) {
				metaslab_group_activate(mg);
				if (error)
					return (spa_vdev_state_exit(spa,
					    vd, error));
				(void) spa_vdev_state_exit(spa, vd, 0);
				goto top;
			}
			ASSERT0(tvd->vdev_stat.vs_alloc);
		}

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If the top-level vdev is a log device then just offline
		 * it.  Otherwise, if this action results in the top-level
		 * vdev becoming unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(tvd);

		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_is_dead(tvd)) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(tvd);
			return (spa_vdev_state_exit(spa, NULL, EBUSY));
		}

		/*
		 * Add the device back into the metaslab rotor so that
		 * once we online the device it's open for business.
		 */
		if (tvd->vdev_islog && mg != NULL)
			metaslab_group_activate(mg);
	}

	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

	return (spa_vdev_state_exit(spa, vd, 0));
}

int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
	int error;

	mutex_enter(&spa->spa_vdev_top_lock);
	error = vdev_offline_locked(spa, guid, flags);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked.  We also clear all
 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
 */
void
vdev_clear(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == NULL)
		vd = rvd;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c]);

	if (vd == rvd) {
		for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
			vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);

		for (int c = 0; c < spa->spa_spares.sav_count; c++)
			vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
	}

	/*
	 * It makes no sense to "clear" an indirect vdev.
	 */
	if (!vdev_is_concrete(vd))
		return;

	/*
	 * If we're in the FAULTED state or have experienced failed I/O, then
	 * clear the persistent state and attempt to reopen the device.  We
	 * also mark the vdev config dirty, so that the new faulted state is
	 * written out to disk.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded ||
	    !vdev_readable(vd) || !vdev_writeable(vd)) {

		/*
		 * When reopening in response to a clear event, it may be due
		 * to a fmadm repair request.  In this case, if the device is
		 * still broken, we want to still post the ereport again.
		 */
		vd->vdev_forcefault = B_TRUE;

		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
		vd->vdev_cant_read = B_FALSE;
		vd->vdev_cant_write = B_FALSE;

		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);

		vd->vdev_forcefault = B_FALSE;

		if (vd != rvd && vdev_writeable(vd->vdev_top))
			vdev_state_dirty(vd->vdev_top);

		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
	}

	/*
	 * When clearing a FMA-diagnosed fault, we always want to
	 * unspare the device, as we assume that the original spare was
	 * done in response to the FMA fault.
	 */
	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;
}

boolean_t
vdev_is_dead(vdev_t *vd)
{
	/*
	 * Holes and missing devices are always considered "dead".
	 * This simplifies the code since we don't have to check for
	 * these types of devices in the various code paths.
	 * Instead we rely on the fact that we skip over dead devices
	 * before issuing I/O to them.
	 */
	return (vd->vdev_state < VDEV_STATE_DEGRADED ||
	    vd->vdev_ops == &vdev_hole_ops ||
	    vd->vdev_ops == &vdev_missing_ops);
}

boolean_t
vdev_readable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
}

boolean_t
vdev_writeable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
	    vdev_is_concrete(vd));
}

boolean_t
vdev_allocatable(vdev_t *vd)
{
	uint64_t state = vd->vdev_state;

	/*
	 * We currently allow allocations from vdevs which may be in the
	 * process of reopening (i.e. VDEV_STATE_CLOSED).  If the device
	 * fails to reopen then we'll catch it later when we're holding
	 * the proper locks.  Note that we have to get the vdev state
	 * in a local variable because although it changes atomically,
	 * we're asking two separate questions about it.
	 */
	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
	    !vd->vdev_cant_write && vdev_is_concrete(vd) &&
	    vd->vdev_mg->mg_initialized);
}

boolean_t
vdev_accessible(vdev_t *vd, zio_t *zio)
{
	ASSERT(zio->io_vd == vd);

	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
		return (B_FALSE);

	if (zio->io_type == ZIO_TYPE_READ)
		return (!vd->vdev_cant_read);

	if (zio->io_type == ZIO_TYPE_WRITE)
		return (!vd->vdev_cant_write);

	return (B_TRUE);
}

/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd = vd->vdev_top;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_min_asize(vd);
	if (vd->vdev_ops->vdev_op_leaf)
		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	/*
	 * Report expandable space on top-level, non-auxiliary devices only.
	 * The expandable space is reported in terms of metaslab sized units
	 * since that determines how much space the pool can expand.
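	 * For example, with 8 GiB metaslabs (ms_shift = 33), 10 GiB of
	 * raw headroom is reported as 8 GiB of expandable space, since
	 * P2ALIGN() below discards the partial metaslab.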
	 */
	if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
		vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
		    spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
	}
	vs->vs_configured_ashift = vd->vdev_top != NULL ?
	    vd->vdev_top->vdev_ashift : vd->vdev_ashift;
	vs->vs_logical_ashift = vd->vdev_logical_ashift;
	vs->vs_physical_ashift = vd->vdev_physical_ashift;
	if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
	    vdev_is_concrete(vd)) {
		vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
	}

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
	if (vd == rvd) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			for (int t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			}
			cvs->vs_scan_removing = cvd->vdev_removing;
		}
	}
	mutex_exit(&vd->vdev_stat_lock);
}

void
vdev_clear_stats(vdev_t *vd)
{
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space = 0;
	vd->vdev_stat.vs_dspace = 0;
	vd->vdev_stat.vs_alloc = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

void
vdev_scan_stat_init(vdev_t *vd)
{
	vdev_stat_t *vs = &vd->vdev_stat;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_scan_stat_init(vd->vdev_child[c]);

	mutex_enter(&vd->vdev_stat_lock);
	vs->vs_scan_processed = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
	spa_t *spa = zio->io_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	/*
	 * If this i/o is a gang leader, it didn't do any actual work.
	 */
	if (zio->io_gang_tree)
		return;

	if (zio->io_error == 0) {
		/*
		 * If this is a root i/o, don't count it -- we've already
		 * counted the top-level vdevs, and vdev_get_stats() will
		 * aggregate them when asked.  This reduces contention on
		 * the root vdev_stat_lock and implicitly handles blocks
		 * that compress away to holes, for which there is no i/o.
		 * (Holes never create vdev children, so all the counters
		 * remain zero, which is what we want.)
		 *
		 * Note: this only applies to successful i/o (io_error == 0)
		 * because unlike i/o counts, errors are not additive.
		 * When reading a ditto block, for example, failure of
		 * one top-level vdev does not imply a root-level error.
		 */
		if (vd == rvd)
			return;

		ASSERT(vd == zio->io_vd);

		if (flags & ZIO_FLAG_IO_BYPASS)
			return;

		mutex_enter(&vd->vdev_stat_lock);

		if (flags & ZIO_FLAG_IO_REPAIR) {
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				dsl_scan_phys_t *scn_phys =
				    &spa->spa_dsl_pool->dp_scan->scn_phys;
				uint64_t *processed = &scn_phys->scn_processed;

				/* XXX cleanup? */
				if (vd->vdev_ops->vdev_op_leaf)
					atomic_add_64(processed, psize);
				vs->vs_scan_processed += psize;
			}

			if (flags & ZIO_FLAG_SELF_HEAL)
				vs->vs_self_healed += psize;
		}

		vs->vs_ops[type]++;
		vs->vs_bytes[type] += psize;

		mutex_exit(&vd->vdev_stat_lock);
		return;
	}

	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	/*
	 * If this is an I/O error that is going to be retried, then ignore the
	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
	 * hard errors, when in reality they can happen for any number of
	 * innocuous reasons (bus resets, MPxIO link failure, etc).
	 */
	if (zio->io_error == EIO &&
	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
		return;

	/*
	 * Intent log writes won't propagate their error to the root
	 * I/O so don't mark these types of failures as pool-level
	 * errors.
	 */
	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		return;

	mutex_enter(&vd->vdev_stat_lock);
	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
		if (zio->io_error == ECKSUM)
			vs->vs_checksum_errors++;
		else
			vs->vs_read_errors++;
	}
	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
		vs->vs_write_errors++;
	mutex_exit(&vd->vdev_stat_lock);

	if (spa->spa_load_state == SPA_LOAD_NONE &&
	    type == ZIO_TYPE_WRITE && txg != 0 &&
	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
	    (flags & ZIO_FLAG_SCAN_THREAD) ||
	    spa->spa_claiming)) {
		/*
		 * This is either a normal write (not a repair), or it's
		 * a repair induced by the scrub thread, or it's a repair
		 * made by zil_claim() during spa_load() in the first txg.
		 * In the normal case, we commit the DTL change in the same
		 * txg as the block was born.  In the scrub-induced repair
		 * case, we know that scrubs run in first-pass syncing context,
		 * so we commit the DTL change in spa_syncing_txg(spa).
		 * In the zil_claim() case, we commit in spa_first_txg(spa).
		 *
		 * We currently do not make DTL entries for failed spontaneous
		 * self-healing writes triggered by normal (non-scrubbing)
		 * reads, because we have no transactional context in which to
		 * do so -- and it's not clear that it'd be desirable anyway.
		 */
		if (vd->vdev_ops->vdev_op_leaf) {
			uint64_t commit_txg = txg;
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				ASSERT(spa_sync_pass(spa) == 1);
				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
				commit_txg = spa_syncing_txg(spa);
			} else if (spa->spa_claiming) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				commit_txg = spa_first_txg(spa);
			}
			ASSERT(commit_txg >= spa_syncing_txg(spa));
			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
				return;
			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
		}
		if (vd != rvd)
			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
	}
}

/*
 * Update the in-core space usage stats for this vdev, its metaslab class,
 * and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
    int64_t space_delta)
{
	int64_t dspace_delta = space_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_group_t *mg = vd->vdev_mg;
	metaslab_class_t *mc = mg ? mg->mg_class : NULL;

	ASSERT(vd == vd->vdev_top);

	/*
	 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
	 * factor.  We must calculate this here and not at the root vdev
	 * because the root vdev's psize-to-asize is simply the max of its
	 * children's, thus not accurate enough for us.
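	 *
	 * Roughly speaking, vdev_deflate_ratio rescales a raw (asize)
	 * delta into the deflated units used for accounting, so that
	 * vs_dspace on a RAID-Z vdev tracks usable rather than raw
	 * capacity.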
mg->mg_class : NULL; 3291 3292 ASSERT(vd == vd->vdev_top); 3293 3294 /* 3295 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion 3296 * factor. We must calculate this here and not at the root vdev 3297 * because the root vdev's psize-to-asize is simply the max of its 3298 * children's, and thus not accurate enough for us. 3299 */ 3300 ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 3301 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); 3302 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 3303 vd->vdev_deflate_ratio; 3304 3305 mutex_enter(&vd->vdev_stat_lock); 3306 vd->vdev_stat.vs_alloc += alloc_delta; 3307 vd->vdev_stat.vs_space += space_delta; 3308 vd->vdev_stat.vs_dspace += dspace_delta; 3309 mutex_exit(&vd->vdev_stat_lock); 3310 3311 if (mc == spa_normal_class(spa)) { 3312 mutex_enter(&rvd->vdev_stat_lock); 3313 rvd->vdev_stat.vs_alloc += alloc_delta; 3314 rvd->vdev_stat.vs_space += space_delta; 3315 rvd->vdev_stat.vs_dspace += dspace_delta; 3316 mutex_exit(&rvd->vdev_stat_lock); 3317 } 3318 3319 if (mc != NULL) { 3320 ASSERT(rvd == vd->vdev_parent); 3321 ASSERT(vd->vdev_ms_count != 0); 3322 3323 metaslab_class_space_update(mc, 3324 alloc_delta, defer_delta, space_delta, dspace_delta); 3325 } 3326} 3327 3328/* 3329 * Mark a top-level vdev's config as dirty, placing it on the dirty list 3330 * so that it will be written out next time the vdev configuration is synced. 3331 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 3332 */ 3333 void 3334 vdev_config_dirty(vdev_t *vd) 3335 { 3336 spa_t *spa = vd->vdev_spa; 3337 vdev_t *rvd = spa->spa_root_vdev; 3338 int c; 3339 3340 ASSERT(spa_writeable(spa)); 3341 3342 /* 3343 * If this is an aux vdev (as with l2cache and spare devices), then we 3344 * update the vdev config manually and set the sync flag. 3345 */ 3346 if (vd->vdev_aux != NULL) { 3347 spa_aux_vdev_t *sav = vd->vdev_aux; 3348 nvlist_t **aux; 3349 uint_t naux; 3350 3351 for (c = 0; c < sav->sav_count; c++) { 3352 if (sav->sav_vdevs[c] == vd) 3353 break; 3354 } 3355 3356 if (c == sav->sav_count) { 3357 /* 3358 * We're being removed. There's nothing more to do. 3359 */ 3360 ASSERT(sav->sav_sync == B_TRUE); 3361 return; 3362 } 3363 3364 sav->sav_sync = B_TRUE; 3365 3366 if (nvlist_lookup_nvlist_array(sav->sav_config, 3367 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { 3368 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 3369 ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); 3370 } 3371 3372 ASSERT(c < naux); 3373 3374 /* 3375 * Setting the nvlist in the middle of the array is a little 3376 * sketchy, but it will work. 3377 */ 3378 nvlist_free(aux[c]); 3379 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); 3380 3381 return; 3382 } 3383 3384 /* 3385 * The dirty list is protected by the SCL_CONFIG lock. The caller 3386 * must either hold SCL_CONFIG as writer, or must be the sync thread 3387 * (which holds SCL_CONFIG as reader). There's only one sync thread, 3388 * so this is sufficient to ensure mutual exclusion. 
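 *
 * A minimal caller sketch (assumed context, not taken from this
 * file) for a thread other than the sync thread:
 *
 *	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
 *	vdev_config_dirty(tvd);
 *	spa_config_exit(spa, SCL_CONFIG, FTAG);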
3389 */ 3390 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3391 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3392 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3393 3394 if (vd == rvd) { 3395 for (c = 0; c < rvd->vdev_children; c++) 3396 vdev_config_dirty(rvd->vdev_child[c]); 3397 } else { 3398 ASSERT(vd == vd->vdev_top); 3399 3400 if (!list_link_active(&vd->vdev_config_dirty_node) && 3401 vdev_is_concrete(vd)) { 3402 list_insert_head(&spa->spa_config_dirty_list, vd); 3403 } 3404 } 3405} 3406 3407void 3408vdev_config_clean(vdev_t *vd) 3409{ 3410 spa_t *spa = vd->vdev_spa; 3411 3412 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3413 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3414 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3415 3416 ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 3417 list_remove(&spa->spa_config_dirty_list, vd); 3418} 3419 3420/* 3421 * Mark a top-level vdev's state as dirty, so that the next pass of 3422 * spa_sync() can convert this into vdev_config_dirty(). We distinguish 3423 * the state changes from larger config changes because they require 3424 * much less locking, and are often needed for administrative actions. 3425 */ 3426void 3427vdev_state_dirty(vdev_t *vd) 3428{ 3429 spa_t *spa = vd->vdev_spa; 3430 3431 ASSERT(spa_writeable(spa)); 3432 ASSERT(vd == vd->vdev_top); 3433 3434 /* 3435 * The state list is protected by the SCL_STATE lock. The caller 3436 * must either hold SCL_STATE as writer, or must be the sync thread 3437 * (which holds SCL_STATE as reader). There's only one sync thread, 3438 * so this is sufficient to ensure mutual exclusion. 3439 */ 3440 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3441 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3442 spa_config_held(spa, SCL_STATE, RW_READER))); 3443 3444 if (!list_link_active(&vd->vdev_state_dirty_node) && 3445 vdev_is_concrete(vd)) 3446 list_insert_head(&spa->spa_state_dirty_list, vd); 3447} 3448 3449void 3450vdev_state_clean(vdev_t *vd) 3451{ 3452 spa_t *spa = vd->vdev_spa; 3453 3454 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3455 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3456 spa_config_held(spa, SCL_STATE, RW_READER))); 3457 3458 ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 3459 list_remove(&spa->spa_state_dirty_list, vd); 3460} 3461 3462/* 3463 * Propagate vdev state up from children to parent. 3464 */ 3465void 3466vdev_propagate_state(vdev_t *vd) 3467{ 3468 spa_t *spa = vd->vdev_spa; 3469 vdev_t *rvd = spa->spa_root_vdev; 3470 int degraded = 0, faulted = 0; 3471 int corrupted = 0; 3472 vdev_t *child; 3473 3474 if (vd->vdev_children > 0) { 3475 for (int c = 0; c < vd->vdev_children; c++) { 3476 child = vd->vdev_child[c]; 3477 3478 /* 3479 * Don't factor holes or indirect vdevs into the 3480 * decision. 3481 */ 3482 if (!vdev_is_concrete(child)) 3483 continue; 3484 3485 if (!vdev_readable(child) || 3486 (!vdev_writeable(child) && spa_writeable(spa))) { 3487 /* 3488 * Root special: if there is a top-level log 3489 * device, treat the root vdev as if it were 3490 * degraded. 
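 * A worked example (a sketch, not exhaustive): with two healthy
 * mirror top-levels and one unreadable top-level log, the root
 * counts faulted = 0, degraded = 1, so the pool reports DEGRADED
 * instead of treating a dead slog as loss of the whole tree.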
3491 */ 3492 if (child->vdev_islog && vd == rvd) 3493 degraded++; 3494 else 3495 faulted++; 3496 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 3497 degraded++; 3498 } 3499 3500 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 3501 corrupted++; 3502 } 3503 3504 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 3505 3506 /* 3507 * Root special: if there is a top-level vdev that cannot be 3508 * opened due to corrupted metadata, then propagate the root 3509 * vdev's aux state as 'corrupt' rather than 'insufficient 3510 * replicas'. 3511 */ 3512 if (corrupted && vd == rvd && 3513 rvd->vdev_state == VDEV_STATE_CANT_OPEN) 3514 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 3515 VDEV_AUX_CORRUPT_DATA); 3516 } 3517 3518 if (vd->vdev_parent) 3519 vdev_propagate_state(vd->vdev_parent); 3520} 3521 3522/* 3523 * Set a vdev's state. If this is during an open, we don't update the parent 3524 * state, because we're in the process of opening children depth-first. 3525 * Otherwise, we propagate the change to the parent. 3526 * 3527 * If this routine places a device in a faulted state, an appropriate ereport is 3528 * generated. 3529 */ 3530 void 3531 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 3532 { 3533 uint64_t save_state; 3534 spa_t *spa = vd->vdev_spa; 3535 3536 if (state == vd->vdev_state) { 3537 vd->vdev_stat.vs_aux = aux; 3538 return; 3539 } 3540 3541 save_state = vd->vdev_state; 3542 3543 vd->vdev_state = state; 3544 vd->vdev_stat.vs_aux = aux; 3545 3546 /* 3547 * If we are setting the vdev state to anything but an open state, then 3548 * always close the underlying device unless the device has requested 3549 * a delayed close (i.e. we're about to remove or fault the device). 3550 * Otherwise, we keep accessible but invalid devices open forever. 3551 * We don't call vdev_close() itself, because that implies some extra 3552 * checks (offline, etc) that we don't want here. This is limited to 3553 * leaf devices, because otherwise closing the device will affect other 3554 * children. 3555 */ 3556 if (!vd->vdev_delayed_close && vdev_is_dead(vd) && 3557 vd->vdev_ops->vdev_op_leaf) 3558 vd->vdev_ops->vdev_op_close(vd); 3559 3560 if (vd->vdev_removed && 3561 state == VDEV_STATE_CANT_OPEN && 3562 (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 3563 /* 3564 * If the previous state is set to VDEV_STATE_REMOVED, then this 3565 * device was previously marked removed and someone attempted to 3566 * reopen it. If this failed due to a nonexistent device, then 3567 * keep the device in the REMOVED state. We also leave it in 3568 * this state if this is one of our special test online cases, 3569 * which are only attempting to online the device and shouldn't 3570 * generate an FMA fault. 3571 */ 3572 vd->vdev_state = VDEV_STATE_REMOVED; 3573 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 3574 } else if (state == VDEV_STATE_REMOVED) { 3575 vd->vdev_removed = B_TRUE; 3576 } else if (state == VDEV_STATE_CANT_OPEN) { 3577 /* 3578 * If we fail to open a vdev during an import or recovery, we 3579 * mark it as "not available", which signifies that it was 3580 * never there to begin with. Failure to open such a device 3581 * is not considered an error. 3582 */ 3583 if ((spa_load_state(spa) == SPA_LOAD_IMPORT || 3584 spa_load_state(spa) == SPA_LOAD_RECOVER) && 3585 vd->vdev_ops->vdev_op_leaf) 3586 vd->vdev_not_present = 1; 3587 3588 /* 3589 * Post the appropriate ereport. 
If the 'prevstate' field is 3590 * set to something other than VDEV_STATE_UNKNOWN, it indicates 3591 * that this is part of a vdev_reopen(). In this case, we don't 3592 * want to post the ereport if the device was already in the 3593 * CANT_OPEN state beforehand. 3594 * 3595 * If the 'checkremove' flag is set, then this is an attempt to 3596 * online the device in response to an insertion event. If we 3597 * hit this case, then we have detected an insertion event for a 3598 * faulted or offline device that wasn't in the removed state. 3599 * In this scenario, we don't post an ereport because we are 3600 * about to replace the device, or attempt an online with 3601 * vdev_forcefault, which will generate the fault for us. 3602 */ 3603 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 3604 !vd->vdev_not_present && !vd->vdev_checkremove && 3605 vd != spa->spa_root_vdev) { 3606 const char *class; 3607 3608 switch (aux) { 3609 case VDEV_AUX_OPEN_FAILED: 3610 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 3611 break; 3612 case VDEV_AUX_CORRUPT_DATA: 3613 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 3614 break; 3615 case VDEV_AUX_NO_REPLICAS: 3616 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 3617 break; 3618 case VDEV_AUX_BAD_GUID_SUM: 3619 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 3620 break; 3621 case VDEV_AUX_TOO_SMALL: 3622 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 3623 break; 3624 case VDEV_AUX_BAD_LABEL: 3625 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 3626 break; 3627 default: 3628 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 3629 } 3630 3631 zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 3632 } 3633 3634 /* Erase any notion of persistent removed state */ 3635 vd->vdev_removed = B_FALSE; 3636 } else { 3637 vd->vdev_removed = B_FALSE; 3638 } 3639 3640 /* 3641 * Notify the fmd of the state change. Be verbose and post 3642 * notifications even for stuff that's not important; the fmd agent can 3643 * sort it out. Don't emit state change events for non-leaf vdevs since 3644 * they can't change state on their own. The FMD can check their state 3645 * if it wants to when it sees that a leaf vdev had a state change. 3646 */ 3647 if (vd->vdev_ops->vdev_op_leaf) 3648 zfs_post_state_change(spa, vd); 3649 3650 if (!isopen && vd->vdev_parent) 3651 vdev_propagate_state(vd->vdev_parent); 3652} 3653 3654/* 3655 * Check the vdev configuration to ensure that it's capable of supporting 3656 * a root pool. We do not support partial configuration. 3657 * In addition, only a single top-level vdev is allowed. 3658 * 3659 * FreeBSD does not have the above limitations. 
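 *
 * Expected outcomes under the illumos rules below (a sketch, not an
 * exhaustive list):
 *
 *	one top-level mirror of two disks	bootable
 *	two top-level vdevs			not bootable (children > 1)
 *	any missing or indirect vdev		not bootable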
3660 */ 3661boolean_t 3662vdev_is_bootable(vdev_t *vd) 3663{ 3664#ifdef illumos 3665 if (!vd->vdev_ops->vdev_op_leaf) { 3666 char *vdev_type = vd->vdev_ops->vdev_op_type; 3667 3668 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 3669 vd->vdev_children > 1) { 3670 return (B_FALSE); 3671 } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || 3672 strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { 3673 return (B_FALSE); 3674 } 3675 } 3676 3677 for (int c = 0; c < vd->vdev_children; c++) { 3678 if (!vdev_is_bootable(vd->vdev_child[c])) 3679 return (B_FALSE); 3680 } 3681#endif /* illumos */ 3682 return (B_TRUE); 3683} 3684 3685boolean_t 3686vdev_is_concrete(vdev_t *vd) 3687{ 3688 vdev_ops_t *ops = vd->vdev_ops; 3689 if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || 3690 ops == &vdev_missing_ops || ops == &vdev_root_ops) { 3691 return (B_FALSE); 3692 } else { 3693 return (B_TRUE); 3694 } 3695} 3696 3697/* 3698 * Load the state from the original vdev tree (ovd) which 3699 * we've retrieved from the MOS config object. If the original 3700 * vdev was offline or faulted then we transfer that state to the 3701 * device in the current vdev tree (nvd). 3702 */ 3703void 3704vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3705{ 3706 spa_t *spa = nvd->vdev_spa; 3707 3708 ASSERT(nvd->vdev_top->vdev_islog); 3709 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3710 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3711 3712 for (int c = 0; c < nvd->vdev_children; c++) 3713 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3714 3715 if (nvd->vdev_ops->vdev_op_leaf) { 3716 /* 3717 * Restore the persistent vdev state 3718 */ 3719 nvd->vdev_offline = ovd->vdev_offline; 3720 nvd->vdev_faulted = ovd->vdev_faulted; 3721 nvd->vdev_degraded = ovd->vdev_degraded; 3722 nvd->vdev_removed = ovd->vdev_removed; 3723 } 3724} 3725 3726/* 3727 * Determine if a log device has valid content. If the vdev was 3728 * removed or faulted in the MOS config then we know that 3729 * the content on the log device has already been written to the pool. 3730 */ 3731boolean_t 3732vdev_log_state_valid(vdev_t *vd) 3733{ 3734 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3735 !vd->vdev_removed) 3736 return (B_TRUE); 3737 3738 for (int c = 0; c < vd->vdev_children; c++) 3739 if (vdev_log_state_valid(vd->vdev_child[c])) 3740 return (B_TRUE); 3741 3742 return (B_FALSE); 3743} 3744 3745/* 3746 * Expand a vdev if possible. 3747 */ 3748void 3749vdev_expand(vdev_t *vd, uint64_t txg) 3750{ 3751 ASSERT(vd->vdev_top == vd); 3752 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3753 3754 vdev_set_deflate_ratio(vd); 3755 3756 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && 3757 vdev_is_concrete(vd)) { 3758 VERIFY(vdev_metaslab_init(vd, txg) == 0); 3759 vdev_config_dirty(vd); 3760 } 3761} 3762 3763/* 3764 * Split a vdev. 
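 * A sketch of the tree transformation, assuming vd is one half of a
 * two-way mirror being detached by a pool split:
 *
 *	before:	root -> mirror(cvd, vd)
 *	after:	root -> cvd	(vd removed; the one-child mirror is
 *				collapsed around cvd, which is then
 *				flagged vdev_splitting)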
3765 */ 3766 void 3767 vdev_split(vdev_t *vd) 3768 { 3769 vdev_t *cvd, *pvd = vd->vdev_parent; 3770 3771 vdev_remove_child(pvd, vd); 3772 vdev_compact_children(pvd); 3773 3774 cvd = pvd->vdev_child[0]; 3775 if (pvd->vdev_children == 1) { 3776 vdev_remove_parent(cvd); 3777 cvd->vdev_splitting = B_TRUE; 3778 } 3779 vdev_propagate_state(cvd); 3780} 3781 3782 void 3783 vdev_deadman(vdev_t *vd) 3784 { 3785 for (int c = 0; c < vd->vdev_children; c++) { 3786 vdev_t *cvd = vd->vdev_child[c]; 3787 3788 vdev_deadman(cvd); 3789 } 3790 3791 if (vd->vdev_ops->vdev_op_leaf) { 3792 vdev_queue_t *vq = &vd->vdev_queue; 3793 3794 mutex_enter(&vq->vq_lock); 3795 if (avl_numnodes(&vq->vq_active_tree) > 0) { 3796 spa_t *spa = vd->vdev_spa; 3797 zio_t *fio; 3798 uint64_t delta; 3799 3800 /* 3801 * Look at the head of all the pending queues; 3802 * if any I/O has been outstanding for longer than 3803 * spa_deadman_synctime, we panic the system. 3804 */ 3805 fio = avl_first(&vq->vq_active_tree); 3806 delta = gethrtime() - fio->io_timestamp; 3807 if (delta > spa_deadman_synctime(spa)) {
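				/*
				 * A note on units (a sketch; the default below
				 * is an assumption): delta is in nanoseconds,
				 * since io_timestamp and gethrtime() are both
				 * hrtime_t values.  With the deadman tunable
				 * (vfs.zfs.deadman_synctime_ms) at its assumed
				 * stock default of 1000000 ms, an I/O must
				 * therefore sit on the active tree for ~1000
				 * seconds before this fires.
				 */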
3808 zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " 3809 "delta %lluns, last io %lluns", 3810 (u_longlong_t)fio->io_timestamp, (u_longlong_t)delta,
3811 (u_longlong_t)vq->vq_io_complete_ts); 3812 fm_panic("I/O to pool '%s' appears to be " 3813 "hung on vdev guid %llu at '%s'.", 3814 spa_name(spa), 3815 (u_longlong_t)vd->vdev_guid, 3816 vd->vdev_path); 3817 } 3818 } 3819 mutex_exit(&vq->vq_lock); 3820 } 3821}
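
/*
 * Usage sketch (an assumed caller, not part of the original file): taking
 * a consistent snapshot of a vdev's stats requires at least one config
 * lock, per the ASSERT in vdev_get_stats():
 *
 *	vdev_stat_t vs;
 *
 *	spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
 *	vdev_get_stats(vd, &vs);
 *	spa_config_exit(spa, SCL_ALL, FTAG);
 *	zfs_dbgmsg("vdev %llu: %llu read errors",
 *	    (u_longlong_t)vd->vdev_guid, (u_longlong_t)vs.vs_read_errors);
 */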