vdev_removal.c revision 339105
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/spa_impl.h> 29#include <sys/dmu.h> 30#include <sys/dmu_tx.h> 31#include <sys/zap.h> 32#include <sys/vdev_impl.h> 33#include <sys/metaslab.h> 34#include <sys/metaslab_impl.h> 35#include <sys/uberblock_impl.h> 36#include <sys/txg.h> 37#include <sys/avl.h> 38#include <sys/bpobj.h> 39#include <sys/dsl_pool.h> 40#include <sys/dsl_synctask.h> 41#include <sys/dsl_dir.h> 42#include <sys/arc.h> 43#include <sys/zfeature.h> 44#include <sys/vdev_indirect_births.h> 45#include <sys/vdev_indirect_mapping.h> 46#include <sys/abd.h> 47 48/* 49 * This file contains the necessary logic to remove vdevs from a 50 * storage pool. Currently, the only devices that can be removed 51 * are log, cache, and spare devices; and top level vdevs from a pool 52 * w/o raidz. (Note that members of a mirror can also be removed 53 * by the detach operation.) 54 * 55 * Log vdevs are removed by evacuating them and then turning the vdev 56 * into a hole vdev while holding spa config locks. 57 * 58 * Top level vdevs are removed and converted into an indirect vdev via 59 * a multi-step process: 60 * 61 * - Disable allocations from this device (spa_vdev_remove_top). 62 * 63 * - From a new thread (spa_vdev_remove_thread), copy data from 64 * the removing vdev to a different vdev. The copy happens in open 65 * context (spa_vdev_copy_impl) and issues a sync task 66 * (vdev_mapping_sync) so the sync thread can update the partial 67 * indirect mappings in core and on disk. 68 * 69 * - If a free happens during a removal, it is freed from the 70 * removing vdev, and if it has already been copied, from the new 71 * location as well (free_from_removing_vdev). 72 * 73 * - After the removal is completed, the copy thread converts the vdev 74 * into an indirect vdev (vdev_remove_complete) before instructing 75 * the sync thread to destroy the space maps and finish the removal 76 * (spa_finish_removal). 77 */ 78 79typedef struct vdev_copy_arg { 80 metaslab_t *vca_msp; 81 uint64_t vca_outstanding_bytes; 82 kcondvar_t vca_cv; 83 kmutex_t vca_lock; 84} vdev_copy_arg_t; 85 86typedef struct vdev_copy_seg_arg { 87 vdev_copy_arg_t *vcsa_copy_arg; 88 uint64_t vcsa_txg; 89 dva_t *vcsa_dest_dva; 90 blkptr_t *vcsa_dest_bp; 91} vdev_copy_seg_arg_t; 92 93/* 94 * The maximum amount of allowed data we're allowed to copy from a device 95 * at a time when removing it. 
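 * (This caps the copy i/o that can be outstanding at once:
 * spa_vdev_remove_thread() waits on vca_cv whenever vca_outstanding_bytes
 * exceeds this value before issuing further copy reads.)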
96 */ 97int zfs_remove_max_copy_bytes = 8 * 1024 * 1024; 98 99/* 100 * The largest contiguous segment that we will attempt to allocate when 101 * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If 102 * there is a performance problem with attempting to allocate large blocks, 103 * consider decreasing this. 104 * 105 * Note: we will issue I/Os of up to this size. The mpt driver does not 106 * respond well to I/Os larger than 1MB, so we set this to 1MB. (When 107 * mpt processes an I/O larger than 1MB, it needs to do an allocation of 108 * 2 physically contiguous pages; if this allocation fails, mpt will drop 109 * the I/O and hang the device.) 110 */ 111int zfs_remove_max_segment = 1024 * 1024; 112 113/* 114 * This is used by the test suite so that it can ensure that certain 115 * actions happen while in the middle of a removal. 116 */ 117uint64_t zfs_remove_max_bytes_pause = UINT64_MAX; 118 119#define VDEV_REMOVAL_ZAP_OBJS "lzap" 120 121static void spa_vdev_remove_thread(void *arg); 122 123static void 124spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) 125{ 126 VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, 127 DMU_POOL_DIRECTORY_OBJECT, 128 DMU_POOL_REMOVING, sizeof (uint64_t), 129 sizeof (spa->spa_removing_phys) / sizeof (uint64_t), 130 &spa->spa_removing_phys, tx)); 131} 132 133static nvlist_t * 134spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 135{ 136 for (int i = 0; i < count; i++) { 137 uint64_t guid = 138 fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); 139 140 if (guid == target_guid) 141 return (nvpp[i]); 142 } 143 144 return (NULL); 145} 146 147static void 148spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 149 nvlist_t *dev_to_remove) 150{ 151 nvlist_t **newdev = NULL; 152 153 if (count > 1) 154 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 155 156 for (int i = 0, j = 0; i < count; i++) { 157 if (dev[i] == dev_to_remove) 158 continue; 159 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 160 } 161 162 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 163 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 164 165 for (int i = 0; i < count - 1; i++) 166 nvlist_free(newdev[i]); 167 168 if (count > 1) 169 kmem_free(newdev, (count - 1) * sizeof (void *)); 170} 171 172static spa_vdev_removal_t * 173spa_vdev_removal_create(vdev_t *vd) 174{ 175 spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); 176 mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); 177 cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); 178 svr->svr_allocd_segs = range_tree_create(NULL, NULL); 179 svr->svr_vdev = vd; 180 181 for (int i = 0; i < TXG_SIZE; i++) { 182 svr->svr_frees[i] = range_tree_create(NULL, NULL); 183 list_create(&svr->svr_new_segments[i], 184 sizeof (vdev_indirect_mapping_entry_t), 185 offsetof(vdev_indirect_mapping_entry_t, vime_node)); 186 } 187 188 return (svr); 189} 190 191void 192spa_vdev_removal_destroy(spa_vdev_removal_t *svr) 193{ 194 for (int i = 0; i < TXG_SIZE; i++) { 195 ASSERT0(svr->svr_bytes_done[i]); 196 ASSERT0(svr->svr_max_offset_to_sync[i]); 197 range_tree_destroy(svr->svr_frees[i]); 198 list_destroy(&svr->svr_new_segments[i]); 199 } 200 201 range_tree_destroy(svr->svr_allocd_segs); 202 mutex_destroy(&svr->svr_lock); 203 cv_destroy(&svr->svr_cv); 204 kmem_free(svr, sizeof (*svr)); 205} 206 207/* 208 * This is called as a synctask in the txg in which we will mark this vdev 209 * as removing (in the config stored in the MOS). 
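 * The synctask is dispatched from spa_vdev_remove_top() via
 * dsl_sync_task_nowait(), after allocations from the device have been
 * stopped and any log blocks have been evacuated.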
210 * 211 * It begins the evacuation of a toplevel vdev by: 212 * - initializing the spa_removing_phys which tracks this removal 213 * - computing the amount of space to remove for accounting purposes 214 * - dirtying all dbufs in the spa_config_object 215 * - creating the spa_vdev_removal 216 * - starting the spa_vdev_remove_thread 217 */ 218static void 219vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) 220{ 221 vdev_t *vd = arg; 222 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 223 spa_t *spa = vd->vdev_spa; 224 objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; 225 spa_vdev_removal_t *svr = NULL; 226 uint64_t txg = dmu_tx_get_txg(tx); 227 228 ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); 229 svr = spa_vdev_removal_create(vd); 230 231 ASSERT(vd->vdev_removing); 232 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 233 234 spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); 235 if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 236 /* 237 * By activating the OBSOLETE_COUNTS feature, we prevent 238 * the pool from being downgraded and ensure that the 239 * refcounts are precise. 240 */ 241 spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 242 uint64_t one = 1; 243 VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, 244 VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, 245 &one, tx)); 246 ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0); 247 } 248 249 vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); 250 vd->vdev_indirect_mapping = 251 vdev_indirect_mapping_open(mos, vic->vic_mapping_object); 252 vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); 253 vd->vdev_indirect_births = 254 vdev_indirect_births_open(mos, vic->vic_births_object); 255 spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; 256 spa->spa_removing_phys.sr_start_time = gethrestime_sec(); 257 spa->spa_removing_phys.sr_end_time = 0; 258 spa->spa_removing_phys.sr_state = DSS_SCANNING; 259 spa->spa_removing_phys.sr_to_copy = 0; 260 spa->spa_removing_phys.sr_copied = 0; 261 262 /* 263 * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because 264 * there may be space in the defer tree, which is free, but still 265 * counted in vs_alloc. 266 */ 267 for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { 268 metaslab_t *ms = vd->vdev_ms[i]; 269 if (ms->ms_sm == NULL) 270 continue; 271 272 /* 273 * Sync tasks happen before metaslab_sync(), therefore 274 * smp_alloc and sm_alloc must be the same. 275 */ 276 ASSERT3U(space_map_allocated(ms->ms_sm), ==, 277 ms->ms_sm->sm_phys->smp_alloc); 278 279 spa->spa_removing_phys.sr_to_copy += 280 space_map_allocated(ms->ms_sm); 281 282 /* 283 * Space which we are freeing this txg does not need to 284 * be copied. 285 */ 286 spa->spa_removing_phys.sr_to_copy -= 287 range_tree_space(ms->ms_freeing); 288 289 ASSERT0(range_tree_space(ms->ms_freed)); 290 for (int t = 0; t < TXG_SIZE; t++) 291 ASSERT0(range_tree_space(ms->ms_allocating[t])); 292 } 293 294 /* 295 * Sync tasks are called before metaslab_sync(), so there should 296 * be no already-synced metaslabs in the TXG_CLEAN list. 297 */ 298 ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); 299 300 spa_sync_removing_state(spa, tx); 301 302 /* 303 * All blocks that we need to read the most recent mapping must be 304 * stored on concrete vdevs. Therefore, we must dirty anything that 305 * is read before spa_remove_init(). Specifically, the 306 * spa_config_object. 
(Note that although we already modified the 307 * spa_config_object in spa_sync_removing_state, that may not have 308 * modified all blocks of the object.) 309 */ 310 dmu_object_info_t doi; 311 VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); 312 for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { 313 dmu_buf_t *dbuf; 314 VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, 315 offset, FTAG, &dbuf, 0)); 316 dmu_buf_will_dirty(dbuf, tx); 317 offset += dbuf->db_size; 318 dmu_buf_rele(dbuf, FTAG); 319 } 320 321 /* 322 * Now that we've allocated the im_object, dirty the vdev to ensure 323 * that the object gets written to the config on disk. 324 */ 325 vdev_config_dirty(vd); 326 327 zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu " 328 "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), 329 vic->vic_mapping_object); 330 331 spa_history_log_internal(spa, "vdev remove started", tx, 332 "%s vdev %llu %s", spa_name(spa), vd->vdev_id, 333 (vd->vdev_path != NULL) ? vd->vdev_path : "-"); 334 /* 335 * Setting spa_vdev_removal causes subsequent frees to call 336 * free_from_removing_vdev(). Note that we don't need any locking 337 * because we are the sync thread, and metaslab_free_impl() is only 338 * called from syncing context (potentially from a zio taskq thread, 339 * but in any case only when there are outstanding free i/os, which 340 * there are not). 341 */ 342 ASSERT3P(spa->spa_vdev_removal, ==, NULL); 343 spa->spa_vdev_removal = svr; 344 svr->svr_thread = thread_create(NULL, 0, 345 spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri); 346} 347 348/* 349 * When we are opening a pool, we must read the mapping for each 350 * indirect vdev in order from most recently removed to least 351 * recently removed. We do this because the blocks for the mapping 352 * of older indirect vdevs may be stored on more recently removed vdevs. 353 * In order to read each indirect mapping object, we must have 354 * initialized all more recently removed vdevs. 355 */ 356int 357spa_remove_init(spa_t *spa) 358{ 359 int error; 360 361 error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, 362 DMU_POOL_DIRECTORY_OBJECT, 363 DMU_POOL_REMOVING, sizeof (uint64_t), 364 sizeof (spa->spa_removing_phys) / sizeof (uint64_t), 365 &spa->spa_removing_phys); 366 367 if (error == ENOENT) { 368 spa->spa_removing_phys.sr_state = DSS_NONE; 369 spa->spa_removing_phys.sr_removing_vdev = -1; 370 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 371 spa->spa_indirect_vdevs_loaded = B_TRUE; 372 return (0); 373 } else if (error != 0) { 374 return (error); 375 } 376 377 if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { 378 /* 379 * We are currently removing a vdev. Create and 380 * initialize a spa_vdev_removal_t from the bonus 381 * buffer of the removing vdevs vdev_im_object, and 382 * initialize its partial mapping. 
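 * Note that the copy thread itself is not started here;
 * spa_restart_removal() does that later (e.g. from spa_async_resume()),
 * and only if the pool is writeable.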
383 */ 384 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 385 vdev_t *vd = vdev_lookup_top(spa, 386 spa->spa_removing_phys.sr_removing_vdev); 387 spa_config_exit(spa, SCL_STATE, FTAG); 388 389 if (vd == NULL) 390 return (EINVAL); 391 392 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 393 394 ASSERT(vdev_is_concrete(vd)); 395 spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); 396 ASSERT(svr->svr_vdev->vdev_removing); 397 398 vd->vdev_indirect_mapping = vdev_indirect_mapping_open( 399 spa->spa_meta_objset, vic->vic_mapping_object); 400 vd->vdev_indirect_births = vdev_indirect_births_open( 401 spa->spa_meta_objset, vic->vic_births_object); 402 403 spa->spa_vdev_removal = svr; 404 } 405 406 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 407 uint64_t indirect_vdev_id = 408 spa->spa_removing_phys.sr_prev_indirect_vdev; 409 while (indirect_vdev_id != UINT64_MAX) { 410 vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); 411 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 412 413 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 414 vd->vdev_indirect_mapping = vdev_indirect_mapping_open( 415 spa->spa_meta_objset, vic->vic_mapping_object); 416 vd->vdev_indirect_births = vdev_indirect_births_open( 417 spa->spa_meta_objset, vic->vic_births_object); 418 419 indirect_vdev_id = vic->vic_prev_indirect_vdev; 420 } 421 spa_config_exit(spa, SCL_STATE, FTAG); 422 423 /* 424 * Now that we've loaded all the indirect mappings, we can allow 425 * reads from other blocks (e.g. via predictive prefetch). 426 */ 427 spa->spa_indirect_vdevs_loaded = B_TRUE; 428 return (0); 429} 430 431void 432spa_restart_removal(spa_t *spa) 433{ 434 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 435 436 if (svr == NULL) 437 return; 438 439 /* 440 * In general when this function is called there is no 441 * removal thread running. The only scenario where this 442 * is not true is during spa_import() where this function 443 * is called twice [once from spa_import_impl() and 444 * spa_async_resume()]. Thus, in the scenario where we 445 * import a pool that has an ongoing removal we don't 446 * want to spawn a second thread. 447 */ 448 if (svr->svr_thread != NULL) 449 return; 450 451 if (!spa_writeable(spa)) 452 return; 453 454 vdev_t *vd = svr->svr_vdev; 455 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 456 457 ASSERT3P(vd, !=, NULL); 458 ASSERT(vd->vdev_removing); 459 460 zfs_dbgmsg("restarting removal of %llu at count=%llu", 461 vd->vdev_id, vdev_indirect_mapping_num_entries(vim)); 462 svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd, 463 0, &p0, TS_RUN, minclsyspri); 464} 465 466/* 467 * Process freeing from a device which is in the middle of being removed. 468 * We must handle this carefully so that we attempt to copy freed data, 469 * and we correctly free already-copied data. 470 */ 471void 472free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) 473{ 474 spa_t *spa = vd->vdev_spa; 475 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 476 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 477 uint64_t txg = spa_syncing_txg(spa); 478 uint64_t max_offset_yet = 0; 479 480 ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); 481 ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, 482 vdev_indirect_mapping_object(vim)); 483 ASSERT3P(vd, ==, svr->svr_vdev); 484 485 mutex_enter(&svr->svr_lock); 486 487 /* 488 * Remove the segment from the removing vdev's spacemap. 
This 489 * ensures that we will not attempt to copy this space (if the 490 * removal thread has not yet visited it), and also ensures 491 * that we know what is actually allocated on the new vdevs 492 * (needed if we cancel the removal). 493 * 494 * Note: we must do the metaslab_free_concrete() with the svr_lock 495 * held, so that the remove_thread can not load this metaslab and then 496 * visit this offset between the time that we metaslab_free_concrete() 497 * and when we check to see if it has been visited. 498 * 499 * Note: The checkpoint flag is set to false as having/taking 500 * a checkpoint and removing a device can't happen at the same 501 * time. 502 */ 503 ASSERT(!spa_has_checkpoint(spa)); 504 metaslab_free_concrete(vd, offset, size, B_FALSE); 505 506 uint64_t synced_size = 0; 507 uint64_t synced_offset = 0; 508 uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); 509 if (offset < max_offset_synced) { 510 /* 511 * The mapping for this offset is already on disk. 512 * Free from the new location. 513 * 514 * Note that we use svr_max_synced_offset because it is 515 * updated atomically with respect to the in-core mapping. 516 * By contrast, vim_max_offset is not. 517 * 518 * This block may be split between a synced entry and an 519 * in-flight or unvisited entry. Only process the synced 520 * portion of it here. 521 */ 522 synced_size = MIN(size, max_offset_synced - offset); 523 synced_offset = offset; 524 525 ASSERT3U(max_offset_yet, <=, max_offset_synced); 526 max_offset_yet = max_offset_synced; 527 528 DTRACE_PROBE3(remove__free__synced, 529 spa_t *, spa, 530 uint64_t, offset, 531 uint64_t, synced_size); 532 533 size -= synced_size; 534 offset += synced_size; 535 } 536 537 /* 538 * Look at all in-flight txgs starting from the currently syncing one 539 * and see if a section of this free is being copied. By starting from 540 * this txg and iterating forward, we might find that this region 541 * was copied in two different txgs and handle it appropriately. 542 */ 543 for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { 544 int txgoff = (txg + i) & TXG_MASK; 545 if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { 546 /* 547 * The mapping for this offset is in flight, and 548 * will be synced in txg+i. 549 */ 550 uint64_t inflight_size = MIN(size, 551 svr->svr_max_offset_to_sync[txgoff] - offset); 552 553 DTRACE_PROBE4(remove__free__inflight, 554 spa_t *, spa, 555 uint64_t, offset, 556 uint64_t, inflight_size, 557 uint64_t, txg + i); 558 559 /* 560 * We copy data in order of increasing offset. 561 * Therefore the max_offset_to_sync[] must increase 562 * (or be zero, indicating that nothing is being 563 * copied in that txg). 564 */ 565 if (svr->svr_max_offset_to_sync[txgoff] != 0) { 566 ASSERT3U(svr->svr_max_offset_to_sync[txgoff], 567 >=, max_offset_yet); 568 max_offset_yet = 569 svr->svr_max_offset_to_sync[txgoff]; 570 } 571 572 /* 573 * We've already committed to copying this segment: 574 * we have allocated space elsewhere in the pool for 575 * it and have an IO outstanding to copy the data. We 576 * cannot free the space before the copy has 577 * completed, or else the copy IO might overwrite any 578 * new data. To free that space, we record the 579 * segment in the appropriate svr_frees tree and free 580 * the mapped space later, in the txg where we have 581 * completed the copy and synced the mapping (see 582 * vdev_mapping_sync). 
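 * Taken together with the already-synced case above and the
 * not-yet-visited case below, a single free may therefore be split
 * across an already-synced portion, one or more in-flight portions,
 * and a not-yet-visited remainder.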
583 */ 584 range_tree_add(svr->svr_frees[txgoff], 585 offset, inflight_size); 586 size -= inflight_size; 587 offset += inflight_size; 588 589 /* 590 * This space is already accounted for as being 591 * done, because it is being copied in txg+i. 592 * However, if i!=0, then it is being copied in 593 * a future txg. If we crash after this txg 594 * syncs but before txg+i syncs, then the space 595 * will be free. Therefore we must account 596 * for the space being done in *this* txg 597 * (when it is freed) rather than the future txg 598 * (when it will be copied). 599 */ 600 ASSERT3U(svr->svr_bytes_done[txgoff], >=, 601 inflight_size); 602 svr->svr_bytes_done[txgoff] -= inflight_size; 603 svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; 604 } 605 } 606 ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); 607 608 if (size > 0) { 609 /* 610 * The copy thread has not yet visited this offset. Ensure 611 * that it doesn't. 612 */ 613 614 DTRACE_PROBE3(remove__free__unvisited, 615 spa_t *, spa, 616 uint64_t, offset, 617 uint64_t, size); 618 619 if (svr->svr_allocd_segs != NULL) 620 range_tree_clear(svr->svr_allocd_segs, offset, size); 621 622 /* 623 * Since we now do not need to copy this data, for 624 * accounting purposes we have done our job and can count 625 * it as completed. 626 */ 627 svr->svr_bytes_done[txg & TXG_MASK] += size; 628 } 629 mutex_exit(&svr->svr_lock); 630 631 /* 632 * Now that we have dropped svr_lock, process the synced portion 633 * of this free. 634 */ 635 if (synced_size > 0) { 636 vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); 637 638 /* 639 * Note: this can only be called from syncing context, 640 * and the vdev_indirect_mapping is only changed from the 641 * sync thread, so we don't need svr_lock while doing 642 * metaslab_free_impl_cb. 643 */ 644 boolean_t checkpoint = B_FALSE; 645 vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, 646 metaslab_free_impl_cb, &checkpoint); 647 } 648} 649 650/* 651 * Stop an active removal and update the spa_removing phys. 652 */ 653static void 654spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) 655{ 656 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 657 ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); 658 659 /* Ensure the removal thread has completed before we free the svr. 
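 * (spa_vdev_remove_suspend() below blocks on svr_cv until svr_thread
 * has exited.)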
*/ 660 spa_vdev_remove_suspend(spa); 661 662 ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); 663 664 if (state == DSS_FINISHED) { 665 spa_removing_phys_t *srp = &spa->spa_removing_phys; 666 vdev_t *vd = svr->svr_vdev; 667 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 668 669 if (srp->sr_prev_indirect_vdev != UINT64_MAX) { 670 vdev_t *pvd = vdev_lookup_top(spa, 671 srp->sr_prev_indirect_vdev); 672 ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); 673 } 674 675 vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; 676 srp->sr_prev_indirect_vdev = vd->vdev_id; 677 } 678 spa->spa_removing_phys.sr_state = state; 679 spa->spa_removing_phys.sr_end_time = gethrestime_sec(); 680 681 spa->spa_vdev_removal = NULL; 682 spa_vdev_removal_destroy(svr); 683 684 spa_sync_removing_state(spa, tx); 685 686 vdev_config_dirty(spa->spa_root_vdev); 687} 688 689static void 690free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) 691{ 692 vdev_t *vd = arg; 693 vdev_indirect_mark_obsolete(vd, offset, size); 694 boolean_t checkpoint = B_FALSE; 695 vdev_indirect_ops.vdev_op_remap(vd, offset, size, 696 metaslab_free_impl_cb, &checkpoint); 697} 698 699/* 700 * On behalf of the removal thread, syncs an incremental bit more of 701 * the indirect mapping to disk and updates the in-memory mapping. 702 * Called as a sync task in every txg that the removal thread makes progress. 703 */ 704static void 705vdev_mapping_sync(void *arg, dmu_tx_t *tx) 706{ 707 spa_vdev_removal_t *svr = arg; 708 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 709 vdev_t *vd = svr->svr_vdev; 710 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 711 uint64_t txg = dmu_tx_get_txg(tx); 712 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 713 714 ASSERT(vic->vic_mapping_object != 0); 715 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 716 717 vdev_indirect_mapping_add_entries(vim, 718 &svr->svr_new_segments[txg & TXG_MASK], tx); 719 vdev_indirect_births_add_entry(vd->vdev_indirect_births, 720 vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); 721 722 /* 723 * Free the copied data for anything that was freed while the 724 * mapping entries were in flight. 
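 * These are the segments that free_from_removing_vdev() added to
 * svr_frees[] for this txg because their copies were still in flight
 * when the free arrived.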
725 */ 726 mutex_enter(&svr->svr_lock); 727 range_tree_vacate(svr->svr_frees[txg & TXG_MASK], 728 free_mapped_segment_cb, vd); 729 ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, 730 vdev_indirect_mapping_max_offset(vim)); 731 svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; 732 mutex_exit(&svr->svr_lock); 733 734 spa_sync_removing_state(spa, tx); 735} 736 737static void 738spa_vdev_copy_segment_write_done(zio_t *zio) 739{ 740 vdev_copy_seg_arg_t *vcsa = zio->io_private; 741 vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg; 742 spa_config_exit(zio->io_spa, SCL_STATE, FTAG); 743 abd_free(zio->io_abd); 744 745 mutex_enter(&vca->vca_lock); 746 vca->vca_outstanding_bytes -= zio->io_size; 747 cv_signal(&vca->vca_cv); 748 mutex_exit(&vca->vca_lock); 749 750 ASSERT0(zio->io_error); 751 kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t)); 752 kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t)); 753} 754 755static void 756spa_vdev_copy_segment_read_done(zio_t *zio) 757{ 758 vdev_copy_seg_arg_t *vcsa = zio->io_private; 759 dva_t *dest_dva = vcsa->vcsa_dest_dva; 760 uint64_t txg = vcsa->vcsa_txg; 761 spa_t *spa = zio->io_spa; 762 vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva)); 763 blkptr_t *bp = NULL; 764 dva_t *dva = NULL; 765 uint64_t size = zio->io_size; 766 767 ASSERT3P(dest_vd, !=, NULL); 768 ASSERT0(zio->io_error); 769 770 vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); 771 bp = vcsa->vcsa_dest_bp; 772 dva = bp->blk_dva; 773 774 BP_ZERO(bp); 775 776 /* initialize with dest_dva */ 777 bcopy(dest_dva, dva, sizeof (dva_t)); 778 BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); 779 780 BP_SET_LSIZE(bp, size); 781 BP_SET_PSIZE(bp, size); 782 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 783 BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); 784 BP_SET_TYPE(bp, DMU_OT_NONE); 785 BP_SET_LEVEL(bp, 0); 786 BP_SET_DEDUP(bp, 0); 787 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 788 789 zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa, 790 txg, bp, zio->io_abd, size, 791 spa_vdev_copy_segment_write_done, vcsa, 792 ZIO_PRIORITY_REMOVAL, 0, NULL)); 793} 794 795static int 796spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, 797 vdev_copy_arg_t *vca, zio_alloc_list_t *zal) 798{ 799 metaslab_group_t *mg = vd->vdev_mg; 800 spa_t *spa = vd->vdev_spa; 801 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 802 vdev_indirect_mapping_entry_t *entry; 803 vdev_copy_seg_arg_t *private; 804 dva_t dst = { 0 }; 805 blkptr_t blk, *bp = &blk; 806 dva_t *dva = bp->blk_dva; 807 808 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 809 810 /* 811 * We use allocator 0 for this I/O because we don't expect device remap 812 * to be the steady state of the system, so parallelizing is not as 813 * critical as it is for other allocation types. We also want to ensure 814 * that the IOs are allocated together as much as possible, to reduce 815 * mapping sizes. 816 */ 817 int error = metaslab_alloc_dva(spa, mg->mg_class, size, 818 &dst, 0, NULL, txg, 0, zal, 0); 819 if (error != 0) 820 return (error); 821 822 /* 823 * We can't have any padding of the allocated size, otherwise we will 824 * misunderstand what's allocated, and the size of the mapping. 825 * The caller ensures this will be true by passing in a size that is 826 * aligned to the worst (highest) ashift in the pool. 
827 */ 828 ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); 829 830 mutex_enter(&vca->vca_lock); 831 vca->vca_outstanding_bytes += size; 832 mutex_exit(&vca->vca_lock); 833 834 entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); 835 DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); 836 entry->vime_mapping.vimep_dst = dst; 837 838 private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP); 839 private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; 840 private->vcsa_txg = txg; 841 private->vcsa_copy_arg = vca; 842 843 /* 844 * This lock is eventually released by the donefunc for the 845 * zio_write_phys that finishes copying the data. 846 */ 847 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 848 849 /* 850 * Do logical I/O, letting the redundancy vdevs (like mirror) 851 * handle their own I/O instead of duplicating that code here. 852 */ 853 BP_ZERO(bp); 854 855 DVA_SET_VDEV(&dva[0], vd->vdev_id); 856 DVA_SET_OFFSET(&dva[0], start); 857 DVA_SET_GANG(&dva[0], 0); 858 DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size)); 859 860 BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); 861 862 BP_SET_LSIZE(bp, size); 863 BP_SET_PSIZE(bp, size); 864 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 865 BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); 866 BP_SET_TYPE(bp, DMU_OT_NONE); 867 BP_SET_LEVEL(bp, 0); 868 BP_SET_DEDUP(bp, 0); 869 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 870 871 zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, 872 bp, abd_alloc_for_io(size, B_FALSE), size, 873 spa_vdev_copy_segment_read_done, private, 874 ZIO_PRIORITY_REMOVAL, 0, NULL)); 875 876 list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); 877 ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); 878 vdev_dirty(vd, 0, NULL, txg); 879 880 return (0); 881} 882 883/* 884 * Complete the removal of a toplevel vdev. This is called as a 885 * synctask in the same txg that we will sync out the new config (to the 886 * MOS object) which indicates that this vdev is indirect. 
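 * It is dispatched from vdev_remove_replace_with_indirect(), after the
 * removing vdev has already been replaced by the indirect vdev in the
 * vdev tree (svr_vdev now points at the indirect vdev).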
887 */ 888static void 889vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) 890{ 891 spa_vdev_removal_t *svr = arg; 892 vdev_t *vd = svr->svr_vdev; 893 spa_t *spa = vd->vdev_spa; 894 895 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 896 897 for (int i = 0; i < TXG_SIZE; i++) { 898 ASSERT0(svr->svr_bytes_done[i]); 899 } 900 901 ASSERT3U(spa->spa_removing_phys.sr_copied, ==, 902 spa->spa_removing_phys.sr_to_copy); 903 904 vdev_destroy_spacemaps(vd, tx); 905 906 /* destroy leaf zaps, if any */ 907 ASSERT3P(svr->svr_zaplist, !=, NULL); 908 for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); 909 pair != NULL; 910 pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { 911 vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); 912 } 913 fnvlist_free(svr->svr_zaplist); 914 915 spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); 916 /* vd->vdev_path is not available here */ 917 spa_history_log_internal(spa, "vdev remove completed", tx, 918 "%s vdev %llu", spa_name(spa), vd->vdev_id); 919} 920 921static void 922vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd) 923{ 924 ivd->vdev_indirect_config = vd->vdev_indirect_config; 925 926 ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL); 927 ASSERT(vd->vdev_indirect_mapping != NULL); 928 ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping; 929 vd->vdev_indirect_mapping = NULL; 930 931 ASSERT3P(ivd->vdev_indirect_births, ==, NULL); 932 ASSERT(vd->vdev_indirect_births != NULL); 933 ivd->vdev_indirect_births = vd->vdev_indirect_births; 934 vd->vdev_indirect_births = NULL; 935 936 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 937 ASSERT0(range_tree_space(ivd->vdev_obsolete_segments)); 938 939 if (vd->vdev_obsolete_sm != NULL) { 940 ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize); 941 942 /* 943 * We cannot use space_map_{open,close} because we hold all 944 * the config locks as writer. 945 */ 946 ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL); 947 ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm; 948 vd->vdev_obsolete_sm = NULL; 949 } 950} 951 952static void 953vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) 954{ 955 ASSERT3P(zlist, !=, NULL); 956 ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); 957 958 if (vd->vdev_leaf_zap != 0) { 959 char zkey[32]; 960 (void) snprintf(zkey, sizeof (zkey), "%s-%ju", 961 VDEV_REMOVAL_ZAP_OBJS, (uintmax_t)vd->vdev_leaf_zap); 962 fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); 963 } 964 965 for (uint64_t id = 0; id < vd->vdev_children; id++) { 966 vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); 967 } 968} 969 970static void 971vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) 972{ 973 vdev_t *ivd; 974 dmu_tx_t *tx; 975 spa_t *spa = vd->vdev_spa; 976 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 977 978 /* 979 * First, build a list of leaf zaps to be destroyed. 980 * This is passed to the sync context thread, 981 * which does the actual unlinking. 
982 */ 983 svr->svr_zaplist = fnvlist_alloc(); 984 vdev_remove_enlist_zaps(vd, svr->svr_zaplist); 985 986 ivd = vdev_add_parent(vd, &vdev_indirect_ops); 987 988 vd->vdev_leaf_zap = 0; 989 990 vdev_remove_child(ivd, vd); 991 vdev_compact_children(ivd); 992 993 vdev_indirect_state_transfer(ivd, vd); 994 995 svr->svr_vdev = ivd; 996 997 ASSERT(!ivd->vdev_removing); 998 ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 999 1000 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1001 dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, 1002 0, ZFS_SPACE_CHECK_NONE, tx); 1003 dmu_tx_commit(tx); 1004 1005 /* 1006 * Indicate that this thread has exited. 1007 * After this, we can not use svr. 1008 */ 1009 mutex_enter(&svr->svr_lock); 1010 svr->svr_thread = NULL; 1011 cv_broadcast(&svr->svr_cv); 1012 mutex_exit(&svr->svr_lock); 1013} 1014 1015/* 1016 * Complete the removal of a toplevel vdev. This is called in open 1017 * context by the removal thread after we have copied all vdev's data. 1018 */ 1019static void 1020vdev_remove_complete(vdev_t *vd) 1021{ 1022 spa_t *spa = vd->vdev_spa; 1023 uint64_t txg; 1024 1025 /* 1026 * Wait for any deferred frees to be synced before we call 1027 * vdev_metaslab_fini() 1028 */ 1029 txg_wait_synced(spa->spa_dsl_pool, 0); 1030 1031 txg = spa_vdev_enter(spa); 1032 zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", 1033 vd->vdev_id, txg); 1034 1035 /* 1036 * Discard allocation state. 1037 */ 1038 if (vd->vdev_mg != NULL) { 1039 vdev_metaslab_fini(vd); 1040 metaslab_group_destroy(vd->vdev_mg); 1041 vd->vdev_mg = NULL; 1042 } 1043 ASSERT0(vd->vdev_stat.vs_space); 1044 ASSERT0(vd->vdev_stat.vs_dspace); 1045 1046 vdev_remove_replace_with_indirect(vd, txg); 1047 1048 /* 1049 * We now release the locks, allowing spa_sync to run and finish the 1050 * removal via vdev_remove_complete_sync in syncing context. 1051 */ 1052 (void) spa_vdev_exit(spa, NULL, txg, 0); 1053 1054 /* 1055 * Top ZAP should have been transferred to the indirect vdev in 1056 * vdev_remove_replace_with_indirect. 1057 */ 1058 ASSERT0(vd->vdev_top_zap); 1059 1060 /* 1061 * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect. 1062 */ 1063 ASSERT0(vd->vdev_leaf_zap); 1064 1065 txg = spa_vdev_enter(spa); 1066 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1067 /* 1068 * Request to update the config and the config cachefile. 1069 */ 1070 vdev_config_dirty(spa->spa_root_vdev); 1071 (void) spa_vdev_exit(spa, vd, txg, 0); 1072} 1073 1074/* 1075 * Evacuates a segment of size at most max_alloc from the vdev 1076 * via repeated calls to spa_vdev_copy_segment. If an allocation 1077 * fails, the pool is probably too fragmented to handle such a 1078 * large size, so decrease max_alloc so that the caller will not try 1079 * this size again this txg. 
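 * For example, with a 4K ashift, a failed 1M allocation is retried at
 * 512K (half the size, rounded up to the ashift), and *max_alloc drops
 * to 1M - 4K so that no later segment in this txg begins with a full
 * 1M attempt.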
1080 */ 1081static void 1082spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, 1083 uint64_t *max_alloc, dmu_tx_t *tx) 1084{ 1085 uint64_t txg = dmu_tx_get_txg(tx); 1086 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 1087 1088 mutex_enter(&svr->svr_lock); 1089 1090 range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); 1091 if (rs == NULL) { 1092 mutex_exit(&svr->svr_lock); 1093 return; 1094 } 1095 uint64_t offset = rs->rs_start; 1096 uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc); 1097 1098 range_tree_remove(svr->svr_allocd_segs, offset, length); 1099 1100 if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { 1101 dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, 1102 svr, 0, ZFS_SPACE_CHECK_NONE, tx); 1103 } 1104 1105 svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length; 1106 1107 /* 1108 * Note: this is the amount of *allocated* space 1109 * that we are taking care of each txg. 1110 */ 1111 svr->svr_bytes_done[txg & TXG_MASK] += length; 1112 1113 mutex_exit(&svr->svr_lock); 1114 1115 zio_alloc_list_t zal; 1116 metaslab_trace_init(&zal); 1117 uint64_t thismax = *max_alloc; 1118 while (length > 0) { 1119 uint64_t mylen = MIN(length, thismax); 1120 1121 int error = spa_vdev_copy_segment(svr->svr_vdev, 1122 offset, mylen, txg, vca, &zal); 1123 1124 if (error == ENOSPC) { 1125 /* 1126 * Cut our segment in half, and don't try this 1127 * segment size again this txg. Note that the 1128 * allocation size must be aligned to the highest 1129 * ashift in the pool, so that the allocation will 1130 * not be padded out to a multiple of the ashift, 1131 * which could cause us to think that this mapping 1132 * is larger than we intended. 1133 */ 1134 ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); 1135 ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); 1136 thismax = P2ROUNDUP(mylen / 2, 1137 1 << spa->spa_max_ashift); 1138 ASSERT3U(thismax, <, mylen); 1139 /* 1140 * The minimum-size allocation can not fail. 1141 */ 1142 ASSERT3U(mylen, >, 1 << spa->spa_max_ashift); 1143 *max_alloc = mylen - (1 << spa->spa_max_ashift); 1144 } else { 1145 ASSERT0(error); 1146 length -= mylen; 1147 offset += mylen; 1148 1149 /* 1150 * We've performed an allocation, so reset the 1151 * alloc trace list. 1152 */ 1153 metaslab_trace_fini(&zal); 1154 metaslab_trace_init(&zal); 1155 } 1156 } 1157 metaslab_trace_fini(&zal); 1158} 1159 1160/* 1161 * The removal thread operates in open context. It iterates over all 1162 * allocated space in the vdev, by loading each metaslab's spacemap. 1163 * For each contiguous segment of allocated space (capping the segment 1164 * size at SPA_MAXBLOCKSIZE), we: 1165 * - Allocate space for it on another vdev. 1166 * - Create a new mapping from the old location to the new location 1167 * (as a record in svr_new_segments). 1168 * - Initiate a logical read zio to get the data off the removing disk. 1169 * - In the read zio's done callback, initiate a logical write zio to 1170 * write it to the new vdev. 1171 * Note that all of this will take effect when a particular TXG syncs. 1172 * The sync thread ensures that all the phys reads and writes for the syncing 1173 * TXG have completed (see spa_txg_zio) and writes the new mappings to disk 1174 * (see vdev_mapping_sync()). 
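 * If the pool is reopened with a removal in progress, the thread resumes
 * from vdev_indirect_mapping_max_offset() (start_offset below): metaslabs
 * entirely below that offset are skipped, and any already-copied range is
 * cleared from svr_allocd_segs.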
1175 */ 1176static void 1177spa_vdev_remove_thread(void *arg) 1178{ 1179 vdev_t *vd = arg; 1180 spa_t *spa = vd->vdev_spa; 1181 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 1182 vdev_copy_arg_t vca; 1183 uint64_t max_alloc = zfs_remove_max_segment; 1184 uint64_t last_txg = 0; 1185 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 1186 uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); 1187 1188 ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); 1189 ASSERT(vdev_is_concrete(vd)); 1190 ASSERT(vd->vdev_removing); 1191 ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); 1192 ASSERT3P(svr->svr_vdev, ==, vd); 1193 ASSERT(vim != NULL); 1194 1195 mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); 1196 cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); 1197 vca.vca_outstanding_bytes = 0; 1198 1199 mutex_enter(&svr->svr_lock); 1200 1201 /* 1202 * Start from vim_max_offset so we pick up where we left off 1203 * if we are restarting the removal after opening the pool. 1204 */ 1205 uint64_t msi; 1206 for (msi = start_offset >> vd->vdev_ms_shift; 1207 msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) { 1208 metaslab_t *msp = vd->vdev_ms[msi]; 1209 ASSERT3U(msi, <=, vd->vdev_ms_count); 1210 1211 ASSERT0(range_tree_space(svr->svr_allocd_segs)); 1212 1213 mutex_enter(&msp->ms_sync_lock); 1214 mutex_enter(&msp->ms_lock); 1215 1216 /* 1217 * Assert nothing in flight -- ms_*tree is empty. 1218 */ 1219 for (int i = 0; i < TXG_SIZE; i++) { 1220 ASSERT0(range_tree_space(msp->ms_allocating[i])); 1221 } 1222 1223 /* 1224 * If the metaslab has ever been allocated from (ms_sm!=NULL), 1225 * read the allocated segments from the space map object 1226 * into svr_allocd_segs. Since we do this while holding 1227 * svr_lock and ms_sync_lock, concurrent frees (which 1228 * would have modified the space map) will wait for us 1229 * to finish loading the spacemap, and then take the 1230 * appropriate action (see free_from_removing_vdev()). 1231 */ 1232 if (msp->ms_sm != NULL) { 1233 space_map_t *sm = NULL; 1234 1235 /* 1236 * We have to open a new space map here, because 1237 * ms_sm's sm_length and sm_alloc may not reflect 1238 * what's in the object contents, if we are in between 1239 * metaslab_sync() and metaslab_sync_done(). 1240 */ 1241 VERIFY0(space_map_open(&sm, 1242 spa->spa_dsl_pool->dp_meta_objset, 1243 msp->ms_sm->sm_object, msp->ms_sm->sm_start, 1244 msp->ms_sm->sm_size, msp->ms_sm->sm_shift)); 1245 space_map_update(sm); 1246 VERIFY0(space_map_load(sm, svr->svr_allocd_segs, 1247 SM_ALLOC)); 1248 space_map_close(sm); 1249 1250 range_tree_walk(msp->ms_freeing, 1251 range_tree_remove, svr->svr_allocd_segs); 1252 1253 /* 1254 * When we are resuming from a paused removal (i.e. 1255 * when importing a pool with a removal in progress), 1256 * discard any state that we have already processed. 1257 */ 1258 range_tree_clear(svr->svr_allocd_segs, 0, start_offset); 1259 } 1260 mutex_exit(&msp->ms_lock); 1261 mutex_exit(&msp->ms_sync_lock); 1262 1263 vca.vca_msp = msp; 1264 zfs_dbgmsg("copying %llu segments for metaslab %llu", 1265 avl_numnodes(&svr->svr_allocd_segs->rt_root), 1266 msp->ms_id); 1267 1268 while (!svr->svr_thread_exit && 1269 !range_tree_is_empty(svr->svr_allocd_segs)) { 1270 1271 mutex_exit(&svr->svr_lock); 1272 1273 /* 1274 * This delay will pause the removal around the point 1275 * specified by zfs_remove_max_bytes_pause. We do this 1276 * solely from the test suite or during debugging. 
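 * (For example, a test can set zfs_remove_max_bytes_pause to a small
 * value -- e.g. with mdb -kw on illumos, or the platform's equivalent
 * tunable mechanism -- to freeze the copy once that many bytes have been
 * processed, and raise it back toward UINT64_MAX to let the removal
 * continue.)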
1277 */ 1278 uint64_t bytes_copied = 1279 spa->spa_removing_phys.sr_copied; 1280 for (int i = 0; i < TXG_SIZE; i++) 1281 bytes_copied += svr->svr_bytes_done[i]; 1282 while (zfs_remove_max_bytes_pause <= bytes_copied && 1283 !svr->svr_thread_exit) 1284 delay(hz); 1285 1286 mutex_enter(&vca.vca_lock); 1287 while (vca.vca_outstanding_bytes > 1288 zfs_remove_max_copy_bytes) { 1289 cv_wait(&vca.vca_cv, &vca.vca_lock); 1290 } 1291 mutex_exit(&vca.vca_lock); 1292 1293 dmu_tx_t *tx = 1294 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1295 1296 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 1297 uint64_t txg = dmu_tx_get_txg(tx); 1298 1299 if (txg != last_txg) 1300 max_alloc = zfs_remove_max_segment; 1301 last_txg = txg; 1302 1303 spa_vdev_copy_impl(svr, &vca, &max_alloc, tx); 1304 1305 dmu_tx_commit(tx); 1306 mutex_enter(&svr->svr_lock); 1307 } 1308 } 1309 1310 mutex_exit(&svr->svr_lock); 1311 /* 1312 * Wait for all copies to finish before cleaning up the vca. 1313 */ 1314 txg_wait_synced(spa->spa_dsl_pool, 0); 1315 ASSERT0(vca.vca_outstanding_bytes); 1316 1317 mutex_destroy(&vca.vca_lock); 1318 cv_destroy(&vca.vca_cv); 1319 1320 if (svr->svr_thread_exit) { 1321 mutex_enter(&svr->svr_lock); 1322 range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); 1323 svr->svr_thread = NULL; 1324 cv_broadcast(&svr->svr_cv); 1325 mutex_exit(&svr->svr_lock); 1326 } else { 1327 ASSERT0(range_tree_space(svr->svr_allocd_segs)); 1328 vdev_remove_complete(vd); 1329 } 1330 thread_exit(); 1331} 1332 1333void 1334spa_vdev_remove_suspend(spa_t *spa) 1335{ 1336 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 1337 1338 if (svr == NULL) 1339 return; 1340 1341 mutex_enter(&svr->svr_lock); 1342 svr->svr_thread_exit = B_TRUE; 1343 while (svr->svr_thread != NULL) 1344 cv_wait(&svr->svr_cv, &svr->svr_lock); 1345 svr->svr_thread_exit = B_FALSE; 1346 mutex_exit(&svr->svr_lock); 1347} 1348 1349/* ARGSUSED */ 1350static int 1351spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) 1352{ 1353 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 1354 1355 if (spa->spa_vdev_removal == NULL) 1356 return (ESRCH); 1357 return (0); 1358} 1359 1360/* 1361 * Cancel a removal by freeing all entries from the partial mapping 1362 * and marking the vdev as no longer being removing. 
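 * This is the sync half of spa_vdev_remove_cancel(): the copy thread is
 * suspended first, this synctask then unwinds the partial state, and on
 * success the vdev's metaslab group is reactivated so the device can be
 * allocated from again.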
1363 */ 1364/* ARGSUSED */ 1365static void 1366spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) 1367{ 1368 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 1369 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 1370 vdev_t *vd = svr->svr_vdev; 1371 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 1372 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 1373 objset_t *mos = spa->spa_meta_objset; 1374 1375 ASSERT3P(svr->svr_thread, ==, NULL); 1376 1377 spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); 1378 if (vdev_obsolete_counts_are_precise(vd)) { 1379 spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 1380 VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, 1381 VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); 1382 } 1383 1384 if (vdev_obsolete_sm_object(vd) != 0) { 1385 ASSERT(vd->vdev_obsolete_sm != NULL); 1386 ASSERT3U(vdev_obsolete_sm_object(vd), ==, 1387 space_map_object(vd->vdev_obsolete_sm)); 1388 1389 space_map_free(vd->vdev_obsolete_sm, tx); 1390 VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, 1391 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); 1392 space_map_close(vd->vdev_obsolete_sm); 1393 vd->vdev_obsolete_sm = NULL; 1394 spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 1395 } 1396 for (int i = 0; i < TXG_SIZE; i++) { 1397 ASSERT(list_is_empty(&svr->svr_new_segments[i])); 1398 ASSERT3U(svr->svr_max_offset_to_sync[i], <=, 1399 vdev_indirect_mapping_max_offset(vim)); 1400 } 1401 1402 for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { 1403 metaslab_t *msp = vd->vdev_ms[msi]; 1404 1405 if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) 1406 break; 1407 1408 ASSERT0(range_tree_space(svr->svr_allocd_segs)); 1409 1410 mutex_enter(&msp->ms_lock); 1411 1412 /* 1413 * Assert nothing in flight -- ms_*tree is empty. 1414 */ 1415 for (int i = 0; i < TXG_SIZE; i++) 1416 ASSERT0(range_tree_space(msp->ms_allocating[i])); 1417 for (int i = 0; i < TXG_DEFER_SIZE; i++) 1418 ASSERT0(range_tree_space(msp->ms_defer[i])); 1419 ASSERT0(range_tree_space(msp->ms_freed)); 1420 1421 if (msp->ms_sm != NULL) { 1422 /* 1423 * Assert that the in-core spacemap has the same 1424 * length as the on-disk one, so we can use the 1425 * existing in-core spacemap to load it from disk. 1426 */ 1427 ASSERT3U(msp->ms_sm->sm_alloc, ==, 1428 msp->ms_sm->sm_phys->smp_alloc); 1429 ASSERT3U(msp->ms_sm->sm_length, ==, 1430 msp->ms_sm->sm_phys->smp_objsize); 1431 1432 mutex_enter(&svr->svr_lock); 1433 VERIFY0(space_map_load(msp->ms_sm, 1434 svr->svr_allocd_segs, SM_ALLOC)); 1435 range_tree_walk(msp->ms_freeing, 1436 range_tree_remove, svr->svr_allocd_segs); 1437 1438 /* 1439 * Clear everything past what has been synced, 1440 * because we have not allocated mappings for it yet. 1441 */ 1442 uint64_t syncd = vdev_indirect_mapping_max_offset(vim); 1443 range_tree_clear(svr->svr_allocd_segs, syncd, 1444 msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd); 1445 1446 mutex_exit(&svr->svr_lock); 1447 } 1448 mutex_exit(&msp->ms_lock); 1449 1450 mutex_enter(&svr->svr_lock); 1451 range_tree_vacate(svr->svr_allocd_segs, 1452 free_mapped_segment_cb, vd); 1453 mutex_exit(&svr->svr_lock); 1454 } 1455 1456 /* 1457 * Note: this must happen after we invoke free_mapped_segment_cb, 1458 * because it adds to the obsolete_segments. 
1459 */ 1460 range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); 1461 1462 ASSERT3U(vic->vic_mapping_object, ==, 1463 vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); 1464 vdev_indirect_mapping_close(vd->vdev_indirect_mapping); 1465 vd->vdev_indirect_mapping = NULL; 1466 vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); 1467 vic->vic_mapping_object = 0; 1468 1469 ASSERT3U(vic->vic_births_object, ==, 1470 vdev_indirect_births_object(vd->vdev_indirect_births)); 1471 vdev_indirect_births_close(vd->vdev_indirect_births); 1472 vd->vdev_indirect_births = NULL; 1473 vdev_indirect_births_free(mos, vic->vic_births_object, tx); 1474 vic->vic_births_object = 0; 1475 1476 /* 1477 * We may have processed some frees from the removing vdev in this 1478 * txg, thus increasing svr_bytes_done; discard that here to 1479 * satisfy the assertions in spa_vdev_removal_destroy(). 1480 * Note that future txg's can not have any bytes_done, because 1481 * future TXG's are only modified from open context, and we have 1482 * already shut down the copying thread. 1483 */ 1484 svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; 1485 spa_finish_removal(spa, DSS_CANCELED, tx); 1486 1487 vd->vdev_removing = B_FALSE; 1488 vdev_config_dirty(vd); 1489 1490 zfs_dbgmsg("canceled device removal for vdev %llu in %llu", 1491 vd->vdev_id, dmu_tx_get_txg(tx)); 1492 spa_history_log_internal(spa, "vdev remove canceled", tx, 1493 "%s vdev %llu %s", spa_name(spa), 1494 vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); 1495} 1496 1497int 1498spa_vdev_remove_cancel(spa_t *spa) 1499{ 1500 spa_vdev_remove_suspend(spa); 1501 1502 if (spa->spa_vdev_removal == NULL) 1503 return (ESRCH); 1504 1505 uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id; 1506 1507 int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, 1508 spa_vdev_remove_cancel_sync, NULL, 0, 1509 ZFS_SPACE_CHECK_EXTRA_RESERVED); 1510 1511 if (error == 0) { 1512 spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); 1513 vdev_t *vd = vdev_lookup_top(spa, vdid); 1514 metaslab_group_activate(vd->vdev_mg); 1515 spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); 1516 } 1517 1518 return (error); 1519} 1520 1521/* 1522 * Called every sync pass of every txg if there's a svr. 1523 */ 1524void 1525svr_sync(spa_t *spa, dmu_tx_t *tx) 1526{ 1527 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 1528 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 1529 1530 /* 1531 * This check is necessary so that we do not dirty the 1532 * DIRECTORY_OBJECT via spa_sync_removing_state() when there 1533 * is nothing to do. Dirtying it every time would prevent us 1534 * from syncing-to-convergence. 1535 */ 1536 if (svr->svr_bytes_done[txgoff] == 0) 1537 return; 1538 1539 /* 1540 * Update progress accounting. 
1541 */ 1542 spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; 1543 svr->svr_bytes_done[txgoff] = 0; 1544 1545 spa_sync_removing_state(spa, tx); 1546} 1547 1548static void 1549vdev_remove_make_hole_and_free(vdev_t *vd) 1550{ 1551 uint64_t id = vd->vdev_id; 1552 spa_t *spa = vd->vdev_spa; 1553 vdev_t *rvd = spa->spa_root_vdev; 1554 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 1555 1556 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1557 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1558 1559 vdev_free(vd); 1560 1561 if (last_vdev) { 1562 vdev_compact_children(rvd); 1563 } else { 1564 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 1565 vdev_add_child(rvd, vd); 1566 } 1567 vdev_config_dirty(rvd); 1568 1569 /* 1570 * Reassess the health of our root vdev. 1571 */ 1572 vdev_reopen(rvd); 1573} 1574 1575/* 1576 * Remove a log device. The config lock is held for the specified TXG. 1577 */ 1578static int 1579spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) 1580{ 1581 metaslab_group_t *mg = vd->vdev_mg; 1582 spa_t *spa = vd->vdev_spa; 1583 int error = 0; 1584 1585 ASSERT(vd->vdev_islog); 1586 ASSERT(vd == vd->vdev_top); 1587 1588 /* 1589 * Stop allocating from this vdev. 1590 */ 1591 metaslab_group_passivate(mg); 1592 1593 /* 1594 * Wait for the youngest allocations and frees to sync, 1595 * and then wait for the deferral of those frees to finish. 1596 */ 1597 spa_vdev_config_exit(spa, NULL, 1598 *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 1599 1600 /* 1601 * Evacuate the device. We don't hold the config lock as writer 1602 * since we need to do I/O but we do keep the 1603 * spa_namespace_lock held. Once this completes the device 1604 * should no longer have any blocks allocated on it. 1605 */ 1606 if (vd->vdev_islog) { 1607 if (vd->vdev_stat.vs_alloc != 0) 1608 error = spa_reset_logs(spa); 1609 } 1610 1611 *txg = spa_vdev_config_enter(spa); 1612 1613 if (error != 0) { 1614 metaslab_group_activate(mg); 1615 return (error); 1616 } 1617 ASSERT0(vd->vdev_stat.vs_alloc); 1618 1619 /* 1620 * The evacuation succeeded. Remove any remaining MOS metadata 1621 * associated with this vdev, and wait for these changes to sync. 1622 */ 1623 vd->vdev_removing = B_TRUE; 1624 1625 vdev_dirty_leaves(vd, VDD_DTL, *txg); 1626 vdev_config_dirty(vd); 1627 1628 spa_history_log_internal(spa, "vdev remove", NULL, 1629 "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id, 1630 (vd->vdev_path != NULL) ? vd->vdev_path : "-"); 1631 1632 /* Make sure these changes are sync'ed */ 1633 spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); 1634 1635 *txg = spa_vdev_config_enter(spa); 1636 1637 sysevent_t *ev = spa_event_create(spa, vd, NULL, 1638 ESC_ZFS_VDEV_REMOVE_DEV); 1639 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1640 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1641 1642 /* The top ZAP should have been destroyed by vdev_remove_empty. */ 1643 ASSERT0(vd->vdev_top_zap); 1644 /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */ 1645 ASSERT0(vd->vdev_leaf_zap); 1646 1647 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1648 1649 if (list_link_active(&vd->vdev_state_dirty_node)) 1650 vdev_state_clean(vd); 1651 if (list_link_active(&vd->vdev_config_dirty_node)) 1652 vdev_config_clean(vd); 1653 1654 /* 1655 * Clean up the vdev namespace. 
1656 */ 1657 vdev_remove_make_hole_and_free(vd); 1658 1659 if (ev != NULL) 1660 spa_event_post(ev); 1661 1662 return (0); 1663} 1664 1665static int 1666spa_vdev_remove_top_check(vdev_t *vd) 1667{ 1668 spa_t *spa = vd->vdev_spa; 1669 1670 if (vd != vd->vdev_top) 1671 return (SET_ERROR(ENOTSUP)); 1672 1673 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) 1674 return (SET_ERROR(ENOTSUP)); 1675 1676 /* 1677 * There has to be enough free space to remove the 1678 * device and leave double the "slop" space (i.e. we 1679 * must leave at least 3% of the pool free, in addition to 1680 * the normal slop space). 1681 */ 1682 if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir, 1683 NULL, 0, B_TRUE) < 1684 vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { 1685 return (SET_ERROR(ENOSPC)); 1686 } 1687 1688 /* 1689 * There can not be a removal in progress. 1690 */ 1691 if (spa->spa_removing_phys.sr_state == DSS_SCANNING) 1692 return (SET_ERROR(EBUSY)); 1693 1694 /* 1695 * The device must have all its data. 1696 */ 1697 if (!vdev_dtl_empty(vd, DTL_MISSING) || 1698 !vdev_dtl_empty(vd, DTL_OUTAGE)) 1699 return (SET_ERROR(EBUSY)); 1700 1701 /* 1702 * The device must be healthy. 1703 */ 1704 if (!vdev_readable(vd)) 1705 return (SET_ERROR(EIO)); 1706 1707 /* 1708 * All vdevs in normal class must have the same ashift. 1709 */ 1710 if (spa->spa_max_ashift != spa->spa_min_ashift) { 1711 return (SET_ERROR(EINVAL)); 1712 } 1713 1714 /* 1715 * All vdevs in normal class must have the same ashift 1716 * and not be raidz. 1717 */ 1718 vdev_t *rvd = spa->spa_root_vdev; 1719 int num_indirect = 0; 1720 for (uint64_t id = 0; id < rvd->vdev_children; id++) { 1721 vdev_t *cvd = rvd->vdev_child[id]; 1722 if (cvd->vdev_ashift != 0 && !cvd->vdev_islog) 1723 ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); 1724 if (cvd->vdev_ops == &vdev_indirect_ops) 1725 num_indirect++; 1726 if (!vdev_is_concrete(cvd)) 1727 continue; 1728 if (cvd->vdev_ops == &vdev_raidz_ops) 1729 return (SET_ERROR(EINVAL)); 1730 /* 1731 * Need the mirror to be mirror of leaf vdevs only 1732 */ 1733 if (cvd->vdev_ops == &vdev_mirror_ops) { 1734 for (uint64_t cid = 0; 1735 cid < cvd->vdev_children; cid++) { 1736 vdev_t *tmp = cvd->vdev_child[cid]; 1737 if (!tmp->vdev_ops->vdev_op_leaf) 1738 return (SET_ERROR(EINVAL)); 1739 } 1740 } 1741 } 1742 1743 return (0); 1744} 1745 1746/* 1747 * Initiate removal of a top-level vdev, reducing the total space in the pool. 1748 * The config lock is held for the specified TXG. Once initiated, 1749 * evacuation of all allocated space (copying it to other vdevs) happens 1750 * in the background (see spa_vdev_remove_thread()), and can be canceled 1751 * (see spa_vdev_remove_cancel()). If successful, the vdev will 1752 * be transformed to an indirect vdev (see spa_vdev_remove_complete()). 1753 */ 1754static int 1755spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) 1756{ 1757 spa_t *spa = vd->vdev_spa; 1758 int error; 1759 1760 /* 1761 * Check for errors up-front, so that we don't waste time 1762 * passivating the metaslab group and clearing the ZIL if there 1763 * are errors. 1764 */ 1765 error = spa_vdev_remove_top_check(vd); 1766 if (error != 0) 1767 return (error); 1768 1769 /* 1770 * Stop allocating from this vdev. Note that we must check 1771 * that this is not the only device in the pool before 1772 * passivating, otherwise we will not be able to make 1773 * progress because we can't allocate from any vdevs. 1774 * The above check for sufficient free space serves this 1775 * purpose. 
1776 */ 1777 metaslab_group_t *mg = vd->vdev_mg; 1778 metaslab_group_passivate(mg); 1779 1780 /* 1781 * Wait for the youngest allocations and frees to sync, 1782 * and then wait for the deferral of those frees to finish. 1783 */ 1784 spa_vdev_config_exit(spa, NULL, 1785 *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 1786 1787 /* 1788 * We must ensure that no "stubby" log blocks are allocated 1789 * on the device to be removed. These blocks could be 1790 * written at any time, including while we are in the middle 1791 * of copying them. 1792 */ 1793 error = spa_reset_logs(spa); 1794 1795 *txg = spa_vdev_config_enter(spa); 1796 1797 /* 1798 * Things might have changed while the config lock was dropped 1799 * (e.g. space usage). Check for errors again. 1800 */ 1801 if (error == 0) 1802 error = spa_vdev_remove_top_check(vd); 1803 1804 if (error != 0) { 1805 metaslab_group_activate(mg); 1806 return (error); 1807 } 1808 1809 vd->vdev_removing = B_TRUE; 1810 1811 vdev_dirty_leaves(vd, VDD_DTL, *txg); 1812 vdev_config_dirty(vd); 1813 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); 1814 dsl_sync_task_nowait(spa->spa_dsl_pool, 1815 vdev_remove_initiate_sync, 1816 vd, 0, ZFS_SPACE_CHECK_NONE, tx); 1817 dmu_tx_commit(tx); 1818 1819 return (0); 1820} 1821 1822/* 1823 * Remove a device from the pool. 1824 * 1825 * Removing a device from the vdev namespace requires several steps 1826 * and can take a significant amount of time. As a result we use 1827 * the spa_vdev_config_[enter/exit] functions which allow us to 1828 * grab and release the spa_config_lock while still holding the namespace 1829 * lock. During each step the configuration is synced out. 1830 */ 1831int 1832spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 1833{ 1834 vdev_t *vd; 1835 nvlist_t **spares, **l2cache, *nv; 1836 uint64_t txg = 0; 1837 uint_t nspares, nl2cache; 1838 int error = 0; 1839 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 1840 sysevent_t *ev = NULL; 1841 1842 ASSERT(spa_writeable(spa)); 1843 1844 if (!locked) 1845 txg = spa_vdev_enter(spa); 1846 1847 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1848 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 1849 error = (spa_has_checkpoint(spa)) ? 1850 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 1851 1852 if (!locked) 1853 return (spa_vdev_exit(spa, NULL, txg, error)); 1854 1855 return (error); 1856 } 1857 1858 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 1859 1860 if (spa->spa_spares.sav_vdevs != NULL && 1861 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1862 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 1863 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 1864 /* 1865 * Only remove the hot spare if it's not currently in use 1866 * in this pool. 
1867 */ 1868 if (vd == NULL || unspare) { 1869 char *nvstr = fnvlist_lookup_string(nv, 1870 ZPOOL_CONFIG_PATH); 1871 spa_history_log_internal(spa, "vdev remove", NULL, 1872 "%s vdev (%s) %s", spa_name(spa), 1873 VDEV_TYPE_SPARE, nvstr); 1874 if (vd == NULL) 1875 vd = spa_lookup_by_guid(spa, guid, B_TRUE); 1876 ev = spa_event_create(spa, vd, NULL, 1877 ESC_ZFS_VDEV_REMOVE_AUX); 1878 spa_vdev_remove_aux(spa->spa_spares.sav_config, 1879 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 1880 spa_load_spares(spa); 1881 spa->spa_spares.sav_sync = B_TRUE; 1882 } else { 1883 error = SET_ERROR(EBUSY); 1884 } 1885 } else if (spa->spa_l2cache.sav_vdevs != NULL && 1886 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 1887 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 1888 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 1889 char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); 1890 spa_history_log_internal(spa, "vdev remove", NULL, 1891 "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr); 1892 /* 1893 * Cache devices can always be removed. 1894 */ 1895 vd = spa_lookup_by_guid(spa, guid, B_TRUE); 1896 ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); 1897 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 1898 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 1899 spa_load_l2cache(spa); 1900 spa->spa_l2cache.sav_sync = B_TRUE; 1901 } else if (vd != NULL && vd->vdev_islog) { 1902 ASSERT(!locked); 1903 error = spa_vdev_remove_log(vd, &txg); 1904 } else if (vd != NULL) { 1905 ASSERT(!locked); 1906 error = spa_vdev_remove_top(vd, &txg); 1907 } else { 1908 /* 1909 * There is no vdev of any kind with the specified guid. 1910 */ 1911 error = SET_ERROR(ENOENT); 1912 } 1913 1914 if (!locked) 1915 error = spa_vdev_exit(spa, NULL, txg, error); 1916 1917 if (ev != NULL) { 1918 if (error != 0) { 1919 spa_event_discard(ev); 1920 } else { 1921 spa_event_post(ev); 1922 } 1923 } 1924 1925 return (error); 1926} 1927 1928int 1929spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) 1930{ 1931 prs->prs_state = spa->spa_removing_phys.sr_state; 1932 1933 if (prs->prs_state == DSS_NONE) 1934 return (SET_ERROR(ENOENT)); 1935 1936 prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; 1937 prs->prs_start_time = spa->spa_removing_phys.sr_start_time; 1938 prs->prs_end_time = spa->spa_removing_phys.sr_end_time; 1939 prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; 1940 prs->prs_copied = spa->spa_removing_phys.sr_copied; 1941 1942 if (spa->spa_vdev_removal != NULL) { 1943 for (int i = 0; i < TXG_SIZE; i++) { 1944 prs->prs_copied += 1945 spa->spa_vdev_removal->svr_bytes_done[i]; 1946 } 1947 } 1948 1949 prs->prs_mapping_memory = 0; 1950 uint64_t indirect_vdev_id = 1951 spa->spa_removing_phys.sr_prev_indirect_vdev; 1952 while (indirect_vdev_id != -1) { 1953 vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; 1954 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 1955 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 1956 1957 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 1958 prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); 1959 indirect_vdev_id = vic->vic_prev_indirect_vdev; 1960 } 1961 1962 return (0); 1963} 1964
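/*
 * Illustrative sketch of the interface exported by this file (a
 * hypothetical in-kernel caller, shown for orientation only; locking and
 * error handling are omitted):
 *
 *	Initiate removal of the vdev (top-level, log, spare, or cache
 *	device) identified by guid:
 *		error = spa_vdev_remove(spa, guid, B_FALSE);
 *
 *	Report progress of an in-flight removal:
 *		pool_removal_stat_t prs;
 *		if (spa_removal_get_stats(spa, &prs) == 0 &&
 *		    prs.prs_state == DSS_SCANNING)
 *			copied = prs.prs_copied;	(of prs.prs_to_copy)
 *
 *	Abandon an in-progress removal and free the partial mapping:
 *		error = spa_vdev_remove_cancel(spa);
 */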