1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
25 * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/spa_impl.h>
30#include <sys/dmu.h>
31#include <sys/dmu_tx.h>
32#include <sys/zap.h>
33#include <sys/vdev_impl.h>
34#include <sys/metaslab.h>
35#include <sys/metaslab_impl.h>
36#include <sys/uberblock_impl.h>
37#include <sys/txg.h>
38#include <sys/avl.h>
39#include <sys/bpobj.h>
40#include <sys/dsl_pool.h>
41#include <sys/dsl_synctask.h>
42#include <sys/dsl_dir.h>
43#include <sys/arc.h>
44#include <sys/zfeature.h>
45#include <sys/vdev_indirect_births.h>
46#include <sys/vdev_indirect_mapping.h>
47#include <sys/abd.h>
48#include <sys/vdev_initialize.h>
49#include <sys/vdev_trim.h>
50#include <sys/trace_zfs.h>
51
52/*
53 * This file contains the necessary logic to remove vdevs from a
54 * storage pool.  Currently, the only devices that can be removed
55 * are log, cache, and spare devices; and top level vdevs from a pool
56 * w/o raidz or mirrors.  (Note that members of a mirror can be removed
57 * by the detach operation.)
58 *
59 * Log vdevs are removed by evacuating them and then turning the vdev
60 * into a hole vdev while holding spa config locks.
61 *
62 * Top level vdevs are removed and converted into an indirect vdev via
63 * a multi-step process:
64 *
65 *  - Disable allocations from this device (spa_vdev_remove_top).
66 *
67 *  - From a new thread (spa_vdev_remove_thread), copy data from
68 *    the removing vdev to a different vdev.  The copy happens in open
69 *    context (spa_vdev_copy_impl) and issues a sync task
70 *    (vdev_mapping_sync) so the sync thread can update the partial
71 *    indirect mappings in core and on disk.
72 *
73 *  - If a free happens during a removal, it is freed from the
74 *    removing vdev, and if it has already been copied, from the new
75 *    location as well (free_from_removing_vdev).
76 *
77 *  - After the removal is completed, the copy thread converts the vdev
78 *    into an indirect vdev (vdev_remove_complete) before instructing
79 *    the sync thread to destroy the space maps and finish the removal
80 *    (spa_finish_removal).
81 */
82
83typedef struct vdev_copy_arg {
84	metaslab_t	*vca_msp;
85	uint64_t	vca_outstanding_bytes;
86	uint64_t	vca_read_error_bytes;
87	uint64_t	vca_write_error_bytes;
88	kcondvar_t	vca_cv;
89	kmutex_t	vca_lock;
90} vdev_copy_arg_t;
91
92/*
93 * The maximum amount of memory we can use for outstanding i/o while
94 * doing a device removal.  This determines how much i/o we can have
95 * in flight concurrently.
96 */
97static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
98
99/*
100 * The largest contiguous segment that we will attempt to allocate when
101 * removing a device.  This can be no larger than SPA_MAXBLOCKSIZE.  If
102 * there is a performance problem with attempting to allocate large blocks,
103 * consider decreasing this.
104 *
105 * See also the accessor function spa_remove_max_segment().
106 */
107uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
108
109/*
110 * Ignore hard IO errors during device removal.  When set if a device
111 * encounters hard IO error during the removal process the removal will
112 * not be cancelled.  This can result in a normally recoverable block
113 * becoming permanently damaged and is not recommended.
114 */
115static int zfs_removal_ignore_errors = 0;
116
117/*
118 * Allow a remap segment to span free chunks of at most this size. The main
119 * impact of a larger span is that we will read and write larger, more
120 * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
121 * for iops.  The value here was chosen to align with
122 * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
123 * reads (but there's no reason it has to be the same).
124 *
125 * Additionally, a higher span will have the following relatively minor
126 * effects:
127 *  - the mapping will be smaller, since one entry can cover more allocated
128 *    segments
129 *  - more of the fragmentation in the removing device will be preserved
130 *  - we'll do larger allocations, which may fail and fall back on smaller
131 *    allocations
132 */
133uint_t vdev_removal_max_span = 32 * 1024;
134
135/*
136 * This is used by the test suite so that it can ensure that certain
137 * actions happen while in the middle of a removal.
138 */
139int zfs_removal_suspend_progress = 0;
140
141#define	VDEV_REMOVAL_ZAP_OBJS	"lzap"
142
143static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg);
144static int spa_vdev_remove_cancel_impl(spa_t *spa);
145
146static void
147spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
148{
149	VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset,
150	    DMU_POOL_DIRECTORY_OBJECT,
151	    DMU_POOL_REMOVING, sizeof (uint64_t),
152	    sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
153	    &spa->spa_removing_phys, tx));
154}
155
156static nvlist_t *
157spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
158{
159	for (int i = 0; i < count; i++) {
160		uint64_t guid =
161		    fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID);
162
163		if (guid == target_guid)
164			return (nvpp[i]);
165	}
166
167	return (NULL);
168}
169
170static void
171vdev_activate(vdev_t *vd)
172{
173	metaslab_group_t *mg = vd->vdev_mg;
174	spa_t *spa = vd->vdev_spa;
175	uint64_t vdev_space = spa_deflate(spa) ?
176	    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
177
178	ASSERT(!vd->vdev_islog);
179	ASSERT(vd->vdev_noalloc);
180
181	metaslab_group_activate(mg);
182	metaslab_group_activate(vd->vdev_log_mg);
183
184	ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);
185
186	spa->spa_nonallocating_dspace -= vdev_space;
187
188	vd->vdev_noalloc = B_FALSE;
189}
190
191static int
192vdev_passivate(vdev_t *vd, uint64_t *txg)
193{
194	spa_t *spa = vd->vdev_spa;
195	int error;
196
197	ASSERT(!vd->vdev_noalloc);
198
199	vdev_t *rvd = spa->spa_root_vdev;
200	metaslab_group_t *mg = vd->vdev_mg;
201	metaslab_class_t *normal = spa_normal_class(spa);
202	if (mg->mg_class == normal) {
203		/*
204		 * We must check that this is not the only allocating device in
205		 * the pool before passivating, otherwise we will not be able
206		 * to make progress because we can't allocate from any vdevs.
207		 */
208		boolean_t last = B_TRUE;
209		for (uint64_t id = 0; id < rvd->vdev_children; id++) {
210			vdev_t *cvd = rvd->vdev_child[id];
211
212			if (cvd == vd ||
213			    cvd->vdev_ops == &vdev_indirect_ops)
214				continue;
215
216			metaslab_class_t *mc = cvd->vdev_mg->mg_class;
217			if (mc != normal)
218				continue;
219
220			if (!cvd->vdev_noalloc) {
221				last = B_FALSE;
222				break;
223			}
224		}
225		if (last)
226			return (SET_ERROR(EINVAL));
227	}
228
229	metaslab_group_passivate(mg);
230	ASSERT(!vd->vdev_islog);
231	metaslab_group_passivate(vd->vdev_log_mg);
232
233	/*
234	 * Wait for the youngest allocations and frees to sync,
235	 * and then wait for the deferral of those frees to finish.
236	 */
237	spa_vdev_config_exit(spa, NULL,
238	    *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
239
240	/*
241	 * We must ensure that no "stubby" log blocks are allocated
242	 * on the device to be removed.  These blocks could be
243	 * written at any time, including while we are in the middle
244	 * of copying them.
245	 */
246	error = spa_reset_logs(spa);
247
248	*txg = spa_vdev_config_enter(spa);
249
250	if (error != 0) {
251		metaslab_group_activate(mg);
252		ASSERT(!vd->vdev_islog);
253		if (vd->vdev_log_mg != NULL)
254			metaslab_group_activate(vd->vdev_log_mg);
255		return (error);
256	}
257
258	spa->spa_nonallocating_dspace += spa_deflate(spa) ?
259	    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
260	vd->vdev_noalloc = B_TRUE;
261
262	return (0);
263}
264
265/*
266 * Turn off allocations for a top-level device from the pool.
267 *
268 * Turning off allocations for a top-level device can take a significant
269 * amount of time. As a result we use the spa_vdev_config_[enter/exit]
270 * functions which allow us to grab and release the spa_config_lock while
271 * still holding the namespace lock. During each step the configuration
272 * is synced out.
273 */
274int
275spa_vdev_noalloc(spa_t *spa, uint64_t guid)
276{
277	vdev_t *vd;
278	uint64_t txg;
279	int error = 0;
280
281	ASSERT(!MUTEX_HELD(&spa_namespace_lock));
282	ASSERT(spa_writeable(spa));
283
284	txg = spa_vdev_enter(spa);
285
286	ASSERT(MUTEX_HELD(&spa_namespace_lock));
287
288	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
289
290	if (vd == NULL)
291		error = SET_ERROR(ENOENT);
292	else if (vd->vdev_mg == NULL)
293		error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
294	else if (!vd->vdev_noalloc)
295		error = vdev_passivate(vd, &txg);
296
297	if (error == 0) {
298		vdev_dirty_leaves(vd, VDD_DTL, txg);
299		vdev_config_dirty(vd);
300	}
301
302	error = spa_vdev_exit(spa, NULL, txg, error);
303
304	return (error);
305}
306
307int
308spa_vdev_alloc(spa_t *spa, uint64_t guid)
309{
310	vdev_t *vd;
311	uint64_t txg;
312	int error = 0;
313
314	ASSERT(!MUTEX_HELD(&spa_namespace_lock));
315	ASSERT(spa_writeable(spa));
316
317	txg = spa_vdev_enter(spa);
318
319	ASSERT(MUTEX_HELD(&spa_namespace_lock));
320
321	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
322
323	if (vd == NULL)
324		error = SET_ERROR(ENOENT);
325	else if (vd->vdev_mg == NULL)
326		error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
327	else if (!vd->vdev_removing)
328		vdev_activate(vd);
329
330	if (error == 0) {
331		vdev_dirty_leaves(vd, VDD_DTL, txg);
332		vdev_config_dirty(vd);
333	}
334
335	(void) spa_vdev_exit(spa, NULL, txg, error);
336
337	return (error);
338}
339
340static void
341spa_vdev_remove_aux(nvlist_t *config, const char *name, nvlist_t **dev,
342    int count, nvlist_t *dev_to_remove)
343{
344	nvlist_t **newdev = NULL;
345
346	if (count > 1)
347		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
348
349	for (int i = 0, j = 0; i < count; i++) {
350		if (dev[i] == dev_to_remove)
351			continue;
352		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
353	}
354
355	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
356	fnvlist_add_nvlist_array(config, name, (const nvlist_t * const *)newdev,
357	    count - 1);
358
359	for (int i = 0; i < count - 1; i++)
360		nvlist_free(newdev[i]);
361
362	if (count > 1)
363		kmem_free(newdev, (count - 1) * sizeof (void *));
364}
365
366static spa_vdev_removal_t *
367spa_vdev_removal_create(vdev_t *vd)
368{
369	spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
370	mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
371	cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
372	svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
373	svr->svr_vdev_id = vd->vdev_id;
374
375	for (int i = 0; i < TXG_SIZE; i++) {
376		svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL,
377		    0, 0);
378		list_create(&svr->svr_new_segments[i],
379		    sizeof (vdev_indirect_mapping_entry_t),
380		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
381	}
382
383	return (svr);
384}
385
386void
387spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
388{
389	for (int i = 0; i < TXG_SIZE; i++) {
390		ASSERT0(svr->svr_bytes_done[i]);
391		ASSERT0(svr->svr_max_offset_to_sync[i]);
392		range_tree_destroy(svr->svr_frees[i]);
393		list_destroy(&svr->svr_new_segments[i]);
394	}
395
396	range_tree_destroy(svr->svr_allocd_segs);
397	mutex_destroy(&svr->svr_lock);
398	cv_destroy(&svr->svr_cv);
399	kmem_free(svr, sizeof (*svr));
400}
401
402/*
403 * This is called as a synctask in the txg in which we will mark this vdev
404 * as removing (in the config stored in the MOS).
405 *
406 * It begins the evacuation of a toplevel vdev by:
407 * - initializing the spa_removing_phys which tracks this removal
408 * - computing the amount of space to remove for accounting purposes
409 * - dirtying all dbufs in the spa_config_object
410 * - creating the spa_vdev_removal
411 * - starting the spa_vdev_remove_thread
412 */
413static void
414vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
415{
416	int vdev_id = (uintptr_t)arg;
417	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
418	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
419	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
420	objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
421	spa_vdev_removal_t *svr = NULL;
422	uint64_t txg __maybe_unused = dmu_tx_get_txg(tx);
423
424	ASSERT0(vdev_get_nparity(vd));
425	svr = spa_vdev_removal_create(vd);
426
427	ASSERT(vd->vdev_removing);
428	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
429
430	spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
431	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
432		/*
433		 * By activating the OBSOLETE_COUNTS feature, we prevent
434		 * the pool from being downgraded and ensure that the
435		 * refcounts are precise.
436		 */
437		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
438		uint64_t one = 1;
439		VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
440		    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
441		    &one, tx));
442		boolean_t are_precise __maybe_unused;
443		ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
444		ASSERT3B(are_precise, ==, B_TRUE);
445	}
446
447	vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
448	vd->vdev_indirect_mapping =
449	    vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
450	vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
451	vd->vdev_indirect_births =
452	    vdev_indirect_births_open(mos, vic->vic_births_object);
453	spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
454	spa->spa_removing_phys.sr_start_time = gethrestime_sec();
455	spa->spa_removing_phys.sr_end_time = 0;
456	spa->spa_removing_phys.sr_state = DSS_SCANNING;
457	spa->spa_removing_phys.sr_to_copy = 0;
458	spa->spa_removing_phys.sr_copied = 0;
459
460	/*
461	 * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
462	 * there may be space in the defer tree, which is free, but still
463	 * counted in vs_alloc.
464	 */
465	for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
466		metaslab_t *ms = vd->vdev_ms[i];
467		if (ms->ms_sm == NULL)
468			continue;
469
470		spa->spa_removing_phys.sr_to_copy +=
471		    metaslab_allocated_space(ms);
472
473		/*
474		 * Space which we are freeing this txg does not need to
475		 * be copied.
476		 */
477		spa->spa_removing_phys.sr_to_copy -=
478		    range_tree_space(ms->ms_freeing);
479
480		ASSERT0(range_tree_space(ms->ms_freed));
481		for (int t = 0; t < TXG_SIZE; t++)
482			ASSERT0(range_tree_space(ms->ms_allocating[t]));
483	}
484
485	/*
486	 * Sync tasks are called before metaslab_sync(), so there should
487	 * be no already-synced metaslabs in the TXG_CLEAN list.
488	 */
489	ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);
490
491	spa_sync_removing_state(spa, tx);
492
493	/*
494	 * All blocks that we need to read the most recent mapping must be
495	 * stored on concrete vdevs.  Therefore, we must dirty anything that
496	 * is read before spa_remove_init().  Specifically, the
497	 * spa_config_object.  (Note that although we already modified the
498	 * spa_config_object in spa_sync_removing_state, that may not have
499	 * modified all blocks of the object.)
500	 */
501	dmu_object_info_t doi;
502	VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));
503	for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
504		dmu_buf_t *dbuf;
505		VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
506		    offset, FTAG, &dbuf, 0));
507		dmu_buf_will_dirty(dbuf, tx);
508		offset += dbuf->db_size;
509		dmu_buf_rele(dbuf, FTAG);
510	}
511
512	/*
513	 * Now that we've allocated the im_object, dirty the vdev to ensure
514	 * that the object gets written to the config on disk.
515	 */
516	vdev_config_dirty(vd);
517
518	zfs_dbgmsg("starting removal thread for vdev %llu (%px) in txg %llu "
519	    "im_obj=%llu", (u_longlong_t)vd->vdev_id, vd,
520	    (u_longlong_t)dmu_tx_get_txg(tx),
521	    (u_longlong_t)vic->vic_mapping_object);
522
523	spa_history_log_internal(spa, "vdev remove started", tx,
524	    "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id,
525	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");
526	/*
527	 * Setting spa_vdev_removal causes subsequent frees to call
528	 * free_from_removing_vdev().  Note that we don't need any locking
529	 * because we are the sync thread, and metaslab_free_impl() is only
530	 * called from syncing context (potentially from a zio taskq thread,
531	 * but in any case only when there are outstanding free i/os, which
532	 * there are not).
533	 */
534	ASSERT3P(spa->spa_vdev_removal, ==, NULL);
535	spa->spa_vdev_removal = svr;
536	svr->svr_thread = thread_create(NULL, 0,
537	    spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
538}
539
540/*
541 * When we are opening a pool, we must read the mapping for each
542 * indirect vdev in order from most recently removed to least
543 * recently removed.  We do this because the blocks for the mapping
544 * of older indirect vdevs may be stored on more recently removed vdevs.
545 * In order to read each indirect mapping object, we must have
546 * initialized all more recently removed vdevs.
547 */
548int
549spa_remove_init(spa_t *spa)
550{
551	int error;
552
553	error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
554	    DMU_POOL_DIRECTORY_OBJECT,
555	    DMU_POOL_REMOVING, sizeof (uint64_t),
556	    sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
557	    &spa->spa_removing_phys);
558
559	if (error == ENOENT) {
560		spa->spa_removing_phys.sr_state = DSS_NONE;
561		spa->spa_removing_phys.sr_removing_vdev = -1;
562		spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
563		spa->spa_indirect_vdevs_loaded = B_TRUE;
564		return (0);
565	} else if (error != 0) {
566		return (error);
567	}
568
569	if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
570		/*
571		 * We are currently removing a vdev.  Create and
572		 * initialize a spa_vdev_removal_t from the bonus
573		 * buffer of the removing vdevs vdev_im_object, and
574		 * initialize its partial mapping.
575		 */
576		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
577		vdev_t *vd = vdev_lookup_top(spa,
578		    spa->spa_removing_phys.sr_removing_vdev);
579
580		if (vd == NULL) {
581			spa_config_exit(spa, SCL_STATE, FTAG);
582			return (EINVAL);
583		}
584
585		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
586
587		ASSERT(vdev_is_concrete(vd));
588		spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
589		ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
590		ASSERT(vd->vdev_removing);
591
592		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
593		    spa->spa_meta_objset, vic->vic_mapping_object);
594		vd->vdev_indirect_births = vdev_indirect_births_open(
595		    spa->spa_meta_objset, vic->vic_births_object);
596		spa_config_exit(spa, SCL_STATE, FTAG);
597
598		spa->spa_vdev_removal = svr;
599	}
600
601	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
602	uint64_t indirect_vdev_id =
603	    spa->spa_removing_phys.sr_prev_indirect_vdev;
604	while (indirect_vdev_id != UINT64_MAX) {
605		vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
606		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
607
608		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
609		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
610		    spa->spa_meta_objset, vic->vic_mapping_object);
611		vd->vdev_indirect_births = vdev_indirect_births_open(
612		    spa->spa_meta_objset, vic->vic_births_object);
613
614		indirect_vdev_id = vic->vic_prev_indirect_vdev;
615	}
616	spa_config_exit(spa, SCL_STATE, FTAG);
617
618	/*
619	 * Now that we've loaded all the indirect mappings, we can allow
620	 * reads from other blocks (e.g. via predictive prefetch).
621	 */
622	spa->spa_indirect_vdevs_loaded = B_TRUE;
623	return (0);
624}
625
626void
627spa_restart_removal(spa_t *spa)
628{
629	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
630
631	if (svr == NULL)
632		return;
633
634	/*
635	 * In general when this function is called there is no
636	 * removal thread running. The only scenario where this
637	 * is not true is during spa_import() where this function
638	 * is called twice [once from spa_import_impl() and
639	 * spa_async_resume()]. Thus, in the scenario where we
640	 * import a pool that has an ongoing removal we don't
641	 * want to spawn a second thread.
642	 */
643	if (svr->svr_thread != NULL)
644		return;
645
646	if (!spa_writeable(spa))
647		return;
648
649	zfs_dbgmsg("restarting removal of %llu",
650	    (u_longlong_t)svr->svr_vdev_id);
651	svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
652	    0, &p0, TS_RUN, minclsyspri);
653}
654
655/*
656 * Process freeing from a device which is in the middle of being removed.
657 * We must handle this carefully so that we attempt to copy freed data,
658 * and we correctly free already-copied data.
659 */
660void
661free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
662{
663	spa_t *spa = vd->vdev_spa;
664	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
665	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
666	uint64_t txg = spa_syncing_txg(spa);
667	uint64_t max_offset_yet = 0;
668
669	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
670	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
671	    vdev_indirect_mapping_object(vim));
672	ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
673
674	mutex_enter(&svr->svr_lock);
675
676	/*
677	 * Remove the segment from the removing vdev's spacemap.  This
678	 * ensures that we will not attempt to copy this space (if the
679	 * removal thread has not yet visited it), and also ensures
680	 * that we know what is actually allocated on the new vdevs
681	 * (needed if we cancel the removal).
682	 *
683	 * Note: we must do the metaslab_free_concrete() with the svr_lock
684	 * held, so that the remove_thread can not load this metaslab and then
685	 * visit this offset between the time that we metaslab_free_concrete()
686	 * and when we check to see if it has been visited.
687	 *
688	 * Note: The checkpoint flag is set to false as having/taking
689	 * a checkpoint and removing a device can't happen at the same
690	 * time.
691	 */
692	ASSERT(!spa_has_checkpoint(spa));
693	metaslab_free_concrete(vd, offset, size, B_FALSE);
694
695	uint64_t synced_size = 0;
696	uint64_t synced_offset = 0;
697	uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
698	if (offset < max_offset_synced) {
699		/*
700		 * The mapping for this offset is already on disk.
701		 * Free from the new location.
702		 *
703		 * Note that we use svr_max_synced_offset because it is
704		 * updated atomically with respect to the in-core mapping.
705		 * By contrast, vim_max_offset is not.
706		 *
707		 * This block may be split between a synced entry and an
708		 * in-flight or unvisited entry.  Only process the synced
709		 * portion of it here.
710		 */
711		synced_size = MIN(size, max_offset_synced - offset);
712		synced_offset = offset;
713
714		ASSERT3U(max_offset_yet, <=, max_offset_synced);
715		max_offset_yet = max_offset_synced;
716
717		DTRACE_PROBE3(remove__free__synced,
718		    spa_t *, spa,
719		    uint64_t, offset,
720		    uint64_t, synced_size);
721
722		size -= synced_size;
723		offset += synced_size;
724	}
725
726	/*
727	 * Look at all in-flight txgs starting from the currently syncing one
728	 * and see if a section of this free is being copied. By starting from
729	 * this txg and iterating forward, we might find that this region
730	 * was copied in two different txgs and handle it appropriately.
731	 */
732	for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
733		int txgoff = (txg + i) & TXG_MASK;
734		if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
735			/*
736			 * The mapping for this offset is in flight, and
737			 * will be synced in txg+i.
738			 */
739			uint64_t inflight_size = MIN(size,
740			    svr->svr_max_offset_to_sync[txgoff] - offset);
741
742			DTRACE_PROBE4(remove__free__inflight,
743			    spa_t *, spa,
744			    uint64_t, offset,
745			    uint64_t, inflight_size,
746			    uint64_t, txg + i);
747
748			/*
749			 * We copy data in order of increasing offset.
750			 * Therefore the max_offset_to_sync[] must increase
751			 * (or be zero, indicating that nothing is being
752			 * copied in that txg).
753			 */
754			if (svr->svr_max_offset_to_sync[txgoff] != 0) {
755				ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
756				    >=, max_offset_yet);
757				max_offset_yet =
758				    svr->svr_max_offset_to_sync[txgoff];
759			}
760
761			/*
762			 * We've already committed to copying this segment:
763			 * we have allocated space elsewhere in the pool for
764			 * it and have an IO outstanding to copy the data. We
765			 * cannot free the space before the copy has
766			 * completed, or else the copy IO might overwrite any
767			 * new data. To free that space, we record the
768			 * segment in the appropriate svr_frees tree and free
769			 * the mapped space later, in the txg where we have
770			 * completed the copy and synced the mapping (see
771			 * vdev_mapping_sync).
772			 */
773			range_tree_add(svr->svr_frees[txgoff],
774			    offset, inflight_size);
775			size -= inflight_size;
776			offset += inflight_size;
777
778			/*
779			 * This space is already accounted for as being
780			 * done, because it is being copied in txg+i.
781			 * However, if i!=0, then it is being copied in
782			 * a future txg.  If we crash after this txg
783			 * syncs but before txg+i syncs, then the space
784			 * will be free.  Therefore we must account
785			 * for the space being done in *this* txg
786			 * (when it is freed) rather than the future txg
787			 * (when it will be copied).
788			 */
789			ASSERT3U(svr->svr_bytes_done[txgoff], >=,
790			    inflight_size);
791			svr->svr_bytes_done[txgoff] -= inflight_size;
792			svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
793		}
794	}
795	ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);
796
797	if (size > 0) {
798		/*
799		 * The copy thread has not yet visited this offset.  Ensure
800		 * that it doesn't.
801		 */
802
803		DTRACE_PROBE3(remove__free__unvisited,
804		    spa_t *, spa,
805		    uint64_t, offset,
806		    uint64_t, size);
807
808		if (svr->svr_allocd_segs != NULL)
809			range_tree_clear(svr->svr_allocd_segs, offset, size);
810
811		/*
812		 * Since we now do not need to copy this data, for
813		 * accounting purposes we have done our job and can count
814		 * it as completed.
815		 */
816		svr->svr_bytes_done[txg & TXG_MASK] += size;
817	}
818	mutex_exit(&svr->svr_lock);
819
820	/*
821	 * Now that we have dropped svr_lock, process the synced portion
822	 * of this free.
823	 */
824	if (synced_size > 0) {
825		vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
826
827		/*
828		 * Note: this can only be called from syncing context,
829		 * and the vdev_indirect_mapping is only changed from the
830		 * sync thread, so we don't need svr_lock while doing
831		 * metaslab_free_impl_cb.
832		 */
833		boolean_t checkpoint = B_FALSE;
834		vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
835		    metaslab_free_impl_cb, &checkpoint);
836	}
837}
838
839/*
840 * Stop an active removal and update the spa_removing phys.
841 */
842static void
843spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
844{
845	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
846	ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));
847
848	/* Ensure the removal thread has completed before we free the svr. */
849	spa_vdev_remove_suspend(spa);
850
851	ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);
852
853	if (state == DSS_FINISHED) {
854		spa_removing_phys_t *srp = &spa->spa_removing_phys;
855		vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
856		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
857
858		if (srp->sr_prev_indirect_vdev != -1) {
859			vdev_t *pvd;
860			pvd = vdev_lookup_top(spa,
861			    srp->sr_prev_indirect_vdev);
862			ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops);
863		}
864
865		vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev;
866		srp->sr_prev_indirect_vdev = vd->vdev_id;
867	}
868	spa->spa_removing_phys.sr_state = state;
869	spa->spa_removing_phys.sr_end_time = gethrestime_sec();
870
871	spa->spa_vdev_removal = NULL;
872	spa_vdev_removal_destroy(svr);
873
874	spa_sync_removing_state(spa, tx);
875	spa_notify_waiters(spa);
876
877	vdev_config_dirty(spa->spa_root_vdev);
878}
879
880static void
881free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
882{
883	vdev_t *vd = arg;
884	vdev_indirect_mark_obsolete(vd, offset, size);
885	boolean_t checkpoint = B_FALSE;
886	vdev_indirect_ops.vdev_op_remap(vd, offset, size,
887	    metaslab_free_impl_cb, &checkpoint);
888}
889
890/*
891 * On behalf of the removal thread, syncs an incremental bit more of
892 * the indirect mapping to disk and updates the in-memory mapping.
893 * Called as a sync task in every txg that the removal thread makes progress.
894 */
895static void
896vdev_mapping_sync(void *arg, dmu_tx_t *tx)
897{
898	spa_vdev_removal_t *svr = arg;
899	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
900	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
901	vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config;
902	uint64_t txg = dmu_tx_get_txg(tx);
903	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
904
905	ASSERT(vic->vic_mapping_object != 0);
906	ASSERT3U(txg, ==, spa_syncing_txg(spa));
907
908	vdev_indirect_mapping_add_entries(vim,
909	    &svr->svr_new_segments[txg & TXG_MASK], tx);
910	vdev_indirect_births_add_entry(vd->vdev_indirect_births,
911	    vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx);
912
913	/*
914	 * Free the copied data for anything that was freed while the
915	 * mapping entries were in flight.
916	 */
917	mutex_enter(&svr->svr_lock);
918	range_tree_vacate(svr->svr_frees[txg & TXG_MASK],
919	    free_mapped_segment_cb, vd);
920	ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=,
921	    vdev_indirect_mapping_max_offset(vim));
922	svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0;
923	mutex_exit(&svr->svr_lock);
924
925	spa_sync_removing_state(spa, tx);
926}
927
928typedef struct vdev_copy_segment_arg {
929	spa_t *vcsa_spa;
930	dva_t *vcsa_dest_dva;
931	uint64_t vcsa_txg;
932	range_tree_t *vcsa_obsolete_segs;
933} vdev_copy_segment_arg_t;
934
935static void
936unalloc_seg(void *arg, uint64_t start, uint64_t size)
937{
938	vdev_copy_segment_arg_t *vcsa = arg;
939	spa_t *spa = vcsa->vcsa_spa;
940	blkptr_t bp = { { { {0} } } };
941
942	BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
943	BP_SET_LSIZE(&bp, size);
944	BP_SET_PSIZE(&bp, size);
945	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
946	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
947	BP_SET_TYPE(&bp, DMU_OT_NONE);
948	BP_SET_LEVEL(&bp, 0);
949	BP_SET_DEDUP(&bp, 0);
950	BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
951
952	DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
953	DVA_SET_OFFSET(&bp.blk_dva[0],
954	    DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
955	DVA_SET_ASIZE(&bp.blk_dva[0], size);
956
957	zio_free(spa, vcsa->vcsa_txg, &bp);
958}
959
960/*
961 * All reads and writes associated with a call to spa_vdev_copy_segment()
962 * are done.
963 */
964static void
965spa_vdev_copy_segment_done(zio_t *zio)
966{
967	vdev_copy_segment_arg_t *vcsa = zio->io_private;
968
969	range_tree_vacate(vcsa->vcsa_obsolete_segs,
970	    unalloc_seg, vcsa);
971	range_tree_destroy(vcsa->vcsa_obsolete_segs);
972	kmem_free(vcsa, sizeof (*vcsa));
973
974	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
975}
976
977/*
978 * The write of the new location is done.
979 */
980static void
981spa_vdev_copy_segment_write_done(zio_t *zio)
982{
983	vdev_copy_arg_t *vca = zio->io_private;
984
985	abd_free(zio->io_abd);
986
987	mutex_enter(&vca->vca_lock);
988	vca->vca_outstanding_bytes -= zio->io_size;
989
990	if (zio->io_error != 0)
991		vca->vca_write_error_bytes += zio->io_size;
992
993	cv_signal(&vca->vca_cv);
994	mutex_exit(&vca->vca_lock);
995}
996
997/*
998 * The read of the old location is done.  The parent zio is the write to
999 * the new location.  Allow it to start.
1000 */
1001static void
1002spa_vdev_copy_segment_read_done(zio_t *zio)
1003{
1004	vdev_copy_arg_t *vca = zio->io_private;
1005
1006	if (zio->io_error != 0) {
1007		mutex_enter(&vca->vca_lock);
1008		vca->vca_read_error_bytes += zio->io_size;
1009		mutex_exit(&vca->vca_lock);
1010	}
1011
1012	zio_nowait(zio_unique_parent(zio));
1013}
1014
1015/*
1016 * If the old and new vdevs are mirrors, we will read both sides of the old
1017 * mirror, and write each copy to the corresponding side of the new mirror.
1018 * If the old and new vdevs have a different number of children, we will do
1019 * this as best as possible.  Since we aren't verifying checksums, this
1020 * ensures that as long as there's a good copy of the data, we'll have a
1021 * good copy after the removal, even if there's silent damage to one side
1022 * of the mirror. If we're removing a mirror that has some silent damage,
1023 * we'll have exactly the same damage in the new location (assuming that
1024 * the new location is also a mirror).
1025 *
1026 * We accomplish this by creating a tree of zio_t's, with as many writes as
1027 * there are "children" of the new vdev (a non-redundant vdev counts as one
1028 * child, a 2-way mirror has 2 children, etc). Each write has an associated
1029 * read from a child of the old vdev. Typically there will be the same
1030 * number of children of the old and new vdevs.  However, if there are more
1031 * children of the new vdev, some child(ren) of the old vdev will be issued
1032 * multiple reads.  If there are more children of the old vdev, some copies
1033 * will be dropped.
1034 *
1035 * For example, the tree of zio_t's for a 2-way mirror is:
1036 *
1037 *                            null
1038 *                           /    \
1039 *    write(new vdev, child 0)      write(new vdev, child 1)
1040 *      |                             |
1041 *    read(old vdev, child 0)       read(old vdev, child 1)
1042 *
1043 * Child zio's complete before their parents complete.  However, zio's
1044 * created with zio_vdev_child_io() may be issued before their children
1045 * complete.  In this case we need to make sure that the children (reads)
1046 * complete before the parents (writes) are *issued*.  We do this by not
1047 * calling zio_nowait() on each write until its corresponding read has
1048 * completed.
1049 *
1050 * The spa_config_lock must be held while zio's created by
1051 * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
1052 * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
1053 * zio is needed to release the spa_config_lock after all the reads and
1054 * writes complete. (Note that we can't grab the config lock for each read,
1055 * because it is not reentrant - we could deadlock with a thread waiting
1056 * for a write lock.)
1057 */
1058static void
1059spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
1060    vdev_t *source_vd, uint64_t source_offset,
1061    vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
1062{
1063	ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
1064
1065	/*
1066	 * If the destination child in unwritable then there is no point
1067	 * in issuing the source reads which cannot be written.
1068	 */
1069	if (!vdev_writeable(dest_child_vd))
1070		return;
1071
1072	mutex_enter(&vca->vca_lock);
1073	vca->vca_outstanding_bytes += size;
1074	mutex_exit(&vca->vca_lock);
1075
1076	abd_t *abd = abd_alloc_for_io(size, B_FALSE);
1077
1078	vdev_t *source_child_vd = NULL;
1079	if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
1080		/*
1081		 * Source and dest are both mirrors.  Copy from the same
1082		 * child id as we are copying to (wrapping around if there
1083		 * are more dest children than source children).  If the
1084		 * preferred source child is unreadable select another.
1085		 */
1086		for (int i = 0; i < source_vd->vdev_children; i++) {
1087			source_child_vd = source_vd->vdev_child[
1088			    (dest_id + i) % source_vd->vdev_children];
1089			if (vdev_readable(source_child_vd))
1090				break;
1091		}
1092	} else {
1093		source_child_vd = source_vd;
1094	}
1095
1096	/*
1097	 * There should always be at least one readable source child or
1098	 * the pool would be in a suspended state.  Somehow selecting an
1099	 * unreadable child would result in IO errors, the removal process
1100	 * being cancelled, and the pool reverting to its pre-removal state.
1101	 */
1102	ASSERT3P(source_child_vd, !=, NULL);
1103
1104	zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
1105	    dest_child_vd, dest_offset, abd, size,
1106	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
1107	    ZIO_FLAG_CANFAIL,
1108	    spa_vdev_copy_segment_write_done, vca);
1109
1110	zio_nowait(zio_vdev_child_io(write_zio, NULL,
1111	    source_child_vd, source_offset, abd, size,
1112	    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
1113	    ZIO_FLAG_CANFAIL,
1114	    spa_vdev_copy_segment_read_done, vca));
1115}
1116
1117/*
1118 * Allocate a new location for this segment, and create the zio_t's to
1119 * read from the old location and write to the new location.
1120 */
1121static int
1122spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
1123    uint64_t maxalloc, uint64_t txg,
1124    vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
1125{
1126	metaslab_group_t *mg = vd->vdev_mg;
1127	spa_t *spa = vd->vdev_spa;
1128	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
1129	vdev_indirect_mapping_entry_t *entry;
1130	dva_t dst = {{ 0 }};
1131	uint64_t start = range_tree_min(segs);
1132	ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift));
1133
1134	ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
1135	ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift));
1136
1137	uint64_t size = range_tree_span(segs);
1138	if (range_tree_span(segs) > maxalloc) {
1139		/*
1140		 * We can't allocate all the segments.  Prefer to end
1141		 * the allocation at the end of a segment, thus avoiding
1142		 * additional split blocks.
1143		 */
1144		range_seg_max_t search;
1145		zfs_btree_index_t where;
1146		rs_set_start(&search, segs, start + maxalloc);
1147		rs_set_end(&search, segs, start + maxalloc);
1148		(void) zfs_btree_find(&segs->rt_root, &search, &where);
1149		range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where,
1150		    &where);
1151		if (rs != NULL) {
1152			size = rs_get_end(rs, segs) - start;
1153		} else {
1154			/*
1155			 * There are no segments that end before maxalloc.
1156			 * I.e. the first segment is larger than maxalloc,
1157			 * so we must split it.
1158			 */
1159			size = maxalloc;
1160		}
1161	}
1162	ASSERT3U(size, <=, maxalloc);
1163	ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift));
1164
1165	/*
1166	 * An allocation class might not have any remaining vdevs or space
1167	 */
1168	metaslab_class_t *mc = mg->mg_class;
1169	if (mc->mc_groups == 0)
1170		mc = spa_normal_class(spa);
1171	int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg,
1172	    METASLAB_DONT_THROTTLE, zal, 0);
1173	if (error == ENOSPC && mc != spa_normal_class(spa)) {
1174		error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
1175		    &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0);
1176	}
1177	if (error != 0)
1178		return (error);
1179
1180	/*
1181	 * Determine the ranges that are not actually needed.  Offsets are
1182	 * relative to the start of the range to be copied (i.e. relative to the
1183	 * local variable "start").
1184	 */
1185	range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL,
1186	    0, 0);
1187
1188	zfs_btree_index_t where;
1189	range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
1190	ASSERT3U(rs_get_start(rs, segs), ==, start);
1191	uint64_t prev_seg_end = rs_get_end(rs, segs);
1192	while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) {
1193		if (rs_get_start(rs, segs) >= start + size) {
1194			break;
1195		} else {
1196			range_tree_add(obsolete_segs,
1197			    prev_seg_end - start,
1198			    rs_get_start(rs, segs) - prev_seg_end);
1199		}
1200		prev_seg_end = rs_get_end(rs, segs);
1201	}
1202	/* We don't end in the middle of an obsolete range */
1203	ASSERT3U(start + size, <=, prev_seg_end);
1204
1205	range_tree_clear(segs, start, size);
1206
1207	/*
1208	 * We can't have any padding of the allocated size, otherwise we will
1209	 * misunderstand what's allocated, and the size of the mapping. We
1210	 * prevent padding by ensuring that all devices in the pool have the
1211	 * same ashift, and the allocation size is a multiple of the ashift.
1212	 */
1213	VERIFY3U(DVA_GET_ASIZE(&dst), ==, size);
1214
1215	entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
1216	DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
1217	entry->vime_mapping.vimep_dst = dst;
1218	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
1219		entry->vime_obsolete_count = range_tree_space(obsolete_segs);
1220	}
1221
1222	vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
1223	vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
1224	vcsa->vcsa_obsolete_segs = obsolete_segs;
1225	vcsa->vcsa_spa = spa;
1226	vcsa->vcsa_txg = txg;
1227
1228	/*
1229	 * See comment before spa_vdev_copy_one_child().
1230	 */
1231	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
1232	zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
1233	    spa_vdev_copy_segment_done, vcsa, 0);
1234	vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
1235	if (dest_vd->vdev_ops == &vdev_mirror_ops) {
1236		for (int i = 0; i < dest_vd->vdev_children; i++) {
1237			vdev_t *child = dest_vd->vdev_child[i];
1238			spa_vdev_copy_one_child(vca, nzio, vd, start,
1239			    child, DVA_GET_OFFSET(&dst), i, size);
1240		}
1241	} else {
1242		spa_vdev_copy_one_child(vca, nzio, vd, start,
1243		    dest_vd, DVA_GET_OFFSET(&dst), -1, size);
1244	}
1245	zio_nowait(nzio);
1246
1247	list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
1248	ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
1249	vdev_dirty(vd, 0, NULL, txg);
1250
1251	return (0);
1252}
1253
1254/*
1255 * Complete the removal of a toplevel vdev. This is called as a
1256 * synctask in the same txg that we will sync out the new config (to the
1257 * MOS object) which indicates that this vdev is indirect.
1258 */
1259static void
1260vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
1261{
1262	spa_vdev_removal_t *svr = arg;
1263	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1264	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
1265
1266	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
1267
1268	for (int i = 0; i < TXG_SIZE; i++) {
1269		ASSERT0(svr->svr_bytes_done[i]);
1270	}
1271
1272	ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
1273	    spa->spa_removing_phys.sr_to_copy);
1274
1275	vdev_destroy_spacemaps(vd, tx);
1276
1277	/* destroy leaf zaps, if any */
1278	ASSERT3P(svr->svr_zaplist, !=, NULL);
1279	for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL);
1280	    pair != NULL;
1281	    pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) {
1282		vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx);
1283	}
1284	fnvlist_free(svr->svr_zaplist);
1285
1286	spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx);
1287	/* vd->vdev_path is not available here */
1288	spa_history_log_internal(spa, "vdev remove completed",  tx,
1289	    "%s vdev %llu", spa_name(spa), (u_longlong_t)vd->vdev_id);
1290}
1291
1292static void
1293vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
1294{
1295	ASSERT3P(zlist, !=, NULL);
1296	ASSERT0(vdev_get_nparity(vd));
1297
1298	if (vd->vdev_leaf_zap != 0) {
1299		char zkey[32];
1300		(void) snprintf(zkey, sizeof (zkey), "%s-%llu",
1301		    VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)vd->vdev_leaf_zap);
1302		fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap);
1303	}
1304
1305	for (uint64_t id = 0; id < vd->vdev_children; id++) {
1306		vdev_remove_enlist_zaps(vd->vdev_child[id], zlist);
1307	}
1308}
1309
1310static void
1311vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
1312{
1313	vdev_t *ivd;
1314	dmu_tx_t *tx;
1315	spa_t *spa = vd->vdev_spa;
1316	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
1317
1318	/*
1319	 * First, build a list of leaf zaps to be destroyed.
1320	 * This is passed to the sync context thread,
1321	 * which does the actual unlinking.
1322	 */
1323	svr->svr_zaplist = fnvlist_alloc();
1324	vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
1325
1326	ivd = vdev_add_parent(vd, &vdev_indirect_ops);
1327	ivd->vdev_removing = 0;
1328
1329	vd->vdev_leaf_zap = 0;
1330
1331	vdev_remove_child(ivd, vd);
1332	vdev_compact_children(ivd);
1333
1334	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
1335
1336	mutex_enter(&svr->svr_lock);
1337	svr->svr_thread = NULL;
1338	cv_broadcast(&svr->svr_cv);
1339	mutex_exit(&svr->svr_lock);
1340
1341	/* After this, we can not use svr. */
1342	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1343	dsl_sync_task_nowait(spa->spa_dsl_pool,
1344	    vdev_remove_complete_sync, svr, tx);
1345	dmu_tx_commit(tx);
1346}
1347
1348/*
1349 * Complete the removal of a toplevel vdev. This is called in open
1350 * context by the removal thread after we have copied all vdev's data.
1351 */
1352static void
1353vdev_remove_complete(spa_t *spa)
1354{
1355	uint64_t txg;
1356
1357	/*
1358	 * Wait for any deferred frees to be synced before we call
1359	 * vdev_metaslab_fini()
1360	 */
1361	txg_wait_synced(spa->spa_dsl_pool, 0);
1362	txg = spa_vdev_enter(spa);
1363	vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
1364	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
1365	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
1366	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
1367	vdev_rebuild_stop_wait(vd);
1368	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
1369	uint64_t vdev_space = spa_deflate(spa) ?
1370	    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
1371
1372	sysevent_t *ev = spa_event_create(spa, vd, NULL,
1373	    ESC_ZFS_VDEV_REMOVE_DEV);
1374
1375	zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
1376	    (u_longlong_t)vd->vdev_id, (u_longlong_t)txg);
1377
1378	ASSERT3U(0, !=, vdev_space);
1379	ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);
1380
1381	/* the vdev is no longer part of the dspace */
1382	spa->spa_nonallocating_dspace -= vdev_space;
1383
1384	/*
1385	 * Discard allocation state.
1386	 */
1387	if (vd->vdev_mg != NULL) {
1388		vdev_metaslab_fini(vd);
1389		metaslab_group_destroy(vd->vdev_mg);
1390		vd->vdev_mg = NULL;
1391	}
1392	if (vd->vdev_log_mg != NULL) {
1393		ASSERT0(vd->vdev_ms_count);
1394		metaslab_group_destroy(vd->vdev_log_mg);
1395		vd->vdev_log_mg = NULL;
1396	}
1397	ASSERT0(vd->vdev_stat.vs_space);
1398	ASSERT0(vd->vdev_stat.vs_dspace);
1399
1400	vdev_remove_replace_with_indirect(vd, txg);
1401
1402	/*
1403	 * We now release the locks, allowing spa_sync to run and finish the
1404	 * removal via vdev_remove_complete_sync in syncing context.
1405	 *
1406	 * Note that we hold on to the vdev_t that has been replaced.  Since
1407	 * it isn't part of the vdev tree any longer, it can't be concurrently
1408	 * manipulated, even while we don't have the config lock.
1409	 */
1410	(void) spa_vdev_exit(spa, NULL, txg, 0);
1411
1412	/*
1413	 * Top ZAP should have been transferred to the indirect vdev in
1414	 * vdev_remove_replace_with_indirect.
1415	 */
1416	ASSERT0(vd->vdev_top_zap);
1417
1418	/*
1419	 * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect.
1420	 */
1421	ASSERT0(vd->vdev_leaf_zap);
1422
1423	txg = spa_vdev_enter(spa);
1424	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
1425	/*
1426	 * Request to update the config and the config cachefile.
1427	 */
1428	vdev_config_dirty(spa->spa_root_vdev);
1429	(void) spa_vdev_exit(spa, vd, txg, 0);
1430
1431	if (ev != NULL)
1432		spa_event_post(ev);
1433}
1434
1435/*
1436 * Evacuates a segment of size at most max_alloc from the vdev
1437 * via repeated calls to spa_vdev_copy_segment. If an allocation
1438 * fails, the pool is probably too fragmented to handle such a
1439 * large size, so decrease max_alloc so that the caller will not try
1440 * this size again this txg.
1441 */
1442static void
1443spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
1444    uint64_t *max_alloc, dmu_tx_t *tx)
1445{
1446	uint64_t txg = dmu_tx_get_txg(tx);
1447	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1448
1449	mutex_enter(&svr->svr_lock);
1450
1451	/*
1452	 * Determine how big of a chunk to copy.  We can allocate up
1453	 * to max_alloc bytes, and we can span up to vdev_removal_max_span
1454	 * bytes of unallocated space at a time.  "segs" will track the
1455	 * allocated segments that we are copying.  We may also be copying
1456	 * free segments (of up to vdev_removal_max_span bytes).
1457	 */
1458	range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
1459	for (;;) {
1460		range_tree_t *rt = svr->svr_allocd_segs;
1461		range_seg_t *rs = range_tree_first(rt);
1462
1463		if (rs == NULL)
1464			break;
1465
1466		uint64_t seg_length;
1467
1468		if (range_tree_is_empty(segs)) {
1469			/* need to truncate the first seg based on max_alloc */
1470			seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs,
1471			    rt), *max_alloc);
1472		} else {
1473			if (rs_get_start(rs, rt) - range_tree_max(segs) >
1474			    vdev_removal_max_span) {
1475				/*
1476				 * Including this segment would cause us to
1477				 * copy a larger unneeded chunk than is allowed.
1478				 */
1479				break;
1480			} else if (rs_get_end(rs, rt) - range_tree_min(segs) >
1481			    *max_alloc) {
1482				/*
1483				 * This additional segment would extend past
1484				 * max_alloc. Rather than splitting this
1485				 * segment, leave it for the next mapping.
1486				 */
1487				break;
1488			} else {
1489				seg_length = rs_get_end(rs, rt) -
1490				    rs_get_start(rs, rt);
1491			}
1492		}
1493
1494		range_tree_add(segs, rs_get_start(rs, rt), seg_length);
1495		range_tree_remove(svr->svr_allocd_segs,
1496		    rs_get_start(rs, rt), seg_length);
1497	}
1498
1499	if (range_tree_is_empty(segs)) {
1500		mutex_exit(&svr->svr_lock);
1501		range_tree_destroy(segs);
1502		return;
1503	}
1504
1505	if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
1506		dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
1507		    svr, tx);
1508	}
1509
1510	svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
1511
1512	/*
1513	 * Note: this is the amount of *allocated* space
1514	 * that we are taking care of each txg.
1515	 */
1516	svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
1517
1518	mutex_exit(&svr->svr_lock);
1519
1520	zio_alloc_list_t zal;
1521	metaslab_trace_init(&zal);
1522	uint64_t thismax = SPA_MAXBLOCKSIZE;
1523	while (!range_tree_is_empty(segs)) {
1524		int error = spa_vdev_copy_segment(vd,
1525		    segs, thismax, txg, vca, &zal);
1526
1527		if (error == ENOSPC) {
1528			/*
1529			 * Cut our segment in half, and don't try this
1530			 * segment size again this txg.  Note that the
1531			 * allocation size must be aligned to the highest
1532			 * ashift in the pool, so that the allocation will
1533			 * not be padded out to a multiple of the ashift,
1534			 * which could cause us to think that this mapping
1535			 * is larger than we intended.
1536			 */
1537			ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
1538			ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
1539			uint64_t attempted =
1540			    MIN(range_tree_span(segs), thismax);
1541			thismax = P2ROUNDUP(attempted / 2,
1542			    1 << spa->spa_max_ashift);
1543			/*
1544			 * The minimum-size allocation can not fail.
1545			 */
1546			ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
1547			*max_alloc = attempted - (1 << spa->spa_max_ashift);
1548		} else {
1549			ASSERT0(error);
1550
1551			/*
1552			 * We've performed an allocation, so reset the
1553			 * alloc trace list.
1554			 */
1555			metaslab_trace_fini(&zal);
1556			metaslab_trace_init(&zal);
1557		}
1558	}
1559	metaslab_trace_fini(&zal);
1560	range_tree_destroy(segs);
1561}
1562
1563/*
1564 * The size of each removal mapping is limited by the tunable
1565 * zfs_remove_max_segment, but we must adjust this to be a multiple of the
1566 * pool's ashift, so that we don't try to split individual sectors regardless
1567 * of the tunable value.  (Note that device removal requires that all devices
1568 * have the same ashift, so there's no difference between spa_min_ashift and
1569 * spa_max_ashift.) The raw tunable should not be used elsewhere.
1570 */
1571uint64_t
1572spa_remove_max_segment(spa_t *spa)
1573{
1574	return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift));
1575}
1576
1577/*
1578 * The removal thread operates in open context.  It iterates over all
1579 * allocated space in the vdev, by loading each metaslab's spacemap.
1580 * For each contiguous segment of allocated space (capping the segment
1581 * size at SPA_MAXBLOCKSIZE), we:
1582 *    - Allocate space for it on another vdev.
1583 *    - Create a new mapping from the old location to the new location
1584 *      (as a record in svr_new_segments).
1585 *    - Initiate a physical read zio to get the data off the removing disk.
1586 *    - In the read zio's done callback, initiate a physical write zio to
1587 *      write it to the new vdev.
1588 * Note that all of this will take effect when a particular TXG syncs.
1589 * The sync thread ensures that all the phys reads and writes for the syncing
1590 * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
1591 * (see vdev_mapping_sync()).
1592 */
1593static __attribute__((noreturn)) void
1594spa_vdev_remove_thread(void *arg)
1595{
1596	spa_t *spa = arg;
1597	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
1598	vdev_copy_arg_t vca;
1599	uint64_t max_alloc = spa_remove_max_segment(spa);
1600	uint64_t last_txg = 0;
1601
1602	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
1603	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
1604	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
1605	uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
1606
1607	ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
1608	ASSERT(vdev_is_concrete(vd));
1609	ASSERT(vd->vdev_removing);
1610	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
1611	ASSERT(vim != NULL);
1612
1613	mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
1614	cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
1615	vca.vca_outstanding_bytes = 0;
1616	vca.vca_read_error_bytes = 0;
1617	vca.vca_write_error_bytes = 0;
1618
1619	mutex_enter(&svr->svr_lock);
1620
1621	/*
1622	 * Start from vim_max_offset so we pick up where we left off
1623	 * if we are restarting the removal after opening the pool.
1624	 */
1625	uint64_t msi;
1626	for (msi = start_offset >> vd->vdev_ms_shift;
1627	    msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
1628		metaslab_t *msp = vd->vdev_ms[msi];
1629		ASSERT3U(msi, <=, vd->vdev_ms_count);
1630
1631		ASSERT0(range_tree_space(svr->svr_allocd_segs));
1632
1633		mutex_enter(&msp->ms_sync_lock);
1634		mutex_enter(&msp->ms_lock);
1635
1636		/*
1637		 * Assert nothing in flight -- ms_*tree is empty.
1638		 */
1639		for (int i = 0; i < TXG_SIZE; i++) {
1640			ASSERT0(range_tree_space(msp->ms_allocating[i]));
1641		}
1642
1643		/*
1644		 * If the metaslab has ever been allocated from (ms_sm!=NULL),
1645		 * read the allocated segments from the space map object
1646		 * into svr_allocd_segs. Since we do this while holding
1647		 * svr_lock and ms_sync_lock, concurrent frees (which
1648		 * would have modified the space map) will wait for us
1649		 * to finish loading the spacemap, and then take the
1650		 * appropriate action (see free_from_removing_vdev()).
1651		 */
1652		if (msp->ms_sm != NULL) {
1653			VERIFY0(space_map_load(msp->ms_sm,
1654			    svr->svr_allocd_segs, SM_ALLOC));
1655
1656			range_tree_walk(msp->ms_unflushed_allocs,
1657			    range_tree_add, svr->svr_allocd_segs);
1658			range_tree_walk(msp->ms_unflushed_frees,
1659			    range_tree_remove, svr->svr_allocd_segs);
1660			range_tree_walk(msp->ms_freeing,
1661			    range_tree_remove, svr->svr_allocd_segs);
1662
1663			/*
1664			 * When we are resuming from a paused removal (i.e.
1665			 * when importing a pool with a removal in progress),
1666			 * discard any state that we have already processed.
1667			 */
1668			range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
1669		}
1670		mutex_exit(&msp->ms_lock);
1671		mutex_exit(&msp->ms_sync_lock);
1672
1673		vca.vca_msp = msp;
1674		zfs_dbgmsg("copying %llu segments for metaslab %llu",
1675		    (u_longlong_t)zfs_btree_numnodes(
1676		    &svr->svr_allocd_segs->rt_root),
1677		    (u_longlong_t)msp->ms_id);
1678
1679		while (!svr->svr_thread_exit &&
1680		    !range_tree_is_empty(svr->svr_allocd_segs)) {
1681
1682			mutex_exit(&svr->svr_lock);
1683
1684			/*
1685			 * We need to periodically drop the config lock so that
1686			 * writers can get in.  Additionally, we can't wait
1687			 * for a txg to sync while holding a config lock
1688			 * (since a waiting writer could cause a 3-way deadlock
1689			 * with the sync thread, which also gets a config
1690			 * lock for reader).  So we can't hold the config lock
1691			 * while calling dmu_tx_assign().
1692			 */
1693			spa_config_exit(spa, SCL_CONFIG, FTAG);
1694
1695			/*
1696			 * This delay will pause the removal around the point
1697			 * specified by zfs_removal_suspend_progress. We do this
1698			 * solely from the test suite or during debugging.
1699			 */
1700			while (zfs_removal_suspend_progress &&
1701			    !svr->svr_thread_exit)
1702				delay(hz);
1703
1704			mutex_enter(&vca.vca_lock);
1705			while (vca.vca_outstanding_bytes >
1706			    zfs_remove_max_copy_bytes) {
1707				cv_wait(&vca.vca_cv, &vca.vca_lock);
1708			}
1709			mutex_exit(&vca.vca_lock);
1710
1711			dmu_tx_t *tx =
1712			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
1713
1714			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
1715			uint64_t txg = dmu_tx_get_txg(tx);
1716
1717			/*
1718			 * Reacquire the vdev_config lock.  The vdev_t
1719			 * that we're removing may have changed, e.g. due
1720			 * to a vdev_attach or vdev_detach.
1721			 */
1722			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
1723			vd = vdev_lookup_top(spa, svr->svr_vdev_id);
1724
1725			if (txg != last_txg)
1726				max_alloc = spa_remove_max_segment(spa);
1727			last_txg = txg;
1728
1729			spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
1730
1731			dmu_tx_commit(tx);
1732			mutex_enter(&svr->svr_lock);
1733		}
1734
1735		mutex_enter(&vca.vca_lock);
1736		if (zfs_removal_ignore_errors == 0 &&
1737		    (vca.vca_read_error_bytes > 0 ||
1738		    vca.vca_write_error_bytes > 0)) {
1739			svr->svr_thread_exit = B_TRUE;
1740		}
1741		mutex_exit(&vca.vca_lock);
1742	}
1743
1744	mutex_exit(&svr->svr_lock);
1745
1746	spa_config_exit(spa, SCL_CONFIG, FTAG);
1747
1748	/*
1749	 * Wait for all copies to finish before cleaning up the vca.
1750	 */
1751	txg_wait_synced(spa->spa_dsl_pool, 0);
1752	ASSERT0(vca.vca_outstanding_bytes);
1753
1754	mutex_destroy(&vca.vca_lock);
1755	cv_destroy(&vca.vca_cv);
1756
1757	if (svr->svr_thread_exit) {
1758		mutex_enter(&svr->svr_lock);
1759		range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
1760		svr->svr_thread = NULL;
1761		cv_broadcast(&svr->svr_cv);
1762		mutex_exit(&svr->svr_lock);
1763
1764		/*
1765		 * During the removal process an unrecoverable read or write
1766		 * error was encountered.  The removal process must be
1767		 * cancelled or this damage may become permanent.
1768		 */
1769		if (zfs_removal_ignore_errors == 0 &&
1770		    (vca.vca_read_error_bytes > 0 ||
1771		    vca.vca_write_error_bytes > 0)) {
1772			zfs_dbgmsg("canceling removal due to IO errors: "
1773			    "[read_error_bytes=%llu] [write_error_bytes=%llu]",
1774			    (u_longlong_t)vca.vca_read_error_bytes,
1775			    (u_longlong_t)vca.vca_write_error_bytes);
1776			spa_vdev_remove_cancel_impl(spa);
1777		}
1778	} else {
1779		ASSERT0(range_tree_space(svr->svr_allocd_segs));
1780		vdev_remove_complete(spa);
1781	}
1782
1783	thread_exit();
1784}
1785
1786void
1787spa_vdev_remove_suspend(spa_t *spa)
1788{
1789	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
1790
1791	if (svr == NULL)
1792		return;
1793
1794	mutex_enter(&svr->svr_lock);
1795	svr->svr_thread_exit = B_TRUE;
1796	while (svr->svr_thread != NULL)
1797		cv_wait(&svr->svr_cv, &svr->svr_lock);
1798	svr->svr_thread_exit = B_FALSE;
1799	mutex_exit(&svr->svr_lock);
1800}
1801
1802/*
1803 * Return true if the "allocating" property has been set to "off"
1804 */
1805static boolean_t
1806vdev_prop_allocating_off(vdev_t *vd)
1807{
1808	uint64_t objid = vd->vdev_top_zap;
1809	uint64_t allocating = 1;
1810
1811	/* no vdev property object => no props */
1812	if (objid != 0) {
1813		spa_t *spa = vd->vdev_spa;
1814		objset_t *mos = spa->spa_meta_objset;
1815
1816		mutex_enter(&spa->spa_props_lock);
1817		(void) zap_lookup(mos, objid, "allocating", sizeof (uint64_t),
1818		    1, &allocating);
1819		mutex_exit(&spa->spa_props_lock);
1820	}
1821	return (allocating == 0);
1822}
1823
1824static int
1825spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
1826{
1827	(void) arg;
1828	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1829
1830	if (spa->spa_vdev_removal == NULL)
1831		return (ENOTACTIVE);
1832	return (0);
1833}
1834
1835/*
1836 * Cancel a removal by freeing all entries from the partial mapping
1837 * and marking the vdev as no longer being removing.
1838 */
1839static void
1840spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
1841{
1842	(void) arg;
1843	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1844	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
1845	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
1846	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
1847	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
1848	objset_t *mos = spa->spa_meta_objset;
1849
1850	ASSERT3P(svr->svr_thread, ==, NULL);
1851
1852	spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
1853
1854	boolean_t are_precise;
1855	VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
1856	if (are_precise) {
1857		spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
1858		VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
1859		    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
1860	}
1861
1862	uint64_t obsolete_sm_object;
1863	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
1864	if (obsolete_sm_object != 0) {
1865		ASSERT(vd->vdev_obsolete_sm != NULL);
1866		ASSERT3U(obsolete_sm_object, ==,
1867		    space_map_object(vd->vdev_obsolete_sm));
1868
1869		space_map_free(vd->vdev_obsolete_sm, tx);
1870		VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
1871		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
1872		space_map_close(vd->vdev_obsolete_sm);
1873		vd->vdev_obsolete_sm = NULL;
1874		spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
1875	}
1876	for (int i = 0; i < TXG_SIZE; i++) {
1877		ASSERT(list_is_empty(&svr->svr_new_segments[i]));
1878		ASSERT3U(svr->svr_max_offset_to_sync[i], <=,
1879		    vdev_indirect_mapping_max_offset(vim));
1880	}
1881
1882	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
1883		metaslab_t *msp = vd->vdev_ms[msi];
1884
1885		if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
1886			break;
1887
1888		ASSERT0(range_tree_space(svr->svr_allocd_segs));
1889
1890		mutex_enter(&msp->ms_lock);
1891
1892		/*
1893		 * Assert nothing in flight -- ms_*tree is empty.
1894		 */
1895		for (int i = 0; i < TXG_SIZE; i++)
1896			ASSERT0(range_tree_space(msp->ms_allocating[i]));
1897		for (int i = 0; i < TXG_DEFER_SIZE; i++)
1898			ASSERT0(range_tree_space(msp->ms_defer[i]));
1899		ASSERT0(range_tree_space(msp->ms_freed));
1900
1901		if (msp->ms_sm != NULL) {
1902			mutex_enter(&svr->svr_lock);
1903			VERIFY0(space_map_load(msp->ms_sm,
1904			    svr->svr_allocd_segs, SM_ALLOC));
1905
1906			range_tree_walk(msp->ms_unflushed_allocs,
1907			    range_tree_add, svr->svr_allocd_segs);
1908			range_tree_walk(msp->ms_unflushed_frees,
1909			    range_tree_remove, svr->svr_allocd_segs);
1910			range_tree_walk(msp->ms_freeing,
1911			    range_tree_remove, svr->svr_allocd_segs);
1912
1913			/*
1914			 * Clear everything past what has been synced,
1915			 * because we have not allocated mappings for it yet.
1916			 */
1917			uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
1918			uint64_t sm_end = msp->ms_sm->sm_start +
1919			    msp->ms_sm->sm_size;
1920			if (sm_end > syncd)
1921				range_tree_clear(svr->svr_allocd_segs,
1922				    syncd, sm_end - syncd);
1923
1924			mutex_exit(&svr->svr_lock);
1925		}
1926		mutex_exit(&msp->ms_lock);
1927
1928		mutex_enter(&svr->svr_lock);
1929		range_tree_vacate(svr->svr_allocd_segs,
1930		    free_mapped_segment_cb, vd);
1931		mutex_exit(&svr->svr_lock);
1932	}
1933
1934	/*
1935	 * Note: this must happen after we invoke free_mapped_segment_cb,
1936	 * because it adds to the obsolete_segments.
1937	 */
1938	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
1939
1940	ASSERT3U(vic->vic_mapping_object, ==,
1941	    vdev_indirect_mapping_object(vd->vdev_indirect_mapping));
1942	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
1943	vd->vdev_indirect_mapping = NULL;
1944	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
1945	vic->vic_mapping_object = 0;
1946
1947	ASSERT3U(vic->vic_births_object, ==,
1948	    vdev_indirect_births_object(vd->vdev_indirect_births));
1949	vdev_indirect_births_close(vd->vdev_indirect_births);
1950	vd->vdev_indirect_births = NULL;
1951	vdev_indirect_births_free(mos, vic->vic_births_object, tx);
1952	vic->vic_births_object = 0;
1953
1954	/*
1955	 * We may have processed some frees from the removing vdev in this
1956	 * txg, thus increasing svr_bytes_done; discard that here to
1957	 * satisfy the assertions in spa_vdev_removal_destroy().
1958	 * Note that future txg's can not have any bytes_done, because
1959	 * future TXG's are only modified from open context, and we have
1960	 * already shut down the copying thread.
1961	 */
1962	svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0;
1963	spa_finish_removal(spa, DSS_CANCELED, tx);
1964
1965	vd->vdev_removing = B_FALSE;
1966
1967	if (!vdev_prop_allocating_off(vd)) {
1968		spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
1969		vdev_activate(vd);
1970		spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
1971	}
1972
1973	vdev_config_dirty(vd);
1974
1975	zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
1976	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx));
1977	spa_history_log_internal(spa, "vdev remove canceled", tx,
1978	    "%s vdev %llu %s", spa_name(spa),
1979	    (u_longlong_t)vd->vdev_id,
1980	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");
1981}
1982
1983static int
1984spa_vdev_remove_cancel_impl(spa_t *spa)
1985{
1986	int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
1987	    spa_vdev_remove_cancel_sync, NULL, 0,
1988	    ZFS_SPACE_CHECK_EXTRA_RESERVED);
1989	return (error);
1990}
1991
1992int
1993spa_vdev_remove_cancel(spa_t *spa)
1994{
1995	spa_vdev_remove_suspend(spa);
1996
1997	if (spa->spa_vdev_removal == NULL)
1998		return (ENOTACTIVE);
1999
2000	return (spa_vdev_remove_cancel_impl(spa));
2001}
2002
2003void
2004svr_sync(spa_t *spa, dmu_tx_t *tx)
2005{
2006	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
2007	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
2008
2009	if (svr == NULL)
2010		return;
2011
2012	/*
2013	 * This check is necessary so that we do not dirty the
2014	 * DIRECTORY_OBJECT via spa_sync_removing_state() when there
2015	 * is nothing to do.  Dirtying it every time would prevent us
2016	 * from syncing-to-convergence.
2017	 */
2018	if (svr->svr_bytes_done[txgoff] == 0)
2019		return;
2020
2021	/*
2022	 * Update progress accounting.
2023	 */
2024	spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff];
2025	svr->svr_bytes_done[txgoff] = 0;
2026
2027	spa_sync_removing_state(spa, tx);
2028}
2029
2030static void
2031vdev_remove_make_hole_and_free(vdev_t *vd)
2032{
2033	uint64_t id = vd->vdev_id;
2034	spa_t *spa = vd->vdev_spa;
2035	vdev_t *rvd = spa->spa_root_vdev;
2036
2037	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2038	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2039
2040	vdev_free(vd);
2041
2042	vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
2043	vdev_add_child(rvd, vd);
2044	vdev_config_dirty(rvd);
2045
2046	/*
2047	 * Reassess the health of our root vdev.
2048	 */
2049	vdev_reopen(rvd);
2050}
2051
2052/*
2053 * Remove a log device.  The config lock is held for the specified TXG.
2054 */
2055static int
2056spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
2057{
2058	metaslab_group_t *mg = vd->vdev_mg;
2059	spa_t *spa = vd->vdev_spa;
2060	int error = 0;
2061
2062	ASSERT(vd->vdev_islog);
2063	ASSERT(vd == vd->vdev_top);
2064	ASSERT3P(vd->vdev_log_mg, ==, NULL);
2065	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2066
2067	/*
2068	 * Stop allocating from this vdev.
2069	 */
2070	metaslab_group_passivate(mg);
2071
2072	/*
2073	 * Wait for the youngest allocations and frees to sync,
2074	 * and then wait for the deferral of those frees to finish.
2075	 */
2076	spa_vdev_config_exit(spa, NULL,
2077	    *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
2078
2079	/*
2080	 * Cancel any initialize or TRIM which was in progress.
2081	 */
2082	vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
2083	vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED);
2084	vdev_autotrim_stop_wait(vd);
2085
2086	/*
2087	 * Evacuate the device.  We don't hold the config lock as
2088	 * writer since we need to do I/O but we do keep the
2089	 * spa_namespace_lock held.  Once this completes the device
2090	 * should no longer have any blocks allocated on it.
2091	 */
2092	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2093	if (vd->vdev_stat.vs_alloc != 0)
2094		error = spa_reset_logs(spa);
2095
2096	*txg = spa_vdev_config_enter(spa);
2097
2098	if (error != 0) {
2099		metaslab_group_activate(mg);
2100		ASSERT3P(vd->vdev_log_mg, ==, NULL);
2101		return (error);
2102	}
2103	ASSERT0(vd->vdev_stat.vs_alloc);
2104
2105	/*
2106	 * The evacuation succeeded.  Remove any remaining MOS metadata
2107	 * associated with this vdev, and wait for these changes to sync.
2108	 */
2109	vd->vdev_removing = B_TRUE;
2110
2111	vdev_dirty_leaves(vd, VDD_DTL, *txg);
2112	vdev_config_dirty(vd);
2113
2114	/*
2115	 * When the log space map feature is enabled we look at
2116	 * the vdev's top_zap to find the on-disk flush data of
2117	 * the metaslab we just flushed. Thus, while removing a
2118	 * log vdev we make sure to call vdev_metaslab_fini()
2119	 * first, which removes all metaslabs of this vdev from
2120	 * spa_metaslabs_by_flushed before vdev_remove_empty()
2121	 * destroys the top_zap of this log vdev.
2122	 *
2123	 * This avoids the scenario where we flush a metaslab
2124	 * from the log vdev being removed that doesn't have a
2125	 * top_zap and end up failing to lookup its on-disk flush
2126	 * data.
2127	 *
2128	 * We don't call metaslab_group_destroy() right away
2129	 * though (it will be called in vdev_free() later) as
2130	 * during metaslab_sync() of metaslabs from other vdevs
2131	 * we may touch the metaslab group of this vdev through
2132	 * metaslab_class_histogram_verify()
2133	 */
2134	vdev_metaslab_fini(vd);
2135
2136	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
2137	*txg = spa_vdev_config_enter(spa);
2138
2139	sysevent_t *ev = spa_event_create(spa, vd, NULL,
2140	    ESC_ZFS_VDEV_REMOVE_DEV);
2141	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2142	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2143
2144	/* The top ZAP should have been destroyed by vdev_remove_empty. */
2145	ASSERT0(vd->vdev_top_zap);
2146	/* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
2147	ASSERT0(vd->vdev_leaf_zap);
2148
2149	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
2150
2151	if (list_link_active(&vd->vdev_state_dirty_node))
2152		vdev_state_clean(vd);
2153	if (list_link_active(&vd->vdev_config_dirty_node))
2154		vdev_config_clean(vd);
2155
2156	ASSERT0(vd->vdev_stat.vs_alloc);
2157
2158	/*
2159	 * Clean up the vdev namespace.
2160	 */
2161	vdev_remove_make_hole_and_free(vd);
2162
2163	if (ev != NULL)
2164		spa_event_post(ev);
2165
2166	return (0);
2167}
2168
2169static int
2170spa_vdev_remove_top_check(vdev_t *vd)
2171{
2172	spa_t *spa = vd->vdev_spa;
2173
2174	if (vd != vd->vdev_top)
2175		return (SET_ERROR(ENOTSUP));
2176
2177	if (!vdev_is_concrete(vd))
2178		return (SET_ERROR(ENOTSUP));
2179
2180	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
2181		return (SET_ERROR(ENOTSUP));
2182
2183	/*
2184	 * This device is already being removed
2185	 */
2186	if (vd->vdev_removing)
2187		return (SET_ERROR(EALREADY));
2188
2189	metaslab_class_t *mc = vd->vdev_mg->mg_class;
2190	metaslab_class_t *normal = spa_normal_class(spa);
2191	if (mc != normal) {
2192		/*
2193		 * Space allocated from the special (or dedup) class is
2194		 * included in the DMU's space usage, but it's not included
2195		 * in spa_dspace (or dsl_pool_adjustedsize()).  Therefore
2196		 * there is always at least as much free space in the normal
2197		 * class, as is allocated from the special (and dedup) class.
2198		 * As a backup check, we will return ENOSPC if this is
2199		 * violated. See also spa_update_dspace().
2200		 */
2201		uint64_t available = metaslab_class_get_space(normal) -
2202		    metaslab_class_get_alloc(normal);
2203		ASSERT3U(available, >=, vd->vdev_stat.vs_alloc);
2204		if (available < vd->vdev_stat.vs_alloc)
2205			return (SET_ERROR(ENOSPC));
2206	} else if (!vd->vdev_noalloc) {
2207		/* available space in the pool's normal class */
2208		uint64_t available = dsl_dir_space_available(
2209		    spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
2210		if (available < vd->vdev_stat.vs_dspace)
2211			return (SET_ERROR(ENOSPC));
2212	}
2213
2214	/*
2215	 * There can not be a removal in progress.
2216	 */
2217	if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
2218		return (SET_ERROR(EBUSY));
2219
2220	/*
2221	 * The device must have all its data.
2222	 */
2223	if (!vdev_dtl_empty(vd, DTL_MISSING) ||
2224	    !vdev_dtl_empty(vd, DTL_OUTAGE))
2225		return (SET_ERROR(EBUSY));
2226
2227	/*
2228	 * The device must be healthy.
2229	 */
2230	if (!vdev_readable(vd))
2231		return (SET_ERROR(EIO));
2232
2233	/*
2234	 * All vdevs in normal class must have the same ashift.
2235	 */
2236	if (spa->spa_max_ashift != spa->spa_min_ashift) {
2237		return (SET_ERROR(EINVAL));
2238	}
2239
2240	/*
2241	 * A removed special/dedup vdev must have same ashift as normal class.
2242	 */
2243	ASSERT(!vd->vdev_islog);
2244	if (vd->vdev_alloc_bias != VDEV_BIAS_NONE &&
2245	    vd->vdev_ashift != spa->spa_max_ashift) {
2246		return (SET_ERROR(EINVAL));
2247	}
2248
2249	/*
2250	 * All vdevs in normal class must have the same ashift
2251	 * and not be raidz or draid.
2252	 */
2253	vdev_t *rvd = spa->spa_root_vdev;
2254	for (uint64_t id = 0; id < rvd->vdev_children; id++) {
2255		vdev_t *cvd = rvd->vdev_child[id];
2256
2257		/*
2258		 * A removed special/dedup vdev must have the same ashift
2259		 * across all vdevs in its class.
2260		 */
2261		if (vd->vdev_alloc_bias != VDEV_BIAS_NONE &&
2262		    cvd->vdev_alloc_bias == vd->vdev_alloc_bias &&
2263		    cvd->vdev_ashift != vd->vdev_ashift) {
2264			return (SET_ERROR(EINVAL));
2265		}
2266		if (cvd->vdev_ashift != 0 &&
2267		    cvd->vdev_alloc_bias == VDEV_BIAS_NONE)
2268			ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
2269		if (!vdev_is_concrete(cvd))
2270			continue;
2271		if (vdev_get_nparity(cvd) != 0)
2272			return (SET_ERROR(EINVAL));
2273		/*
2274		 * Need the mirror to be mirror of leaf vdevs only
2275		 */
2276		if (cvd->vdev_ops == &vdev_mirror_ops) {
2277			for (uint64_t cid = 0;
2278			    cid < cvd->vdev_children; cid++) {
2279				if (!cvd->vdev_child[cid]->vdev_ops->
2280				    vdev_op_leaf)
2281					return (SET_ERROR(EINVAL));
2282			}
2283		}
2284	}
2285
2286	return (0);
2287}
2288
2289/*
2290 * Initiate removal of a top-level vdev, reducing the total space in the pool.
2291 * The config lock is held for the specified TXG.  Once initiated,
2292 * evacuation of all allocated space (copying it to other vdevs) happens
2293 * in the background (see spa_vdev_remove_thread()), and can be canceled
2294 * (see spa_vdev_remove_cancel()).  If successful, the vdev will
2295 * be transformed to an indirect vdev (see spa_vdev_remove_complete()).
2296 */
2297static int
2298spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
2299{
2300	spa_t *spa = vd->vdev_spa;
2301	boolean_t set_noalloc = B_FALSE;
2302	int error;
2303
2304	/*
2305	 * Check for errors up-front, so that we don't waste time
2306	 * passivating the metaslab group and clearing the ZIL if there
2307	 * are errors.
2308	 */
2309	error = spa_vdev_remove_top_check(vd);
2310
2311	/*
2312	 * Stop allocating from this vdev.  Note that we must check
2313	 * that this is not the only device in the pool before
2314	 * passivating, otherwise we will not be able to make
2315	 * progress because we can't allocate from any vdevs.
2316	 * The above check for sufficient free space serves this
2317	 * purpose.
2318	 */
2319	if (error == 0 && !vd->vdev_noalloc) {
2320		set_noalloc = B_TRUE;
2321		error = vdev_passivate(vd, txg);
2322	}
2323
2324	if (error != 0)
2325		return (error);
2326
2327	/*
2328	 * We stop any initializing and TRIM that is currently in progress
2329	 * but leave the state as "active". This will allow the process to
2330	 * resume if the removal is canceled sometime later.
2331	 */
2332
2333	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
2334
2335	vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
2336	vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE);
2337	vdev_autotrim_stop_wait(vd);
2338
2339	*txg = spa_vdev_config_enter(spa);
2340
2341	/*
2342	 * Things might have changed while the config lock was dropped
2343	 * (e.g. space usage).  Check for errors again.
2344	 */
2345	error = spa_vdev_remove_top_check(vd);
2346
2347	if (error != 0) {
2348		if (set_noalloc)
2349			vdev_activate(vd);
2350		spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
2351		spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
2352		spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
2353		return (error);
2354	}
2355
2356	vd->vdev_removing = B_TRUE;
2357
2358	vdev_dirty_leaves(vd, VDD_DTL, *txg);
2359	vdev_config_dirty(vd);
2360	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
2361	dsl_sync_task_nowait(spa->spa_dsl_pool,
2362	    vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx);
2363	dmu_tx_commit(tx);
2364
2365	return (0);
2366}
2367
2368/*
2369 * Remove a device from the pool.
2370 *
2371 * Removing a device from the vdev namespace requires several steps
2372 * and can take a significant amount of time.  As a result we use
2373 * the spa_vdev_config_[enter/exit] functions which allow us to
2374 * grab and release the spa_config_lock while still holding the namespace
2375 * lock.  During each step the configuration is synced out.
2376 */
2377int
2378spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
2379{
2380	vdev_t *vd;
2381	nvlist_t **spares, **l2cache, *nv;
2382	uint64_t txg = 0;
2383	uint_t nspares, nl2cache;
2384	int error = 0, error_log;
2385	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
2386	sysevent_t *ev = NULL;
2387	const char *vd_type = NULL;
2388	char *vd_path = NULL;
2389
2390	ASSERT(spa_writeable(spa));
2391
2392	if (!locked)
2393		txg = spa_vdev_enter(spa);
2394
2395	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2396	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
2397		error = (spa_has_checkpoint(spa)) ?
2398		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
2399
2400		if (!locked)
2401			return (spa_vdev_exit(spa, NULL, txg, error));
2402
2403		return (error);
2404	}
2405
2406	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
2407
2408	if (spa->spa_spares.sav_vdevs != NULL &&
2409	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2410	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
2411	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
2412		/*
2413		 * Only remove the hot spare if it's not currently in use
2414		 * in this pool.
2415		 */
2416		if (vd == NULL || unspare) {
2417			const char *type;
2418			boolean_t draid_spare = B_FALSE;
2419
2420			if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type)
2421			    == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
2422				draid_spare = B_TRUE;
2423
2424			if (vd == NULL && draid_spare) {
2425				error = SET_ERROR(ENOTSUP);
2426			} else {
2427				if (vd == NULL)
2428					vd = spa_lookup_by_guid(spa,
2429					    guid, B_TRUE);
2430				ev = spa_event_create(spa, vd, NULL,
2431				    ESC_ZFS_VDEV_REMOVE_AUX);
2432
2433				vd_type = VDEV_TYPE_SPARE;
2434				vd_path = spa_strdup(fnvlist_lookup_string(
2435				    nv, ZPOOL_CONFIG_PATH));
2436				spa_vdev_remove_aux(spa->spa_spares.sav_config,
2437				    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
2438				spa_load_spares(spa);
2439				spa->spa_spares.sav_sync = B_TRUE;
2440			}
2441		} else {
2442			error = SET_ERROR(EBUSY);
2443		}
2444	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
2445	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
2446	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
2447	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
2448		vd_type = VDEV_TYPE_L2CACHE;
2449		vd_path = spa_strdup(fnvlist_lookup_string(
2450		    nv, ZPOOL_CONFIG_PATH));
2451		/*
2452		 * Cache devices can always be removed.
2453		 */
2454		vd = spa_lookup_by_guid(spa, guid, B_TRUE);
2455
2456		/*
2457		 * Stop trimming the cache device. We need to release the
2458		 * config lock to allow the syncing of TRIM transactions
2459		 * without releasing the spa_namespace_lock. The same
2460		 * strategy is employed in spa_vdev_remove_top().
2461		 */
2462		spa_vdev_config_exit(spa, NULL,
2463		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
2464		mutex_enter(&vd->vdev_trim_lock);
2465		vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
2466		mutex_exit(&vd->vdev_trim_lock);
2467		txg = spa_vdev_config_enter(spa);
2468
2469		ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
2470		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
2471		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
2472		spa_load_l2cache(spa);
2473		spa->spa_l2cache.sav_sync = B_TRUE;
2474	} else if (vd != NULL && vd->vdev_islog) {
2475		ASSERT(!locked);
2476		vd_type = VDEV_TYPE_LOG;
2477		vd_path = spa_strdup((vd->vdev_path != NULL) ?
2478		    vd->vdev_path : "-");
2479		error = spa_vdev_remove_log(vd, &txg);
2480	} else if (vd != NULL) {
2481		ASSERT(!locked);
2482		error = spa_vdev_remove_top(vd, &txg);
2483	} else {
2484		/*
2485		 * There is no vdev of any kind with the specified guid.
2486		 */
2487		error = SET_ERROR(ENOENT);
2488	}
2489
2490	error_log = error;
2491
2492	if (!locked)
2493		error = spa_vdev_exit(spa, NULL, txg, error);
2494
2495	/*
2496	 * Logging must be done outside the spa config lock. Otherwise,
2497	 * this code path could end up holding the spa config lock while
2498	 * waiting for a txg_sync so it can write to the internal log.
2499	 * Doing that would prevent the txg sync from actually happening,
2500	 * causing a deadlock.
2501	 */
2502	if (error_log == 0 && vd_type != NULL && vd_path != NULL) {
2503		spa_history_log_internal(spa, "vdev remove", NULL,
2504		    "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path);
2505	}
2506	if (vd_path != NULL)
2507		spa_strfree(vd_path);
2508
2509	if (ev != NULL)
2510		spa_event_post(ev);
2511
2512	return (error);
2513}
2514
2515int
2516spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
2517{
2518	prs->prs_state = spa->spa_removing_phys.sr_state;
2519
2520	if (prs->prs_state == DSS_NONE)
2521		return (SET_ERROR(ENOENT));
2522
2523	prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev;
2524	prs->prs_start_time = spa->spa_removing_phys.sr_start_time;
2525	prs->prs_end_time = spa->spa_removing_phys.sr_end_time;
2526	prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
2527	prs->prs_copied = spa->spa_removing_phys.sr_copied;
2528
2529	prs->prs_mapping_memory = 0;
2530	uint64_t indirect_vdev_id =
2531	    spa->spa_removing_phys.sr_prev_indirect_vdev;
2532	while (indirect_vdev_id != -1) {
2533		vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id];
2534		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
2535		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
2536
2537		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
2538		prs->prs_mapping_memory += vdev_indirect_mapping_size(vim);
2539		indirect_vdev_id = vic->vic_prev_indirect_vdev;
2540	}
2541
2542	return (0);
2543}
2544
2545ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW,
2546	"Ignore hard IO errors when removing device");
2547
2548ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, UINT, ZMOD_RW,
2549	"Largest contiguous segment to allocate when removing device");
2550
2551ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, UINT, ZMOD_RW,
2552	"Largest span of free chunks a remap segment can span");
2553
2554/* BEGIN CSTYLED */
2555ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, UINT, ZMOD_RW,
2556	"Pause device removal after this many bytes are copied "
2557	"(debug use only - causes removal to hang)");
2558/* END CSTYLED */
2559
2560EXPORT_SYMBOL(free_from_removing_vdev);
2561EXPORT_SYMBOL(spa_removal_get_stats);
2562EXPORT_SYMBOL(spa_remove_init);
2563EXPORT_SYMBOL(spa_restart_removal);
2564EXPORT_SYMBOL(spa_vdev_removal_destroy);
2565EXPORT_SYMBOL(spa_vdev_remove);
2566EXPORT_SYMBOL(spa_vdev_remove_cancel);
2567EXPORT_SYMBOL(spa_vdev_remove_suspend);
2568EXPORT_SYMBOL(svr_sync);
2569