1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2017, Intel Corporation.
27 */
28
29#include <sys/zfs_context.h>
30#include <sys/dmu.h>
31#include <sys/dmu_tx.h>
32#include <sys/space_map.h>
33#include <sys/metaslab_impl.h>
34#include <sys/vdev_impl.h>
35#include <sys/vdev_draid.h>
36#include <sys/zio.h>
37#include <sys/spa_impl.h>
38#include <sys/zfeature.h>
39#include <sys/vdev_indirect_mapping.h>
40#include <sys/zap.h>
41#include <sys/btree.h>
42
43#define	GANG_ALLOCATION(flags) \
44	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
45
46/*
47 * Metaslab granularity, in bytes. This is roughly similar to what would be
48 * referred to as the "stripe size" in traditional RAID arrays. In normal
49 * operation, we will try to write this amount of data to each disk before
50 * moving on to the next top-level vdev.
51 */
52static uint64_t metaslab_aliquot = 1024 * 1024;
53
54/*
55 * For testing, make some blocks above a certain size be gang blocks.
56 */
57uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
58
59/*
60 * Of blocks of size >= metaslab_force_ganging, actually gang them this often.
61 */
62uint_t metaslab_force_ganging_pct = 3;
63
64/*
65 * In pools where the log space map feature is not enabled we touch
66 * multiple metaslabs (and their respective space maps) with each
67 * transaction group. Thus, we benefit from having a small space map
68 * block size since it allows us to issue more I/O operations scattered
69 * around the disk. So a sane default for the space map block size
70 * is 8~16K.
71 */
72int zfs_metaslab_sm_blksz_no_log = (1 << 14);
73
74/*
75 * When the log space map feature is enabled, we accumulate a lot of
76 * changes per metaslab that are flushed once in a while so we benefit
77 * from a bigger block size like 128K for the metaslab space maps.
78 */
79int zfs_metaslab_sm_blksz_with_log = (1 << 17);
80
81/*
82 * The in-core space map representation is more compact than its on-disk form.
83 * The zfs_condense_pct determines how much more compact the in-core
84 * space map representation must be before we compact it on-disk.
85 * Values should be greater than or equal to 100.
86 */
87uint_t zfs_condense_pct = 200;
88
89/*
90 * Condensing a metaslab is not guaranteed to actually reduce the amount of
91 * space used on disk. In particular, a space map uses data in increments of
92 * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
93 * same number of blocks after condensing. Since the goal of condensing is to
94 * reduce the number of IOPs required to read the space map, we only want to
95 * condense when we can be sure we will reduce the number of blocks used by the
96 * space map. Unfortunately, we cannot precisely compute whether or not this is
97 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
98 * we apply the following heuristic: do not condense a spacemap unless the
99 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
100 * blocks.
101 */
102static const int zfs_metaslab_condense_block_threshold = 4;
103
104/*
105 * The zfs_mg_noalloc_threshold defines which metaslab groups should
106 * be eligible for allocation. The value is defined as a percentage of
107 * free space. Metaslab groups that have more free space than
108 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
109 * a metaslab group's free space is less than or equal to the
110 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
111 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
112 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
113 * groups are allowed to accept allocations. Gang blocks are always
114 * eligible to allocate on any metaslab group. The default value of 0 means
115 * no metaslab group will be excluded based on this criterion.
116 */
117static uint_t zfs_mg_noalloc_threshold = 0;
118
119/*
120 * Metaslab groups are considered eligible for allocations if their
121 * fragmentation metric (measured as a percentage) is less than or
122 * equal to zfs_mg_fragmentation_threshold. If a metaslab group
123 * exceeds this threshold then it will be skipped unless all metaslab
124 * groups within the metaslab class have also crossed this threshold.
125 *
126 * This tunable was introduced to avoid edge cases where we continue
127 * allocating from very fragmented disks in our pool while other, less
128 * fragmented disks, exists. On the other hand, if all disks in the
129 * pool are uniformly approaching the threshold, the threshold can
130 * be a speed bump in performance, where we keep switching the disks
131 * that we allocate from (e.g. we allocate some segments from disk A
132 * making it bypassing the threshold while freeing segments from disk
133 * B getting its fragmentation below the threshold).
134 *
135 * Empirically, we've seen that our vdev selection for allocations is
136 * good enough that fragmentation increases uniformly across all vdevs
137 * the majority of the time. Thus we set the threshold percentage high
138 * enough to avoid hitting the speed bump on pools that are being pushed
139 * to the edge.
140 */
141static uint_t zfs_mg_fragmentation_threshold = 95;
142
143/*
144 * Allow metaslabs to keep their active state as long as their fragmentation
145 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
146 * active metaslab that exceeds this threshold will no longer keep its active
147 * status allowing better metaslabs to be selected.
148 */
149static uint_t zfs_metaslab_fragmentation_threshold = 70;
150
151/*
152 * When set will load all metaslabs when pool is first opened.
153 */
154int metaslab_debug_load = B_FALSE;
155
156/*
157 * When set will prevent metaslabs from being unloaded.
158 */
159static int metaslab_debug_unload = B_FALSE;
160
161/*
162 * Minimum size which forces the dynamic allocator to change
163 * it's allocation strategy.  Once the space map cannot satisfy
164 * an allocation of this size then it switches to using more
165 * aggressive strategy (i.e search by size rather than offset).
166 */
167uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
168
169/*
170 * The minimum free space, in percent, which must be available
171 * in a space map to continue allocations in a first-fit fashion.
172 * Once the space map's free space drops below this level we dynamically
173 * switch to using best-fit allocations.
174 */
175uint_t metaslab_df_free_pct = 4;
176
177/*
178 * Maximum distance to search forward from the last offset. Without this
179 * limit, fragmented pools can see >100,000 iterations and
180 * metaslab_block_picker() becomes the performance limiting factor on
181 * high-performance storage.
182 *
183 * With the default setting of 16MB, we typically see less than 500
184 * iterations, even with very fragmented, ashift=9 pools. The maximum number
185 * of iterations possible is:
186 *     metaslab_df_max_search / (2 * (1<<ashift))
187 * With the default setting of 16MB this is 16*1024 (with ashift=9) or
188 * 2048 (with ashift=12).
189 */
190static uint_t metaslab_df_max_search = 16 * 1024 * 1024;
191
192/*
193 * Forces the metaslab_block_picker function to search for at least this many
194 * segments forwards until giving up on finding a segment that the allocation
195 * will fit into.
196 */
197static const uint32_t metaslab_min_search_count = 100;
198
199/*
200 * If we are not searching forward (due to metaslab_df_max_search,
201 * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
202 * controls what segment is used.  If it is set, we will use the largest free
203 * segment.  If it is not set, we will use a segment of exactly the requested
204 * size (or larger).
205 */
206static int metaslab_df_use_largest_segment = B_FALSE;
207
208/*
209 * These tunables control how long a metaslab will remain loaded after the
210 * last allocation from it.  A metaslab can't be unloaded until at least
211 * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
212 * have elapsed.  However, zfs_metaslab_mem_limit may cause it to be
213 * unloaded sooner.  These settings are intended to be generous -- to keep
214 * metaslabs loaded for a long time, reducing the rate of metaslab loading.
215 */
216static uint_t metaslab_unload_delay = 32;
217static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
218
219/*
220 * Max number of metaslabs per group to preload.
221 */
222uint_t metaslab_preload_limit = 10;
223
224/*
225 * Enable/disable preloading of metaslab.
226 */
227static int metaslab_preload_enabled = B_TRUE;
228
229/*
230 * Enable/disable fragmentation weighting on metaslabs.
231 */
232static int metaslab_fragmentation_factor_enabled = B_TRUE;
233
234/*
235 * Enable/disable lba weighting (i.e. outer tracks are given preference).
236 */
237static int metaslab_lba_weighting_enabled = B_TRUE;
238
239/*
240 * Enable/disable metaslab group biasing.
241 */
242static int metaslab_bias_enabled = B_TRUE;
243
244/*
245 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
246 */
247static const boolean_t zfs_remap_blkptr_enable = B_TRUE;
248
249/*
250 * Enable/disable segment-based metaslab selection.
251 */
252static int zfs_metaslab_segment_weight_enabled = B_TRUE;
253
254/*
255 * When using segment-based metaslab selection, we will continue
256 * allocating from the active metaslab until we have exhausted
257 * zfs_metaslab_switch_threshold of its buckets.
258 */
259static int zfs_metaslab_switch_threshold = 2;
260
261/*
262 * Internal switch to enable/disable the metaslab allocation tracing
263 * facility.
264 */
265static const boolean_t metaslab_trace_enabled = B_FALSE;
266
267/*
268 * Maximum entries that the metaslab allocation tracing facility will keep
269 * in a given list when running in non-debug mode. We limit the number
270 * of entries in non-debug mode to prevent us from using up too much memory.
271 * The limit should be sufficiently large that we don't expect any allocation
272 * to every exceed this value. In debug mode, the system will panic if this
273 * limit is ever reached allowing for further investigation.
274 */
275static const uint64_t metaslab_trace_max_entries = 5000;
276
277/*
278 * Maximum number of metaslabs per group that can be disabled
279 * simultaneously.
280 */
281static const int max_disabled_ms = 3;
282
283/*
284 * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
285 * To avoid 64-bit overflow, don't set above UINT32_MAX.
286 */
287static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */
288
289/*
290 * Maximum percentage of memory to use on storing loaded metaslabs. If loading
291 * a metaslab would take it over this percentage, the oldest selected metaslab
292 * is automatically unloaded.
293 */
294static uint_t zfs_metaslab_mem_limit = 25;
295
296/*
297 * Force the per-metaslab range trees to use 64-bit integers to store
298 * segments. Used for debugging purposes.
299 */
300static const boolean_t zfs_metaslab_force_large_segs = B_FALSE;
301
302/*
303 * By default we only store segments over a certain size in the size-sorted
304 * metaslab trees (ms_allocatable_by_size and
305 * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
306 * improves load and unload times at the cost of causing us to use slightly
307 * larger segments than we would otherwise in some cases.
308 */
309static const uint32_t metaslab_by_size_min_shift = 14;
310
311/*
312 * If not set, we will first try normal allocation.  If that fails then
313 * we will do a gang allocation.  If that fails then we will do a "try hard"
314 * gang allocation.  If that fails then we will have a multi-layer gang
315 * block.
316 *
317 * If set, we will first try normal allocation.  If that fails then
318 * we will do a "try hard" allocation.  If that fails we will do a gang
319 * allocation.  If that fails we will do a "try hard" gang allocation.  If
320 * that fails then we will have a multi-layer gang block.
321 */
322static int zfs_metaslab_try_hard_before_gang = B_FALSE;
323
324/*
325 * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
326 * metaslabs.  This improves performance, especially when there are many
327 * metaslabs per vdev and the allocation can't actually be satisfied (so we
328 * would otherwise iterate all the metaslabs).  If there is a metaslab with a
329 * worse weight but it can actually satisfy the allocation, we won't find it
330 * until trying hard.  This may happen if the worse metaslab is not loaded
331 * (and the true weight is better than we have calculated), or due to weight
332 * bucketization.  E.g. we are looking for a 60K segment, and the best
333 * metaslabs all have free segments in the 32-63K bucket, but the best
334 * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
335 * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
336 * bucket, and therefore a lower weight).
337 */
338static uint_t zfs_metaslab_find_max_tries = 100;
339
340static uint64_t metaslab_weight(metaslab_t *, boolean_t);
341static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
342static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
343static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
344
345static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
346static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
347static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
348static unsigned int metaslab_idx_func(multilist_t *, void *);
349static void metaslab_evict(metaslab_t *, uint64_t);
350static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
351kmem_cache_t *metaslab_alloc_trace_cache;
352
353typedef struct metaslab_stats {
354	kstat_named_t metaslabstat_trace_over_limit;
355	kstat_named_t metaslabstat_reload_tree;
356	kstat_named_t metaslabstat_too_many_tries;
357	kstat_named_t metaslabstat_try_hard;
358} metaslab_stats_t;
359
360static metaslab_stats_t metaslab_stats = {
361	{ "trace_over_limit",		KSTAT_DATA_UINT64 },
362	{ "reload_tree",		KSTAT_DATA_UINT64 },
363	{ "too_many_tries",		KSTAT_DATA_UINT64 },
364	{ "try_hard",			KSTAT_DATA_UINT64 },
365};
366
367#define	METASLABSTAT_BUMP(stat) \
368	atomic_inc_64(&metaslab_stats.stat.value.ui64);
369
370
371static kstat_t *metaslab_ksp;
372
373void
374metaslab_stat_init(void)
375{
376	ASSERT(metaslab_alloc_trace_cache == NULL);
377	metaslab_alloc_trace_cache = kmem_cache_create(
378	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
379	    0, NULL, NULL, NULL, NULL, NULL, 0);
380	metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
381	    "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
382	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
383	if (metaslab_ksp != NULL) {
384		metaslab_ksp->ks_data = &metaslab_stats;
385		kstat_install(metaslab_ksp);
386	}
387}
388
389void
390metaslab_stat_fini(void)
391{
392	if (metaslab_ksp != NULL) {
393		kstat_delete(metaslab_ksp);
394		metaslab_ksp = NULL;
395	}
396
397	kmem_cache_destroy(metaslab_alloc_trace_cache);
398	metaslab_alloc_trace_cache = NULL;
399}
400
401/*
402 * ==========================================================================
403 * Metaslab classes
404 * ==========================================================================
405 */
406metaslab_class_t *
407metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops)
408{
409	metaslab_class_t *mc;
410
411	mc = kmem_zalloc(offsetof(metaslab_class_t,
412	    mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
413
414	mc->mc_spa = spa;
415	mc->mc_ops = ops;
416	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
417	multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t),
418	    offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
419	for (int i = 0; i < spa->spa_alloc_count; i++) {
420		metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
421		mca->mca_rotor = NULL;
422		zfs_refcount_create_tracked(&mca->mca_alloc_slots);
423	}
424
425	return (mc);
426}
427
428void
429metaslab_class_destroy(metaslab_class_t *mc)
430{
431	spa_t *spa = mc->mc_spa;
432
433	ASSERT(mc->mc_alloc == 0);
434	ASSERT(mc->mc_deferred == 0);
435	ASSERT(mc->mc_space == 0);
436	ASSERT(mc->mc_dspace == 0);
437
438	for (int i = 0; i < spa->spa_alloc_count; i++) {
439		metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
440		ASSERT(mca->mca_rotor == NULL);
441		zfs_refcount_destroy(&mca->mca_alloc_slots);
442	}
443	mutex_destroy(&mc->mc_lock);
444	multilist_destroy(&mc->mc_metaslab_txg_list);
445	kmem_free(mc, offsetof(metaslab_class_t,
446	    mc_allocator[spa->spa_alloc_count]));
447}
448
449int
450metaslab_class_validate(metaslab_class_t *mc)
451{
452	metaslab_group_t *mg;
453	vdev_t *vd;
454
455	/*
456	 * Must hold one of the spa_config locks.
457	 */
458	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
459	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
460
461	if ((mg = mc->mc_allocator[0].mca_rotor) == NULL)
462		return (0);
463
464	do {
465		vd = mg->mg_vd;
466		ASSERT(vd->vdev_mg != NULL);
467		ASSERT3P(vd->vdev_top, ==, vd);
468		ASSERT3P(mg->mg_class, ==, mc);
469		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
470	} while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor);
471
472	return (0);
473}
474
475static void
476metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
477    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
478{
479	atomic_add_64(&mc->mc_alloc, alloc_delta);
480	atomic_add_64(&mc->mc_deferred, defer_delta);
481	atomic_add_64(&mc->mc_space, space_delta);
482	atomic_add_64(&mc->mc_dspace, dspace_delta);
483}
484
485uint64_t
486metaslab_class_get_alloc(metaslab_class_t *mc)
487{
488	return (mc->mc_alloc);
489}
490
491uint64_t
492metaslab_class_get_deferred(metaslab_class_t *mc)
493{
494	return (mc->mc_deferred);
495}
496
497uint64_t
498metaslab_class_get_space(metaslab_class_t *mc)
499{
500	return (mc->mc_space);
501}
502
503uint64_t
504metaslab_class_get_dspace(metaslab_class_t *mc)
505{
506	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
507}
508
509void
510metaslab_class_histogram_verify(metaslab_class_t *mc)
511{
512	spa_t *spa = mc->mc_spa;
513	vdev_t *rvd = spa->spa_root_vdev;
514	uint64_t *mc_hist;
515	int i;
516
517	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
518		return;
519
520	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
521	    KM_SLEEP);
522
523	mutex_enter(&mc->mc_lock);
524	for (int c = 0; c < rvd->vdev_children; c++) {
525		vdev_t *tvd = rvd->vdev_child[c];
526		metaslab_group_t *mg = vdev_get_mg(tvd, mc);
527
528		/*
529		 * Skip any holes, uninitialized top-levels, or
530		 * vdevs that are not in this metalab class.
531		 */
532		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
533		    mg->mg_class != mc) {
534			continue;
535		}
536
537		IMPLY(mg == mg->mg_vd->vdev_log_mg,
538		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
539
540		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
541			mc_hist[i] += mg->mg_histogram[i];
542	}
543
544	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
545		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
546	}
547
548	mutex_exit(&mc->mc_lock);
549	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
550}
551
552/*
553 * Calculate the metaslab class's fragmentation metric. The metric
554 * is weighted based on the space contribution of each metaslab group.
555 * The return value will be a number between 0 and 100 (inclusive), or
556 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
557 * zfs_frag_table for more information about the metric.
558 */
559uint64_t
560metaslab_class_fragmentation(metaslab_class_t *mc)
561{
562	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
563	uint64_t fragmentation = 0;
564
565	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
566
567	for (int c = 0; c < rvd->vdev_children; c++) {
568		vdev_t *tvd = rvd->vdev_child[c];
569		metaslab_group_t *mg = tvd->vdev_mg;
570
571		/*
572		 * Skip any holes, uninitialized top-levels,
573		 * or vdevs that are not in this metalab class.
574		 */
575		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
576		    mg->mg_class != mc) {
577			continue;
578		}
579
580		/*
581		 * If a metaslab group does not contain a fragmentation
582		 * metric then just bail out.
583		 */
584		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
585			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
586			return (ZFS_FRAG_INVALID);
587		}
588
589		/*
590		 * Determine how much this metaslab_group is contributing
591		 * to the overall pool fragmentation metric.
592		 */
593		fragmentation += mg->mg_fragmentation *
594		    metaslab_group_get_space(mg);
595	}
596	fragmentation /= metaslab_class_get_space(mc);
597
598	ASSERT3U(fragmentation, <=, 100);
599	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
600	return (fragmentation);
601}
602
603/*
604 * Calculate the amount of expandable space that is available in
605 * this metaslab class. If a device is expanded then its expandable
606 * space will be the amount of allocatable space that is currently not
607 * part of this metaslab class.
608 */
609uint64_t
610metaslab_class_expandable_space(metaslab_class_t *mc)
611{
612	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
613	uint64_t space = 0;
614
615	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
616	for (int c = 0; c < rvd->vdev_children; c++) {
617		vdev_t *tvd = rvd->vdev_child[c];
618		metaslab_group_t *mg = tvd->vdev_mg;
619
620		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
621		    mg->mg_class != mc) {
622			continue;
623		}
624
625		/*
626		 * Calculate if we have enough space to add additional
627		 * metaslabs. We report the expandable space in terms
628		 * of the metaslab size since that's the unit of expansion.
629		 */
630		space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize,
631		    1ULL << tvd->vdev_ms_shift, uint64_t);
632	}
633	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
634	return (space);
635}
636
637void
638metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
639{
640	multilist_t *ml = &mc->mc_metaslab_txg_list;
641	hrtime_t now = gethrtime();
642	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
643		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
644		metaslab_t *msp = multilist_sublist_head(mls);
645		multilist_sublist_unlock(mls);
646		while (msp != NULL) {
647			mutex_enter(&msp->ms_lock);
648
649			/*
650			 * If the metaslab has been removed from the list
651			 * (which could happen if we were at the memory limit
652			 * and it was evicted during this loop), then we can't
653			 * proceed and we should restart the sublist.
654			 */
655			if (!multilist_link_active(&msp->ms_class_txg_node)) {
656				mutex_exit(&msp->ms_lock);
657				i--;
658				break;
659			}
660			mls = multilist_sublist_lock_idx(ml, i);
661			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
662			multilist_sublist_unlock(mls);
663			if (txg >
664			    msp->ms_selected_txg + metaslab_unload_delay &&
665			    now > msp->ms_selected_time +
666			    MSEC2NSEC(metaslab_unload_delay_ms) &&
667			    (msp->ms_allocator == -1 ||
668			    !metaslab_preload_enabled)) {
669				metaslab_evict(msp, txg);
670			} else {
671				/*
672				 * Once we've hit a metaslab selected too
673				 * recently to evict, we're done evicting for
674				 * now.
675				 */
676				mutex_exit(&msp->ms_lock);
677				break;
678			}
679			mutex_exit(&msp->ms_lock);
680			msp = next_msp;
681		}
682	}
683}
684
685static int
686metaslab_compare(const void *x1, const void *x2)
687{
688	const metaslab_t *m1 = (const metaslab_t *)x1;
689	const metaslab_t *m2 = (const metaslab_t *)x2;
690
691	int sort1 = 0;
692	int sort2 = 0;
693	if (m1->ms_allocator != -1 && m1->ms_primary)
694		sort1 = 1;
695	else if (m1->ms_allocator != -1 && !m1->ms_primary)
696		sort1 = 2;
697	if (m2->ms_allocator != -1 && m2->ms_primary)
698		sort2 = 1;
699	else if (m2->ms_allocator != -1 && !m2->ms_primary)
700		sort2 = 2;
701
702	/*
703	 * Sort inactive metaslabs first, then primaries, then secondaries. When
704	 * selecting a metaslab to allocate from, an allocator first tries its
705	 * primary, then secondary active metaslab. If it doesn't have active
706	 * metaslabs, or can't allocate from them, it searches for an inactive
707	 * metaslab to activate. If it can't find a suitable one, it will steal
708	 * a primary or secondary metaslab from another allocator.
709	 */
710	if (sort1 < sort2)
711		return (-1);
712	if (sort1 > sort2)
713		return (1);
714
715	int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
716	if (likely(cmp))
717		return (cmp);
718
719	IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
720
721	return (TREE_CMP(m1->ms_start, m2->ms_start));
722}
723
724/*
725 * ==========================================================================
726 * Metaslab groups
727 * ==========================================================================
728 */
729/*
730 * Update the allocatable flag and the metaslab group's capacity.
731 * The allocatable flag is set to true if the capacity is below
732 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
733 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
734 * transitions from allocatable to non-allocatable or vice versa then the
735 * metaslab group's class is updated to reflect the transition.
736 */
737static void
738metaslab_group_alloc_update(metaslab_group_t *mg)
739{
740	vdev_t *vd = mg->mg_vd;
741	metaslab_class_t *mc = mg->mg_class;
742	vdev_stat_t *vs = &vd->vdev_stat;
743	boolean_t was_allocatable;
744	boolean_t was_initialized;
745
746	ASSERT(vd == vd->vdev_top);
747	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
748	    SCL_ALLOC);
749
750	mutex_enter(&mg->mg_lock);
751	was_allocatable = mg->mg_allocatable;
752	was_initialized = mg->mg_initialized;
753
754	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
755	    (vs->vs_space + 1);
756
757	mutex_enter(&mc->mc_lock);
758
759	/*
760	 * If the metaslab group was just added then it won't
761	 * have any space until we finish syncing out this txg.
762	 * At that point we will consider it initialized and available
763	 * for allocations.  We also don't consider non-activated
764	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
765	 * to be initialized, because they can't be used for allocation.
766	 */
767	mg->mg_initialized = metaslab_group_initialized(mg);
768	if (!was_initialized && mg->mg_initialized) {
769		mc->mc_groups++;
770	} else if (was_initialized && !mg->mg_initialized) {
771		ASSERT3U(mc->mc_groups, >, 0);
772		mc->mc_groups--;
773	}
774	if (mg->mg_initialized)
775		mg->mg_no_free_space = B_FALSE;
776
777	/*
778	 * A metaslab group is considered allocatable if it has plenty
779	 * of free space or is not heavily fragmented. We only take
780	 * fragmentation into account if the metaslab group has a valid
781	 * fragmentation metric (i.e. a value between 0 and 100).
782	 */
783	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
784	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
785	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
786	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
787
788	/*
789	 * The mc_alloc_groups maintains a count of the number of
790	 * groups in this metaslab class that are still above the
791	 * zfs_mg_noalloc_threshold. This is used by the allocating
792	 * threads to determine if they should avoid allocations to
793	 * a given group. The allocator will avoid allocations to a group
794	 * if that group has reached or is below the zfs_mg_noalloc_threshold
795	 * and there are still other groups that are above the threshold.
796	 * When a group transitions from allocatable to non-allocatable or
797	 * vice versa we update the metaslab class to reflect that change.
798	 * When the mc_alloc_groups value drops to 0 that means that all
799	 * groups have reached the zfs_mg_noalloc_threshold making all groups
800	 * eligible for allocations. This effectively means that all devices
801	 * are balanced again.
802	 */
803	if (was_allocatable && !mg->mg_allocatable)
804		mc->mc_alloc_groups--;
805	else if (!was_allocatable && mg->mg_allocatable)
806		mc->mc_alloc_groups++;
807	mutex_exit(&mc->mc_lock);
808
809	mutex_exit(&mg->mg_lock);
810}
811
812int
813metaslab_sort_by_flushed(const void *va, const void *vb)
814{
815	const metaslab_t *a = va;
816	const metaslab_t *b = vb;
817
818	int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
819	if (likely(cmp))
820		return (cmp);
821
822	uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
823	uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
824	cmp = TREE_CMP(a_vdev_id, b_vdev_id);
825	if (cmp)
826		return (cmp);
827
828	return (TREE_CMP(a->ms_id, b->ms_id));
829}
830
831metaslab_group_t *
832metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
833{
834	metaslab_group_t *mg;
835
836	mg = kmem_zalloc(offsetof(metaslab_group_t,
837	    mg_allocator[allocators]), KM_SLEEP);
838	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
839	mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
840	cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
841	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
842	    sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
843	mg->mg_vd = vd;
844	mg->mg_class = mc;
845	mg->mg_activation_count = 0;
846	mg->mg_initialized = B_FALSE;
847	mg->mg_no_free_space = B_TRUE;
848	mg->mg_allocators = allocators;
849
850	for (int i = 0; i < allocators; i++) {
851		metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
852		zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
853	}
854
855	return (mg);
856}
857
858void
859metaslab_group_destroy(metaslab_group_t *mg)
860{
861	ASSERT(mg->mg_prev == NULL);
862	ASSERT(mg->mg_next == NULL);
863	/*
864	 * We may have gone below zero with the activation count
865	 * either because we never activated in the first place or
866	 * because we're done, and possibly removing the vdev.
867	 */
868	ASSERT(mg->mg_activation_count <= 0);
869
870	avl_destroy(&mg->mg_metaslab_tree);
871	mutex_destroy(&mg->mg_lock);
872	mutex_destroy(&mg->mg_ms_disabled_lock);
873	cv_destroy(&mg->mg_ms_disabled_cv);
874
875	for (int i = 0; i < mg->mg_allocators; i++) {
876		metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
877		zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
878	}
879	kmem_free(mg, offsetof(metaslab_group_t,
880	    mg_allocator[mg->mg_allocators]));
881}
882
883void
884metaslab_group_activate(metaslab_group_t *mg)
885{
886	metaslab_class_t *mc = mg->mg_class;
887	spa_t *spa = mc->mc_spa;
888	metaslab_group_t *mgprev, *mgnext;
889
890	ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
891
892	ASSERT(mg->mg_prev == NULL);
893	ASSERT(mg->mg_next == NULL);
894	ASSERT(mg->mg_activation_count <= 0);
895
896	if (++mg->mg_activation_count <= 0)
897		return;
898
899	mg->mg_aliquot = metaslab_aliquot * MAX(1,
900	    vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd));
901	metaslab_group_alloc_update(mg);
902
903	if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
904		mg->mg_prev = mg;
905		mg->mg_next = mg;
906	} else {
907		mgnext = mgprev->mg_next;
908		mg->mg_prev = mgprev;
909		mg->mg_next = mgnext;
910		mgprev->mg_next = mg;
911		mgnext->mg_prev = mg;
912	}
913	for (int i = 0; i < spa->spa_alloc_count; i++) {
914		mc->mc_allocator[i].mca_rotor = mg;
915		mg = mg->mg_next;
916	}
917}
918
919/*
920 * Passivate a metaslab group and remove it from the allocation rotor.
921 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
922 * a metaslab group. This function will momentarily drop spa_config_locks
923 * that are lower than the SCL_ALLOC lock (see comment below).
924 */
925void
926metaslab_group_passivate(metaslab_group_t *mg)
927{
928	metaslab_class_t *mc = mg->mg_class;
929	spa_t *spa = mc->mc_spa;
930	metaslab_group_t *mgprev, *mgnext;
931	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
932
933	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
934	    (SCL_ALLOC | SCL_ZIO));
935
936	if (--mg->mg_activation_count != 0) {
937		for (int i = 0; i < spa->spa_alloc_count; i++)
938			ASSERT(mc->mc_allocator[i].mca_rotor != mg);
939		ASSERT(mg->mg_prev == NULL);
940		ASSERT(mg->mg_next == NULL);
941		ASSERT(mg->mg_activation_count < 0);
942		return;
943	}
944
945	/*
946	 * The spa_config_lock is an array of rwlocks, ordered as
947	 * follows (from highest to lowest):
948	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
949	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
950	 * (For more information about the spa_config_lock see spa_misc.c)
951	 * The higher the lock, the broader its coverage. When we passivate
952	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
953	 * config locks. However, the metaslab group's taskq might be trying
954	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
955	 * lower locks to allow the I/O to complete. At a minimum,
956	 * we continue to hold the SCL_ALLOC lock, which prevents any future
957	 * allocations from taking place and any changes to the vdev tree.
958	 */
959	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
960	taskq_wait_outstanding(spa->spa_metaslab_taskq, 0);
961	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
962	metaslab_group_alloc_update(mg);
963	for (int i = 0; i < mg->mg_allocators; i++) {
964		metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
965		metaslab_t *msp = mga->mga_primary;
966		if (msp != NULL) {
967			mutex_enter(&msp->ms_lock);
968			metaslab_passivate(msp,
969			    metaslab_weight_from_range_tree(msp));
970			mutex_exit(&msp->ms_lock);
971		}
972		msp = mga->mga_secondary;
973		if (msp != NULL) {
974			mutex_enter(&msp->ms_lock);
975			metaslab_passivate(msp,
976			    metaslab_weight_from_range_tree(msp));
977			mutex_exit(&msp->ms_lock);
978		}
979	}
980
981	mgprev = mg->mg_prev;
982	mgnext = mg->mg_next;
983
984	if (mg == mgnext) {
985		mgnext = NULL;
986	} else {
987		mgprev->mg_next = mgnext;
988		mgnext->mg_prev = mgprev;
989	}
990	for (int i = 0; i < spa->spa_alloc_count; i++) {
991		if (mc->mc_allocator[i].mca_rotor == mg)
992			mc->mc_allocator[i].mca_rotor = mgnext;
993	}
994
995	mg->mg_prev = NULL;
996	mg->mg_next = NULL;
997}
998
999boolean_t
1000metaslab_group_initialized(metaslab_group_t *mg)
1001{
1002	vdev_t *vd = mg->mg_vd;
1003	vdev_stat_t *vs = &vd->vdev_stat;
1004
1005	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
1006}
1007
1008uint64_t
1009metaslab_group_get_space(metaslab_group_t *mg)
1010{
1011	/*
1012	 * Note that the number of nodes in mg_metaslab_tree may be one less
1013	 * than vdev_ms_count, due to the embedded log metaslab.
1014	 */
1015	mutex_enter(&mg->mg_lock);
1016	uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree);
1017	mutex_exit(&mg->mg_lock);
1018	return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count);
1019}
1020
1021void
1022metaslab_group_histogram_verify(metaslab_group_t *mg)
1023{
1024	uint64_t *mg_hist;
1025	avl_tree_t *t = &mg->mg_metaslab_tree;
1026	uint64_t ashift = mg->mg_vd->vdev_ashift;
1027
1028	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
1029		return;
1030
1031	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
1032	    KM_SLEEP);
1033
1034	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
1035	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
1036
1037	mutex_enter(&mg->mg_lock);
1038	for (metaslab_t *msp = avl_first(t);
1039	    msp != NULL; msp = AVL_NEXT(t, msp)) {
1040		VERIFY3P(msp->ms_group, ==, mg);
1041		/* skip if not active */
1042		if (msp->ms_sm == NULL)
1043			continue;
1044
1045		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1046			mg_hist[i + ashift] +=
1047			    msp->ms_sm->sm_phys->smp_histogram[i];
1048		}
1049	}
1050
1051	for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
1052		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
1053
1054	mutex_exit(&mg->mg_lock);
1055
1056	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
1057}
1058
1059static void
1060metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
1061{
1062	metaslab_class_t *mc = mg->mg_class;
1063	uint64_t ashift = mg->mg_vd->vdev_ashift;
1064
1065	ASSERT(MUTEX_HELD(&msp->ms_lock));
1066	if (msp->ms_sm == NULL)
1067		return;
1068
1069	mutex_enter(&mg->mg_lock);
1070	mutex_enter(&mc->mc_lock);
1071	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1072		IMPLY(mg == mg->mg_vd->vdev_log_mg,
1073		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
1074		mg->mg_histogram[i + ashift] +=
1075		    msp->ms_sm->sm_phys->smp_histogram[i];
1076		mc->mc_histogram[i + ashift] +=
1077		    msp->ms_sm->sm_phys->smp_histogram[i];
1078	}
1079	mutex_exit(&mc->mc_lock);
1080	mutex_exit(&mg->mg_lock);
1081}
1082
1083void
1084metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
1085{
1086	metaslab_class_t *mc = mg->mg_class;
1087	uint64_t ashift = mg->mg_vd->vdev_ashift;
1088
1089	ASSERT(MUTEX_HELD(&msp->ms_lock));
1090	if (msp->ms_sm == NULL)
1091		return;
1092
1093	mutex_enter(&mg->mg_lock);
1094	mutex_enter(&mc->mc_lock);
1095	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1096		ASSERT3U(mg->mg_histogram[i + ashift], >=,
1097		    msp->ms_sm->sm_phys->smp_histogram[i]);
1098		ASSERT3U(mc->mc_histogram[i + ashift], >=,
1099		    msp->ms_sm->sm_phys->smp_histogram[i]);
1100		IMPLY(mg == mg->mg_vd->vdev_log_mg,
1101		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
1102
1103		mg->mg_histogram[i + ashift] -=
1104		    msp->ms_sm->sm_phys->smp_histogram[i];
1105		mc->mc_histogram[i + ashift] -=
1106		    msp->ms_sm->sm_phys->smp_histogram[i];
1107	}
1108	mutex_exit(&mc->mc_lock);
1109	mutex_exit(&mg->mg_lock);
1110}
1111
1112static void
1113metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
1114{
1115	ASSERT(msp->ms_group == NULL);
1116	mutex_enter(&mg->mg_lock);
1117	msp->ms_group = mg;
1118	msp->ms_weight = 0;
1119	avl_add(&mg->mg_metaslab_tree, msp);
1120	mutex_exit(&mg->mg_lock);
1121
1122	mutex_enter(&msp->ms_lock);
1123	metaslab_group_histogram_add(mg, msp);
1124	mutex_exit(&msp->ms_lock);
1125}
1126
1127static void
1128metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
1129{
1130	mutex_enter(&msp->ms_lock);
1131	metaslab_group_histogram_remove(mg, msp);
1132	mutex_exit(&msp->ms_lock);
1133
1134	mutex_enter(&mg->mg_lock);
1135	ASSERT(msp->ms_group == mg);
1136	avl_remove(&mg->mg_metaslab_tree, msp);
1137
1138	metaslab_class_t *mc = msp->ms_group->mg_class;
1139	multilist_sublist_t *mls =
1140	    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
1141	if (multilist_link_active(&msp->ms_class_txg_node))
1142		multilist_sublist_remove(mls, msp);
1143	multilist_sublist_unlock(mls);
1144
1145	msp->ms_group = NULL;
1146	mutex_exit(&mg->mg_lock);
1147}
1148
1149static void
1150metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1151{
1152	ASSERT(MUTEX_HELD(&msp->ms_lock));
1153	ASSERT(MUTEX_HELD(&mg->mg_lock));
1154	ASSERT(msp->ms_group == mg);
1155
1156	avl_remove(&mg->mg_metaslab_tree, msp);
1157	msp->ms_weight = weight;
1158	avl_add(&mg->mg_metaslab_tree, msp);
1159
1160}
1161
1162static void
1163metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1164{
1165	/*
1166	 * Although in principle the weight can be any value, in
1167	 * practice we do not use values in the range [1, 511].
1168	 */
1169	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
1170	ASSERT(MUTEX_HELD(&msp->ms_lock));
1171
1172	mutex_enter(&mg->mg_lock);
1173	metaslab_group_sort_impl(mg, msp, weight);
1174	mutex_exit(&mg->mg_lock);
1175}
1176
1177/*
1178 * Calculate the fragmentation for a given metaslab group. We can use
1179 * a simple average here since all metaslabs within the group must have
1180 * the same size. The return value will be a value between 0 and 100
1181 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
1182 * group have a fragmentation metric.
1183 */
1184uint64_t
1185metaslab_group_fragmentation(metaslab_group_t *mg)
1186{
1187	vdev_t *vd = mg->mg_vd;
1188	uint64_t fragmentation = 0;
1189	uint64_t valid_ms = 0;
1190
1191	for (int m = 0; m < vd->vdev_ms_count; m++) {
1192		metaslab_t *msp = vd->vdev_ms[m];
1193
1194		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
1195			continue;
1196		if (msp->ms_group != mg)
1197			continue;
1198
1199		valid_ms++;
1200		fragmentation += msp->ms_fragmentation;
1201	}
1202
1203	if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
1204		return (ZFS_FRAG_INVALID);
1205
1206	fragmentation /= valid_ms;
1207	ASSERT3U(fragmentation, <=, 100);
1208	return (fragmentation);
1209}
1210
1211/*
1212 * Determine if a given metaslab group should skip allocations. A metaslab
1213 * group should avoid allocations if its free capacity is less than the
1214 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
1215 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
1216 * that can still handle allocations. If the allocation throttle is enabled
1217 * then we skip allocations to devices that have reached their maximum
1218 * allocation queue depth unless the selected metaslab group is the only
1219 * eligible group remaining.
1220 */
1221static boolean_t
1222metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1223    int flags, uint64_t psize, int allocator, int d)
1224{
1225	spa_t *spa = mg->mg_vd->vdev_spa;
1226	metaslab_class_t *mc = mg->mg_class;
1227
1228	/*
1229	 * We can only consider skipping this metaslab group if it's
1230	 * in the normal metaslab class and there are other metaslab
1231	 * groups to select from. Otherwise, we always consider it eligible
1232	 * for allocations.
1233	 */
1234	if ((mc != spa_normal_class(spa) &&
1235	    mc != spa_special_class(spa) &&
1236	    mc != spa_dedup_class(spa)) ||
1237	    mc->mc_groups <= 1)
1238		return (B_TRUE);
1239
1240	/*
1241	 * If the metaslab group's mg_allocatable flag is set (see comments
1242	 * in metaslab_group_alloc_update() for more information) and
1243	 * the allocation throttle is disabled then allow allocations to this
1244	 * device. However, if the allocation throttle is enabled then
1245	 * check if we have reached our allocation limit (mga_alloc_queue_depth)
1246	 * to determine if we should allow allocations to this metaslab group.
1247	 * If all metaslab groups are no longer considered allocatable
1248	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1249	 * gang block size then we allow allocations on this metaslab group
1250	 * regardless of the mg_allocatable or throttle settings.
1251	 */
1252	if (mg->mg_allocatable) {
1253		metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
1254		int64_t qdepth;
1255		uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;
1256
1257		if (!mc->mc_alloc_throttle_enabled)
1258			return (B_TRUE);
1259
1260		/*
1261		 * If this metaslab group does not have any free space, then
1262		 * there is no point in looking further.
1263		 */
1264		if (mg->mg_no_free_space)
1265			return (B_FALSE);
1266
1267		/*
1268		 * Some allocations (e.g., those coming from device removal
1269		 * where the * allocations are not even counted in the
1270		 * metaslab * allocation queues) are allowed to bypass
1271		 * the throttle.
1272		 */
1273		if (flags & METASLAB_DONT_THROTTLE)
1274			return (B_TRUE);
1275
1276		/*
1277		 * Relax allocation throttling for ditto blocks.  Due to
1278		 * random imbalances in allocation it tends to push copies
1279		 * to one vdev, that looks a bit better at the moment.
1280		 */
1281		qmax = qmax * (4 + d) / 4;
1282
1283		qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);
1284
1285		/*
1286		 * If this metaslab group is below its qmax or it's
1287		 * the only allocatable metaslab group, then attempt
1288		 * to allocate from it.
1289		 */
1290		if (qdepth < qmax || mc->mc_alloc_groups == 1)
1291			return (B_TRUE);
1292		ASSERT3U(mc->mc_alloc_groups, >, 1);
1293
1294		/*
1295		 * Since this metaslab group is at or over its qmax, we
1296		 * need to determine if there are metaslab groups after this
1297		 * one that might be able to handle this allocation. This is
1298		 * racy since we can't hold the locks for all metaslab
1299		 * groups at the same time when we make this check.
1300		 */
1301		for (metaslab_group_t *mgp = mg->mg_next;
1302		    mgp != rotor; mgp = mgp->mg_next) {
1303			metaslab_group_allocator_t *mgap =
1304			    &mgp->mg_allocator[allocator];
1305			qmax = mgap->mga_cur_max_alloc_queue_depth;
1306			qmax = qmax * (4 + d) / 4;
1307			qdepth =
1308			    zfs_refcount_count(&mgap->mga_alloc_queue_depth);
1309
1310			/*
1311			 * If there is another metaslab group that
1312			 * might be able to handle the allocation, then
1313			 * we return false so that we skip this group.
1314			 */
1315			if (qdepth < qmax && !mgp->mg_no_free_space)
1316				return (B_FALSE);
1317		}
1318
1319		/*
1320		 * We didn't find another group to handle the allocation
1321		 * so we can't skip this metaslab group even though
1322		 * we are at or over our qmax.
1323		 */
1324		return (B_TRUE);
1325
1326	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
1327		return (B_TRUE);
1328	}
1329	return (B_FALSE);
1330}
1331
1332/*
1333 * ==========================================================================
1334 * Range tree callbacks
1335 * ==========================================================================
1336 */
1337
1338/*
1339 * Comparison function for the private size-ordered tree using 32-bit
1340 * ranges. Tree is sorted by size, larger sizes at the end of the tree.
1341 */
1342__attribute__((always_inline)) inline
1343static int
1344metaslab_rangesize32_compare(const void *x1, const void *x2)
1345{
1346	const range_seg32_t *r1 = x1;
1347	const range_seg32_t *r2 = x2;
1348
1349	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1350	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1351
1352	int cmp = TREE_CMP(rs_size1, rs_size2);
1353
1354	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
1355}
1356
1357/*
1358 * Comparison function for the private size-ordered tree using 64-bit
1359 * ranges. Tree is sorted by size, larger sizes at the end of the tree.
1360 */
1361__attribute__((always_inline)) inline
1362static int
1363metaslab_rangesize64_compare(const void *x1, const void *x2)
1364{
1365	const range_seg64_t *r1 = x1;
1366	const range_seg64_t *r2 = x2;
1367
1368	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1369	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1370
1371	int cmp = TREE_CMP(rs_size1, rs_size2);
1372
1373	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
1374}
1375
1376typedef struct metaslab_rt_arg {
1377	zfs_btree_t *mra_bt;
1378	uint32_t mra_floor_shift;
1379} metaslab_rt_arg_t;
1380
1381struct mssa_arg {
1382	range_tree_t *rt;
1383	metaslab_rt_arg_t *mra;
1384};
1385
1386static void
1387metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
1388{
1389	struct mssa_arg *mssap = arg;
1390	range_tree_t *rt = mssap->rt;
1391	metaslab_rt_arg_t *mrap = mssap->mra;
1392	range_seg_max_t seg = {0};
1393	rs_set_start(&seg, rt, start);
1394	rs_set_end(&seg, rt, start + size);
1395	metaslab_rt_add(rt, &seg, mrap);
1396}
1397
1398static void
1399metaslab_size_tree_full_load(range_tree_t *rt)
1400{
1401	metaslab_rt_arg_t *mrap = rt->rt_arg;
1402	METASLABSTAT_BUMP(metaslabstat_reload_tree);
1403	ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
1404	mrap->mra_floor_shift = 0;
1405	struct mssa_arg arg = {0};
1406	arg.rt = rt;
1407	arg.mra = mrap;
1408	range_tree_walk(rt, metaslab_size_sorted_add, &arg);
1409}
1410
1411
1412ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
1413    range_seg32_t, metaslab_rangesize32_compare)
1414
1415ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
1416    range_seg64_t, metaslab_rangesize64_compare)
1417
1418/*
1419 * Create any block allocator specific components. The current allocators
1420 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
1421 */
1422static void
1423metaslab_rt_create(range_tree_t *rt, void *arg)
1424{
1425	metaslab_rt_arg_t *mrap = arg;
1426	zfs_btree_t *size_tree = mrap->mra_bt;
1427
1428	size_t size;
1429	int (*compare) (const void *, const void *);
1430	bt_find_in_buf_f bt_find;
1431	switch (rt->rt_type) {
1432	case RANGE_SEG32:
1433		size = sizeof (range_seg32_t);
1434		compare = metaslab_rangesize32_compare;
1435		bt_find = metaslab_rt_find_rangesize32_in_buf;
1436		break;
1437	case RANGE_SEG64:
1438		size = sizeof (range_seg64_t);
1439		compare = metaslab_rangesize64_compare;
1440		bt_find = metaslab_rt_find_rangesize64_in_buf;
1441		break;
1442	default:
1443		panic("Invalid range seg type %d", rt->rt_type);
1444	}
1445	zfs_btree_create(size_tree, compare, bt_find, size);
1446	mrap->mra_floor_shift = metaslab_by_size_min_shift;
1447}
1448
1449static void
1450metaslab_rt_destroy(range_tree_t *rt, void *arg)
1451{
1452	(void) rt;
1453	metaslab_rt_arg_t *mrap = arg;
1454	zfs_btree_t *size_tree = mrap->mra_bt;
1455
1456	zfs_btree_destroy(size_tree);
1457	kmem_free(mrap, sizeof (*mrap));
1458}
1459
1460static void
1461metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1462{
1463	metaslab_rt_arg_t *mrap = arg;
1464	zfs_btree_t *size_tree = mrap->mra_bt;
1465
1466	if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
1467	    (1ULL << mrap->mra_floor_shift))
1468		return;
1469
1470	zfs_btree_add(size_tree, rs);
1471}
1472
1473static void
1474metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1475{
1476	metaslab_rt_arg_t *mrap = arg;
1477	zfs_btree_t *size_tree = mrap->mra_bt;
1478
1479	if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL <<
1480	    mrap->mra_floor_shift))
1481		return;
1482
1483	zfs_btree_remove(size_tree, rs);
1484}
1485
1486static void
1487metaslab_rt_vacate(range_tree_t *rt, void *arg)
1488{
1489	metaslab_rt_arg_t *mrap = arg;
1490	zfs_btree_t *size_tree = mrap->mra_bt;
1491	zfs_btree_clear(size_tree);
1492	zfs_btree_destroy(size_tree);
1493
1494	metaslab_rt_create(rt, arg);
1495}
1496
1497static const range_tree_ops_t metaslab_rt_ops = {
1498	.rtop_create = metaslab_rt_create,
1499	.rtop_destroy = metaslab_rt_destroy,
1500	.rtop_add = metaslab_rt_add,
1501	.rtop_remove = metaslab_rt_remove,
1502	.rtop_vacate = metaslab_rt_vacate
1503};
1504
1505/*
1506 * ==========================================================================
1507 * Common allocator routines
1508 * ==========================================================================
1509 */
1510
1511/*
1512 * Return the maximum contiguous segment within the metaslab.
1513 */
1514uint64_t
1515metaslab_largest_allocatable(metaslab_t *msp)
1516{
1517	zfs_btree_t *t = &msp->ms_allocatable_by_size;
1518	range_seg_t *rs;
1519
1520	if (t == NULL)
1521		return (0);
1522	if (zfs_btree_numnodes(t) == 0)
1523		metaslab_size_tree_full_load(msp->ms_allocatable);
1524
1525	rs = zfs_btree_last(t, NULL);
1526	if (rs == NULL)
1527		return (0);
1528
1529	return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
1530	    msp->ms_allocatable));
1531}
1532
1533/*
1534 * Return the maximum contiguous segment within the unflushed frees of this
1535 * metaslab.
1536 */
1537static uint64_t
1538metaslab_largest_unflushed_free(metaslab_t *msp)
1539{
1540	ASSERT(MUTEX_HELD(&msp->ms_lock));
1541
1542	if (msp->ms_unflushed_frees == NULL)
1543		return (0);
1544
1545	if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
1546		metaslab_size_tree_full_load(msp->ms_unflushed_frees);
1547	range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
1548	    NULL);
1549	if (rs == NULL)
1550		return (0);
1551
1552	/*
1553	 * When a range is freed from the metaslab, that range is added to
1554	 * both the unflushed frees and the deferred frees. While the block
1555	 * will eventually be usable, if the metaslab were loaded the range
1556	 * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
1557	 * txgs had passed.  As a result, when attempting to estimate an upper
1558	 * bound for the largest currently-usable free segment in the
1559	 * metaslab, we need to not consider any ranges currently in the defer
1560	 * trees. This algorithm approximates the largest available chunk in
1561	 * the largest range in the unflushed_frees tree by taking the first
1562	 * chunk.  While this may be a poor estimate, it should only remain so
1563	 * briefly and should eventually self-correct as frees are no longer
1564	 * deferred. Similar logic applies to the ms_freed tree. See
1565	 * metaslab_load() for more details.
1566	 *
1567	 * There are two primary sources of inaccuracy in this estimate. Both
1568	 * are tolerated for performance reasons. The first source is that we
1569	 * only check the largest segment for overlaps. Smaller segments may
1570	 * have more favorable overlaps with the other trees, resulting in
1571	 * larger usable chunks.  Second, we only look at the first chunk in
1572	 * the largest segment; there may be other usable chunks in the
1573	 * largest segment, but we ignore them.
1574	 */
1575	uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
1576	uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
1577	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1578		uint64_t start = 0;
1579		uint64_t size = 0;
1580		boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
1581		    rsize, &start, &size);
1582		if (found) {
1583			if (rstart == start)
1584				return (0);
1585			rsize = start - rstart;
1586		}
1587	}
1588
1589	uint64_t start = 0;
1590	uint64_t size = 0;
1591	boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
1592	    rsize, &start, &size);
1593	if (found)
1594		rsize = start - rstart;
1595
1596	return (rsize);
1597}
1598
1599static range_seg_t *
1600metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
1601    uint64_t size, zfs_btree_index_t *where)
1602{
1603	range_seg_t *rs;
1604	range_seg_max_t rsearch;
1605
1606	rs_set_start(&rsearch, rt, start);
1607	rs_set_end(&rsearch, rt, start + size);
1608
1609	rs = zfs_btree_find(t, &rsearch, where);
1610	if (rs == NULL) {
1611		rs = zfs_btree_next(t, where, where);
1612	}
1613
1614	return (rs);
1615}
1616
1617/*
1618 * This is a helper function that can be used by the allocator to find a
1619 * suitable block to allocate. This will search the specified B-tree looking
1620 * for a block that matches the specified criteria.
1621 */
1622static uint64_t
1623metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
1624    uint64_t max_search)
1625{
1626	if (*cursor == 0)
1627		*cursor = rt->rt_start;
1628	zfs_btree_t *bt = &rt->rt_root;
1629	zfs_btree_index_t where;
1630	range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
1631	uint64_t first_found;
1632	int count_searched = 0;
1633
1634	if (rs != NULL)
1635		first_found = rs_get_start(rs, rt);
1636
1637	while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
1638	    max_search || count_searched < metaslab_min_search_count)) {
1639		uint64_t offset = rs_get_start(rs, rt);
1640		if (offset + size <= rs_get_end(rs, rt)) {
1641			*cursor = offset + size;
1642			return (offset);
1643		}
1644		rs = zfs_btree_next(bt, &where, &where);
1645		count_searched++;
1646	}
1647
1648	*cursor = 0;
1649	return (-1ULL);
1650}
1651
1652static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size);
1653static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size);
1654static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size);
1655metaslab_ops_t *metaslab_allocator(spa_t *spa);
1656
1657static metaslab_ops_t metaslab_allocators[] = {
1658	{ "dynamic", metaslab_df_alloc },
1659	{ "cursor", metaslab_cf_alloc },
1660	{ "new-dynamic", metaslab_ndf_alloc },
1661};
1662
1663static int
1664spa_find_allocator_byname(const char *val)
1665{
1666	int a = ARRAY_SIZE(metaslab_allocators) - 1;
1667	if (strcmp("new-dynamic", val) == 0)
1668		return (-1); /* remove when ndf is working */
1669	for (; a >= 0; a--) {
1670		if (strcmp(val, metaslab_allocators[a].msop_name) == 0)
1671			return (a);
1672	}
1673	return (-1);
1674}
1675
1676void
1677spa_set_allocator(spa_t *spa, const char *allocator)
1678{
1679	int a = spa_find_allocator_byname(allocator);
1680	if (a < 0) a = 0;
1681	spa->spa_active_allocator = a;
1682	zfs_dbgmsg("spa allocator: %s\n", metaslab_allocators[a].msop_name);
1683}
1684
1685int
1686spa_get_allocator(spa_t *spa)
1687{
1688	return (spa->spa_active_allocator);
1689}
1690
1691#if defined(_KERNEL)
1692int
1693param_set_active_allocator_common(const char *val)
1694{
1695	char *p;
1696
1697	if (val == NULL)
1698		return (SET_ERROR(EINVAL));
1699
1700	if ((p = strchr(val, '\n')) != NULL)
1701		*p = '\0';
1702
1703	int a = spa_find_allocator_byname(val);
1704	if (a < 0)
1705		return (SET_ERROR(EINVAL));
1706
1707	zfs_active_allocator = metaslab_allocators[a].msop_name;
1708	return (0);
1709}
1710#endif
1711
1712metaslab_ops_t *
1713metaslab_allocator(spa_t *spa)
1714{
1715	int allocator = spa_get_allocator(spa);
1716	return (&metaslab_allocators[allocator]);
1717}
1718
1719/*
1720 * ==========================================================================
1721 * Dynamic Fit (df) block allocator
1722 *
1723 * Search for a free chunk of at least this size, starting from the last
1724 * offset (for this alignment of block) looking for up to
1725 * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
1726 * found within 16MB, then return a free chunk of exactly the requested size (or
1727 * larger).
1728 *
1729 * If it seems like searching from the last offset will be unproductive, skip
1730 * that and just return a free chunk of exactly the requested size (or larger).
1731 * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
1732 * mechanism is probably not very useful and may be removed in the future.
1733 *
1734 * The behavior when not searching can be changed to return the largest free
1735 * chunk, instead of a free chunk of exactly the requested size, by setting
1736 * metaslab_df_use_largest_segment.
1737 * ==========================================================================
1738 */
1739static uint64_t
1740metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1741{
1742	/*
1743	 * Find the largest power of 2 block size that evenly divides the
1744	 * requested size. This is used to try to allocate blocks with similar
1745	 * alignment from the same area of the metaslab (i.e. same cursor
1746	 * bucket) but it does not guarantee that other allocations sizes
1747	 * may exist in the same region.
1748	 */
1749	uint64_t align = size & -size;
1750	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1751	range_tree_t *rt = msp->ms_allocatable;
1752	uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1753	uint64_t offset;
1754
1755	ASSERT(MUTEX_HELD(&msp->ms_lock));
1756
1757	/*
1758	 * If we're running low on space, find a segment based on size,
1759	 * rather than iterating based on offset.
1760	 */
1761	if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
1762	    free_pct < metaslab_df_free_pct) {
1763		offset = -1;
1764	} else {
1765		offset = metaslab_block_picker(rt,
1766		    cursor, size, metaslab_df_max_search);
1767	}
1768
1769	if (offset == -1) {
1770		range_seg_t *rs;
1771		if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
1772			metaslab_size_tree_full_load(msp->ms_allocatable);
1773
1774		if (metaslab_df_use_largest_segment) {
1775			/* use largest free segment */
1776			rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
1777		} else {
1778			zfs_btree_index_t where;
1779			/* use segment of this size, or next largest */
1780			rs = metaslab_block_find(&msp->ms_allocatable_by_size,
1781			    rt, msp->ms_start, size, &where);
1782		}
1783		if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
1784		    rt)) {
1785			offset = rs_get_start(rs, rt);
1786			*cursor = offset + size;
1787		}
1788	}
1789
1790	return (offset);
1791}
1792
1793/*
1794 * ==========================================================================
1795 * Cursor fit block allocator -
1796 * Select the largest region in the metaslab, set the cursor to the beginning
1797 * of the range and the cursor_end to the end of the range. As allocations
1798 * are made advance the cursor. Continue allocating from the cursor until
1799 * the range is exhausted and then find a new range.
1800 * ==========================================================================
1801 */
1802static uint64_t
1803metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1804{
1805	range_tree_t *rt = msp->ms_allocatable;
1806	zfs_btree_t *t = &msp->ms_allocatable_by_size;
1807	uint64_t *cursor = &msp->ms_lbas[0];
1808	uint64_t *cursor_end = &msp->ms_lbas[1];
1809	uint64_t offset = 0;
1810
1811	ASSERT(MUTEX_HELD(&msp->ms_lock));
1812
1813	ASSERT3U(*cursor_end, >=, *cursor);
1814
1815	if ((*cursor + size) > *cursor_end) {
1816		range_seg_t *rs;
1817
1818		if (zfs_btree_numnodes(t) == 0)
1819			metaslab_size_tree_full_load(msp->ms_allocatable);
1820		rs = zfs_btree_last(t, NULL);
1821		if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
1822		    size)
1823			return (-1ULL);
1824
1825		*cursor = rs_get_start(rs, rt);
1826		*cursor_end = rs_get_end(rs, rt);
1827	}
1828
1829	offset = *cursor;
1830	*cursor += size;
1831
1832	return (offset);
1833}
1834
1835/*
1836 * ==========================================================================
1837 * New dynamic fit allocator -
1838 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1839 * contiguous blocks. If no region is found then just use the largest segment
1840 * that remains.
1841 * ==========================================================================
1842 */
1843
1844/*
1845 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1846 * to request from the allocator.
1847 */
1848uint64_t metaslab_ndf_clump_shift = 4;
1849
1850static uint64_t
1851metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1852{
1853	zfs_btree_t *t = &msp->ms_allocatable->rt_root;
1854	range_tree_t *rt = msp->ms_allocatable;
1855	zfs_btree_index_t where;
1856	range_seg_t *rs;
1857	range_seg_max_t rsearch;
1858	uint64_t hbit = highbit64(size);
1859	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1860	uint64_t max_size = metaslab_largest_allocatable(msp);
1861
1862	ASSERT(MUTEX_HELD(&msp->ms_lock));
1863
1864	if (max_size < size)
1865		return (-1ULL);
1866
1867	rs_set_start(&rsearch, rt, *cursor);
1868	rs_set_end(&rsearch, rt, *cursor + size);
1869
1870	rs = zfs_btree_find(t, &rsearch, &where);
1871	if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
1872		t = &msp->ms_allocatable_by_size;
1873
1874		rs_set_start(&rsearch, rt, 0);
1875		rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
1876		    metaslab_ndf_clump_shift)));
1877
1878		rs = zfs_btree_find(t, &rsearch, &where);
1879		if (rs == NULL)
1880			rs = zfs_btree_next(t, &where, &where);
1881		ASSERT(rs != NULL);
1882	}
1883
1884	if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
1885		*cursor = rs_get_start(rs, rt) + size;
1886		return (rs_get_start(rs, rt));
1887	}
1888	return (-1ULL);
1889}
1890
1891/*
1892 * ==========================================================================
1893 * Metaslabs
1894 * ==========================================================================
1895 */
1896
1897/*
1898 * Wait for any in-progress metaslab loads to complete.
1899 */
1900static void
1901metaslab_load_wait(metaslab_t *msp)
1902{
1903	ASSERT(MUTEX_HELD(&msp->ms_lock));
1904
1905	while (msp->ms_loading) {
1906		ASSERT(!msp->ms_loaded);
1907		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1908	}
1909}
1910
1911/*
1912 * Wait for any in-progress flushing to complete.
1913 */
1914static void
1915metaslab_flush_wait(metaslab_t *msp)
1916{
1917	ASSERT(MUTEX_HELD(&msp->ms_lock));
1918
1919	while (msp->ms_flushing)
1920		cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
1921}
1922
1923static unsigned int
1924metaslab_idx_func(multilist_t *ml, void *arg)
1925{
1926	metaslab_t *msp = arg;
1927
1928	/*
1929	 * ms_id values are allocated sequentially, so full 64bit
1930	 * division would be a waste of time, so limit it to 32 bits.
1931	 */
1932	return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml));
1933}
1934
1935uint64_t
1936metaslab_allocated_space(metaslab_t *msp)
1937{
1938	return (msp->ms_allocated_space);
1939}
1940
1941/*
1942 * Verify that the space accounting on disk matches the in-core range_trees.
1943 */
1944static void
1945metaslab_verify_space(metaslab_t *msp, uint64_t txg)
1946{
1947	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1948	uint64_t allocating = 0;
1949	uint64_t sm_free_space, msp_free_space;
1950
1951	ASSERT(MUTEX_HELD(&msp->ms_lock));
1952	ASSERT(!msp->ms_condensing);
1953
1954	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1955		return;
1956
1957	/*
1958	 * We can only verify the metaslab space when we're called
1959	 * from syncing context with a loaded metaslab that has an
1960	 * allocated space map. Calling this in non-syncing context
1961	 * does not provide a consistent view of the metaslab since
1962	 * we're performing allocations in the future.
1963	 */
1964	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
1965	    !msp->ms_loaded)
1966		return;
1967
1968	/*
1969	 * Even though the smp_alloc field can get negative,
1970	 * when it comes to a metaslab's space map, that should
1971	 * never be the case.
1972	 */
1973	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
1974
1975	ASSERT3U(space_map_allocated(msp->ms_sm), >=,
1976	    range_tree_space(msp->ms_unflushed_frees));
1977
1978	ASSERT3U(metaslab_allocated_space(msp), ==,
1979	    space_map_allocated(msp->ms_sm) +
1980	    range_tree_space(msp->ms_unflushed_allocs) -
1981	    range_tree_space(msp->ms_unflushed_frees));
1982
1983	sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
1984
1985	/*
1986	 * Account for future allocations since we would have
1987	 * already deducted that space from the ms_allocatable.
1988	 */
1989	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
1990		allocating +=
1991		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
1992	}
1993	ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
1994	    msp->ms_allocating_total);
1995
1996	ASSERT3U(msp->ms_deferspace, ==,
1997	    range_tree_space(msp->ms_defer[0]) +
1998	    range_tree_space(msp->ms_defer[1]));
1999
2000	msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
2001	    msp->ms_deferspace + range_tree_space(msp->ms_freed);
2002
2003	VERIFY3U(sm_free_space, ==, msp_free_space);
2004}
2005
2006static void
2007metaslab_aux_histograms_clear(metaslab_t *msp)
2008{
2009	/*
2010	 * Auxiliary histograms are only cleared when resetting them,
2011	 * which can only happen while the metaslab is loaded.
2012	 */
2013	ASSERT(msp->ms_loaded);
2014
2015	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
2016	for (int t = 0; t < TXG_DEFER_SIZE; t++)
2017		memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t]));
2018}
2019
2020static void
2021metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
2022    range_tree_t *rt)
2023{
2024	/*
2025	 * This is modeled after space_map_histogram_add(), so refer to that
2026	 * function for implementation details. We want this to work like
2027	 * the space map histogram, and not the range tree histogram, as we
2028	 * are essentially constructing a delta that will be later subtracted
2029	 * from the space map histogram.
2030	 */
2031	int idx = 0;
2032	for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
2033		ASSERT3U(i, >=, idx + shift);
2034		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
2035
2036		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
2037			ASSERT3U(idx + shift, ==, i);
2038			idx++;
2039			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
2040		}
2041	}
2042}
2043
2044/*
2045 * Called at every sync pass that the metaslab gets synced.
2046 *
2047 * The reason is that we want our auxiliary histograms to be updated
2048 * wherever the metaslab's space map histogram is updated. This way
2049 * we stay consistent on which parts of the metaslab space map's
2050 * histogram are currently not available for allocations (e.g because
2051 * they are in the defer, freed, and freeing trees).
2052 */
2053static void
2054metaslab_aux_histograms_update(metaslab_t *msp)
2055{
2056	space_map_t *sm = msp->ms_sm;
2057	ASSERT(sm != NULL);
2058
2059	/*
2060	 * This is similar to the metaslab's space map histogram updates
2061	 * that take place in metaslab_sync(). The only difference is that
2062	 * we only care about segments that haven't made it into the
2063	 * ms_allocatable tree yet.
2064	 */
2065	if (msp->ms_loaded) {
2066		metaslab_aux_histograms_clear(msp);
2067
2068		metaslab_aux_histogram_add(msp->ms_synchist,
2069		    sm->sm_shift, msp->ms_freed);
2070
2071		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2072			metaslab_aux_histogram_add(msp->ms_deferhist[t],
2073			    sm->sm_shift, msp->ms_defer[t]);
2074		}
2075	}
2076
2077	metaslab_aux_histogram_add(msp->ms_synchist,
2078	    sm->sm_shift, msp->ms_freeing);
2079}
2080
2081/*
2082 * Called every time we are done syncing (writing to) the metaslab,
2083 * i.e. at the end of each sync pass.
2084 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
2085 */
2086static void
2087metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
2088{
2089	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2090	space_map_t *sm = msp->ms_sm;
2091
2092	if (sm == NULL) {
2093		/*
2094		 * We came here from metaslab_init() when creating/opening a
2095		 * pool, looking at a metaslab that hasn't had any allocations
2096		 * yet.
2097		 */
2098		return;
2099	}
2100
2101	/*
2102	 * This is similar to the actions that we take for the ms_freed
2103	 * and ms_defer trees in metaslab_sync_done().
2104	 */
2105	uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
2106	if (defer_allowed) {
2107		memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist,
2108		    sizeof (msp->ms_synchist));
2109	} else {
2110		memset(msp->ms_deferhist[hist_index], 0,
2111		    sizeof (msp->ms_deferhist[hist_index]));
2112	}
2113	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
2114}
2115
2116/*
2117 * Ensure that the metaslab's weight and fragmentation are consistent
2118 * with the contents of the histogram (either the range tree's histogram
2119 * or the space map's depending whether the metaslab is loaded).
2120 */
2121static void
2122metaslab_verify_weight_and_frag(metaslab_t *msp)
2123{
2124	ASSERT(MUTEX_HELD(&msp->ms_lock));
2125
2126	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
2127		return;
2128
2129	/*
2130	 * We can end up here from vdev_remove_complete(), in which case we
2131	 * cannot do these assertions because we hold spa config locks and
2132	 * thus we are not allowed to read from the DMU.
2133	 *
2134	 * We check if the metaslab group has been removed and if that's
2135	 * the case we return immediately as that would mean that we are
2136	 * here from the aforementioned code path.
2137	 */
2138	if (msp->ms_group == NULL)
2139		return;
2140
2141	/*
2142	 * Devices being removed always return a weight of 0 and leave
2143	 * fragmentation and ms_max_size as is - there is nothing for
2144	 * us to verify here.
2145	 */
2146	vdev_t *vd = msp->ms_group->mg_vd;
2147	if (vd->vdev_removing)
2148		return;
2149
2150	/*
2151	 * If the metaslab is dirty it probably means that we've done
2152	 * some allocations or frees that have changed our histograms
2153	 * and thus the weight.
2154	 */
2155	for (int t = 0; t < TXG_SIZE; t++) {
2156		if (txg_list_member(&vd->vdev_ms_list, msp, t))
2157			return;
2158	}
2159
2160	/*
2161	 * This verification checks that our in-memory state is consistent
2162	 * with what's on disk. If the pool is read-only then there aren't
2163	 * any changes and we just have the initially-loaded state.
2164	 */
2165	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
2166		return;
2167
2168	/* some extra verification for in-core tree if you can */
2169	if (msp->ms_loaded) {
2170		range_tree_stat_verify(msp->ms_allocatable);
2171		VERIFY(space_map_histogram_verify(msp->ms_sm,
2172		    msp->ms_allocatable));
2173	}
2174
2175	uint64_t weight = msp->ms_weight;
2176	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2177	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
2178	uint64_t frag = msp->ms_fragmentation;
2179	uint64_t max_segsize = msp->ms_max_size;
2180
2181	msp->ms_weight = 0;
2182	msp->ms_fragmentation = 0;
2183
2184	/*
2185	 * This function is used for verification purposes and thus should
2186	 * not introduce any side-effects/mutations on the system's state.
2187	 *
2188	 * Regardless of whether metaslab_weight() thinks this metaslab
2189	 * should be active or not, we want to ensure that the actual weight
2190	 * (and therefore the value of ms_weight) would be the same if it
2191	 * was to be recalculated at this point.
2192	 *
2193	 * In addition we set the nodirty flag so metaslab_weight() does
2194	 * not dirty the metaslab for future TXGs (e.g. when trying to
2195	 * force condensing to upgrade the metaslab spacemaps).
2196	 */
2197	msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
2198
2199	VERIFY3U(max_segsize, ==, msp->ms_max_size);
2200
2201	/*
2202	 * If the weight type changed then there is no point in doing
2203	 * verification. Revert fields to their original values.
2204	 */
2205	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
2206	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
2207		msp->ms_fragmentation = frag;
2208		msp->ms_weight = weight;
2209		return;
2210	}
2211
2212	VERIFY3U(msp->ms_fragmentation, ==, frag);
2213	VERIFY3U(msp->ms_weight, ==, weight);
2214}
2215
2216/*
2217 * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
2218 * this class that was used longest ago, and attempt to unload it.  We don't
2219 * want to spend too much time in this loop to prevent performance
2220 * degradation, and we expect that most of the time this operation will
2221 * succeed. Between that and the normal unloading processing during txg sync,
2222 * we expect this to keep the metaslab memory usage under control.
2223 */
2224static void
2225metaslab_potentially_evict(metaslab_class_t *mc)
2226{
2227#ifdef _KERNEL
2228	uint64_t allmem = arc_all_memory();
2229	uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
2230	uint64_t size =	spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
2231	uint_t tries = 0;
2232	for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
2233	    tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
2234	    tries++) {
2235		unsigned int idx = multilist_get_random_index(
2236		    &mc->mc_metaslab_txg_list);
2237		multilist_sublist_t *mls =
2238		    multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
2239		metaslab_t *msp = multilist_sublist_head(mls);
2240		multilist_sublist_unlock(mls);
2241		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
2242		    inuse * size) {
2243			VERIFY3P(mls, ==, multilist_sublist_lock_idx(
2244			    &mc->mc_metaslab_txg_list, idx));
2245			ASSERT3U(idx, ==,
2246			    metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
2247
2248			if (!multilist_link_active(&msp->ms_class_txg_node)) {
2249				multilist_sublist_unlock(mls);
2250				break;
2251			}
2252			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
2253			multilist_sublist_unlock(mls);
2254			/*
2255			 * If the metaslab is currently loading there are two
2256			 * cases. If it's the metaslab we're evicting, we
2257			 * can't continue on or we'll panic when we attempt to
2258			 * recursively lock the mutex. If it's another
2259			 * metaslab that's loading, it can be safely skipped,
2260			 * since we know it's very new and therefore not a
2261			 * good eviction candidate. We check later once the
2262			 * lock is held that the metaslab is fully loaded
2263			 * before actually unloading it.
2264			 */
2265			if (msp->ms_loading) {
2266				msp = next_msp;
2267				inuse =
2268				    spl_kmem_cache_inuse(zfs_btree_leaf_cache);
2269				continue;
2270			}
2271			/*
2272			 * We can't unload metaslabs with no spacemap because
2273			 * they're not ready to be unloaded yet. We can't
2274			 * unload metaslabs with outstanding allocations
2275			 * because doing so could cause the metaslab's weight
2276			 * to decrease while it's unloaded, which violates an
2277			 * invariant that we use to prevent unnecessary
2278			 * loading. We also don't unload metaslabs that are
2279			 * currently active because they are high-weight
2280			 * metaslabs that are likely to be used in the near
2281			 * future.
2282			 */
2283			mutex_enter(&msp->ms_lock);
2284			if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
2285			    msp->ms_allocating_total == 0) {
2286				metaslab_unload(msp);
2287			}
2288			mutex_exit(&msp->ms_lock);
2289			msp = next_msp;
2290			inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
2291		}
2292	}
2293#else
2294	(void) mc, (void) zfs_metaslab_mem_limit;
2295#endif
2296}
2297
2298static int
2299metaslab_load_impl(metaslab_t *msp)
2300{
2301	int error = 0;
2302
2303	ASSERT(MUTEX_HELD(&msp->ms_lock));
2304	ASSERT(msp->ms_loading);
2305	ASSERT(!msp->ms_condensing);
2306
2307	/*
2308	 * We temporarily drop the lock to unblock other operations while we
2309	 * are reading the space map. Therefore, metaslab_sync() and
2310	 * metaslab_sync_done() can run at the same time as we do.
2311	 *
2312	 * If we are using the log space maps, metaslab_sync() can't write to
2313	 * the metaslab's space map while we are loading as we only write to
2314	 * it when we are flushing the metaslab, and that can't happen while
2315	 * we are loading it.
2316	 *
2317	 * If we are not using log space maps though, metaslab_sync() can
2318	 * append to the space map while we are loading. Therefore we load
2319	 * only entries that existed when we started the load. Additionally,
2320	 * metaslab_sync_done() has to wait for the load to complete because
2321	 * there are potential races like metaslab_load() loading parts of the
2322	 * space map that are currently being appended by metaslab_sync(). If
2323	 * we didn't, the ms_allocatable would have entries that
2324	 * metaslab_sync_done() would try to re-add later.
2325	 *
2326	 * That's why before dropping the lock we remember the synced length
2327	 * of the metaslab and read up to that point of the space map,
2328	 * ignoring entries appended by metaslab_sync() that happen after we
2329	 * drop the lock.
2330	 */
2331	uint64_t length = msp->ms_synced_length;
2332	mutex_exit(&msp->ms_lock);
2333
2334	hrtime_t load_start = gethrtime();
2335	metaslab_rt_arg_t *mrap;
2336	if (msp->ms_allocatable->rt_arg == NULL) {
2337		mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
2338	} else {
2339		mrap = msp->ms_allocatable->rt_arg;
2340		msp->ms_allocatable->rt_ops = NULL;
2341		msp->ms_allocatable->rt_arg = NULL;
2342	}
2343	mrap->mra_bt = &msp->ms_allocatable_by_size;
2344	mrap->mra_floor_shift = metaslab_by_size_min_shift;
2345
2346	if (msp->ms_sm != NULL) {
2347		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
2348		    SM_FREE, length);
2349
2350		/* Now, populate the size-sorted tree. */
2351		metaslab_rt_create(msp->ms_allocatable, mrap);
2352		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
2353		msp->ms_allocatable->rt_arg = mrap;
2354
2355		struct mssa_arg arg = {0};
2356		arg.rt = msp->ms_allocatable;
2357		arg.mra = mrap;
2358		range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
2359		    &arg);
2360	} else {
2361		/*
2362		 * Add the size-sorted tree first, since we don't need to load
2363		 * the metaslab from the spacemap.
2364		 */
2365		metaslab_rt_create(msp->ms_allocatable, mrap);
2366		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
2367		msp->ms_allocatable->rt_arg = mrap;
2368		/*
2369		 * The space map has not been allocated yet, so treat
2370		 * all the space in the metaslab as free and add it to the
2371		 * ms_allocatable tree.
2372		 */
2373		range_tree_add(msp->ms_allocatable,
2374		    msp->ms_start, msp->ms_size);
2375
2376		if (msp->ms_new) {
2377			/*
2378			 * If the ms_sm doesn't exist, this means that this
2379			 * metaslab hasn't gone through metaslab_sync() and
2380			 * thus has never been dirtied. So we shouldn't
2381			 * expect any unflushed allocs or frees from previous
2382			 * TXGs.
2383			 */
2384			ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
2385			ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
2386		}
2387	}
2388
2389	/*
2390	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
2391	 * changing the ms_sm (or log_sm) and the metaslab's range trees
2392	 * while we are about to use them and populate the ms_allocatable.
2393	 * The ms_lock is insufficient for this because metaslab_sync() doesn't
2394	 * hold the ms_lock while writing the ms_checkpointing tree to disk.
2395	 */
2396	mutex_enter(&msp->ms_sync_lock);
2397	mutex_enter(&msp->ms_lock);
2398
2399	ASSERT(!msp->ms_condensing);
2400	ASSERT(!msp->ms_flushing);
2401
2402	if (error != 0) {
2403		mutex_exit(&msp->ms_sync_lock);
2404		return (error);
2405	}
2406
2407	ASSERT3P(msp->ms_group, !=, NULL);
2408	msp->ms_loaded = B_TRUE;
2409
2410	/*
2411	 * Apply all the unflushed changes to ms_allocatable right
2412	 * away so any manipulations we do below have a clear view
2413	 * of what is allocated and what is free.
2414	 */
2415	range_tree_walk(msp->ms_unflushed_allocs,
2416	    range_tree_remove, msp->ms_allocatable);
2417	range_tree_walk(msp->ms_unflushed_frees,
2418	    range_tree_add, msp->ms_allocatable);
2419
2420	ASSERT3P(msp->ms_group, !=, NULL);
2421	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2422	if (spa_syncing_log_sm(spa) != NULL) {
2423		ASSERT(spa_feature_is_enabled(spa,
2424		    SPA_FEATURE_LOG_SPACEMAP));
2425
2426		/*
2427		 * If we use a log space map we add all the segments
2428		 * that are in ms_unflushed_frees so they are available
2429		 * for allocation.
2430		 *
2431		 * ms_allocatable needs to contain all free segments
2432		 * that are ready for allocations (thus not segments
2433		 * from ms_freeing, ms_freed, and the ms_defer trees).
2434		 * But if we grab the lock in this code path at a sync
2435		 * pass later that 1, then it also contains the
2436		 * segments of ms_freed (they were added to it earlier
2437		 * in this path through ms_unflushed_frees). So we
2438		 * need to remove all the segments that exist in
2439		 * ms_freed from ms_allocatable as they will be added
2440		 * later in metaslab_sync_done().
2441		 *
2442		 * When there's no log space map, the ms_allocatable
2443		 * correctly doesn't contain any segments that exist
2444		 * in ms_freed [see ms_synced_length].
2445		 */
2446		range_tree_walk(msp->ms_freed,
2447		    range_tree_remove, msp->ms_allocatable);
2448	}
2449
2450	/*
2451	 * If we are not using the log space map, ms_allocatable
2452	 * contains the segments that exist in the ms_defer trees
2453	 * [see ms_synced_length]. Thus we need to remove them
2454	 * from ms_allocatable as they will be added again in
2455	 * metaslab_sync_done().
2456	 *
2457	 * If we are using the log space map, ms_allocatable still
2458	 * contains the segments that exist in the ms_defer trees.
2459	 * Not because it read them through the ms_sm though. But
2460	 * because these segments are part of ms_unflushed_frees
2461	 * whose segments we add to ms_allocatable earlier in this
2462	 * code path.
2463	 */
2464	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2465		range_tree_walk(msp->ms_defer[t],
2466		    range_tree_remove, msp->ms_allocatable);
2467	}
2468
2469	/*
2470	 * Call metaslab_recalculate_weight_and_sort() now that the
2471	 * metaslab is loaded so we get the metaslab's real weight.
2472	 *
2473	 * Unless this metaslab was created with older software and
2474	 * has not yet been converted to use segment-based weight, we
2475	 * expect the new weight to be better or equal to the weight
2476	 * that the metaslab had while it was not loaded. This is
2477	 * because the old weight does not take into account the
2478	 * consolidation of adjacent segments between TXGs. [see
2479	 * comment for ms_synchist and ms_deferhist[] for more info]
2480	 */
2481	uint64_t weight = msp->ms_weight;
2482	uint64_t max_size = msp->ms_max_size;
2483	metaslab_recalculate_weight_and_sort(msp);
2484	if (!WEIGHT_IS_SPACEBASED(weight))
2485		ASSERT3U(weight, <=, msp->ms_weight);
2486	msp->ms_max_size = metaslab_largest_allocatable(msp);
2487	ASSERT3U(max_size, <=, msp->ms_max_size);
2488	hrtime_t load_end = gethrtime();
2489	msp->ms_load_time = load_end;
2490	zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
2491	    "ms_id %llu, smp_length %llu, "
2492	    "unflushed_allocs %llu, unflushed_frees %llu, "
2493	    "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
2494	    "loading_time %lld ms, ms_max_size %llu, "
2495	    "max size error %lld, "
2496	    "old_weight %llx, new_weight %llx",
2497	    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
2498	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
2499	    (u_longlong_t)msp->ms_id,
2500	    (u_longlong_t)space_map_length(msp->ms_sm),
2501	    (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
2502	    (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
2503	    (u_longlong_t)range_tree_space(msp->ms_freed),
2504	    (u_longlong_t)range_tree_space(msp->ms_defer[0]),
2505	    (u_longlong_t)range_tree_space(msp->ms_defer[1]),
2506	    (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
2507	    (longlong_t)((load_end - load_start) / 1000000),
2508	    (u_longlong_t)msp->ms_max_size,
2509	    (u_longlong_t)msp->ms_max_size - max_size,
2510	    (u_longlong_t)weight, (u_longlong_t)msp->ms_weight);
2511
2512	metaslab_verify_space(msp, spa_syncing_txg(spa));
2513	mutex_exit(&msp->ms_sync_lock);
2514	return (0);
2515}
2516
2517int
2518metaslab_load(metaslab_t *msp)
2519{
2520	ASSERT(MUTEX_HELD(&msp->ms_lock));
2521
2522	/*
2523	 * There may be another thread loading the same metaslab, if that's
2524	 * the case just wait until the other thread is done and return.
2525	 */
2526	metaslab_load_wait(msp);
2527	if (msp->ms_loaded)
2528		return (0);
2529	VERIFY(!msp->ms_loading);
2530	ASSERT(!msp->ms_condensing);
2531
2532	/*
2533	 * We set the loading flag BEFORE potentially dropping the lock to
2534	 * wait for an ongoing flush (see ms_flushing below). This way other
2535	 * threads know that there is already a thread that is loading this
2536	 * metaslab.
2537	 */
2538	msp->ms_loading = B_TRUE;
2539
2540	/*
2541	 * Wait for any in-progress flushing to finish as we drop the ms_lock
2542	 * both here (during space_map_load()) and in metaslab_flush() (when
2543	 * we flush our changes to the ms_sm).
2544	 */
2545	if (msp->ms_flushing)
2546		metaslab_flush_wait(msp);
2547
2548	/*
2549	 * In the possibility that we were waiting for the metaslab to be
2550	 * flushed (where we temporarily dropped the ms_lock), ensure that
2551	 * no one else loaded the metaslab somehow.
2552	 */
2553	ASSERT(!msp->ms_loaded);
2554
2555	/*
2556	 * If we're loading a metaslab in the normal class, consider evicting
2557	 * another one to keep our memory usage under the limit defined by the
2558	 * zfs_metaslab_mem_limit tunable.
2559	 */
2560	if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
2561	    msp->ms_group->mg_class) {
2562		metaslab_potentially_evict(msp->ms_group->mg_class);
2563	}
2564
2565	int error = metaslab_load_impl(msp);
2566
2567	ASSERT(MUTEX_HELD(&msp->ms_lock));
2568	msp->ms_loading = B_FALSE;
2569	cv_broadcast(&msp->ms_load_cv);
2570
2571	return (error);
2572}
2573
2574void
2575metaslab_unload(metaslab_t *msp)
2576{
2577	ASSERT(MUTEX_HELD(&msp->ms_lock));
2578
2579	/*
2580	 * This can happen if a metaslab is selected for eviction (in
2581	 * metaslab_potentially_evict) and then unloaded during spa_sync (via
2582	 * metaslab_class_evict_old).
2583	 */
2584	if (!msp->ms_loaded)
2585		return;
2586
2587	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
2588	msp->ms_loaded = B_FALSE;
2589	msp->ms_unload_time = gethrtime();
2590
2591	msp->ms_activation_weight = 0;
2592	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
2593
2594	if (msp->ms_group != NULL) {
2595		metaslab_class_t *mc = msp->ms_group->mg_class;
2596		multilist_sublist_t *mls =
2597		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
2598		if (multilist_link_active(&msp->ms_class_txg_node))
2599			multilist_sublist_remove(mls, msp);
2600		multilist_sublist_unlock(mls);
2601
2602		spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2603		zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
2604		    "ms_id %llu, weight %llx, "
2605		    "selected txg %llu (%llu ms ago), alloc_txg %llu, "
2606		    "loaded %llu ms ago, max_size %llu",
2607		    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
2608		    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
2609		    (u_longlong_t)msp->ms_id,
2610		    (u_longlong_t)msp->ms_weight,
2611		    (u_longlong_t)msp->ms_selected_txg,
2612		    (u_longlong_t)(msp->ms_unload_time -
2613		    msp->ms_selected_time) / 1000 / 1000,
2614		    (u_longlong_t)msp->ms_alloc_txg,
2615		    (u_longlong_t)(msp->ms_unload_time -
2616		    msp->ms_load_time) / 1000 / 1000,
2617		    (u_longlong_t)msp->ms_max_size);
2618	}
2619
2620	/*
2621	 * We explicitly recalculate the metaslab's weight based on its space
2622	 * map (as it is now not loaded). We want unload metaslabs to always
2623	 * have their weights calculated from the space map histograms, while
2624	 * loaded ones have it calculated from their in-core range tree
2625	 * [see metaslab_load()]. This way, the weight reflects the information
2626	 * available in-core, whether it is loaded or not.
2627	 *
2628	 * If ms_group == NULL means that we came here from metaslab_fini(),
2629	 * at which point it doesn't make sense for us to do the recalculation
2630	 * and the sorting.
2631	 */
2632	if (msp->ms_group != NULL)
2633		metaslab_recalculate_weight_and_sort(msp);
2634}
2635
2636/*
2637 * We want to optimize the memory use of the per-metaslab range
2638 * trees. To do this, we store the segments in the range trees in
2639 * units of sectors, zero-indexing from the start of the metaslab. If
2640 * the vdev_ms_shift - the vdev_ashift is less than 32, we can store
2641 * the ranges using two uint32_ts, rather than two uint64_ts.
2642 */
2643range_seg_type_t
2644metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
2645    uint64_t *start, uint64_t *shift)
2646{
2647	if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
2648	    !zfs_metaslab_force_large_segs) {
2649		*shift = vdev->vdev_ashift;
2650		*start = msp->ms_start;
2651		return (RANGE_SEG32);
2652	} else {
2653		*shift = 0;
2654		*start = 0;
2655		return (RANGE_SEG64);
2656	}
2657}
2658
2659void
2660metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
2661{
2662	ASSERT(MUTEX_HELD(&msp->ms_lock));
2663	metaslab_class_t *mc = msp->ms_group->mg_class;
2664	multilist_sublist_t *mls =
2665	    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
2666	if (multilist_link_active(&msp->ms_class_txg_node))
2667		multilist_sublist_remove(mls, msp);
2668	msp->ms_selected_txg = txg;
2669	msp->ms_selected_time = gethrtime();
2670	multilist_sublist_insert_tail(mls, msp);
2671	multilist_sublist_unlock(mls);
2672}
2673
2674void
2675metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
2676    int64_t defer_delta, int64_t space_delta)
2677{
2678	vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
2679
2680	ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
2681	ASSERT(vd->vdev_ms_count != 0);
2682
2683	metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
2684	    vdev_deflated_space(vd, space_delta));
2685}
2686
2687int
2688metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
2689    uint64_t txg, metaslab_t **msp)
2690{
2691	vdev_t *vd = mg->mg_vd;
2692	spa_t *spa = vd->vdev_spa;
2693	objset_t *mos = spa->spa_meta_objset;
2694	metaslab_t *ms;
2695	int error;
2696
2697	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
2698	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
2699	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
2700	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
2701	cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
2702	multilist_link_init(&ms->ms_class_txg_node);
2703
2704	ms->ms_id = id;
2705	ms->ms_start = id << vd->vdev_ms_shift;
2706	ms->ms_size = 1ULL << vd->vdev_ms_shift;
2707	ms->ms_allocator = -1;
2708	ms->ms_new = B_TRUE;
2709
2710	vdev_ops_t *ops = vd->vdev_ops;
2711	if (ops->vdev_op_metaslab_init != NULL)
2712		ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
2713
2714	/*
2715	 * We only open space map objects that already exist. All others
2716	 * will be opened when we finally allocate an object for it. For
2717	 * readonly pools there is no need to open the space map object.
2718	 *
2719	 * Note:
2720	 * When called from vdev_expand(), we can't call into the DMU as
2721	 * we are holding the spa_config_lock as a writer and we would
2722	 * deadlock [see relevant comment in vdev_metaslab_init()]. in
2723	 * that case, the object parameter is zero though, so we won't
2724	 * call into the DMU.
2725	 */
2726	if (object != 0 && !(spa->spa_mode == SPA_MODE_READ &&
2727	    !spa->spa_read_spacemaps)) {
2728		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
2729		    ms->ms_size, vd->vdev_ashift);
2730
2731		if (error != 0) {
2732			kmem_free(ms, sizeof (metaslab_t));
2733			return (error);
2734		}
2735
2736		ASSERT(ms->ms_sm != NULL);
2737		ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
2738	}
2739
2740	uint64_t shift, start;
2741	range_seg_type_t type =
2742	    metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
2743
2744	ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
2745	for (int t = 0; t < TXG_SIZE; t++) {
2746		ms->ms_allocating[t] = range_tree_create(NULL, type,
2747		    NULL, start, shift);
2748	}
2749	ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift);
2750	ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift);
2751	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2752		ms->ms_defer[t] = range_tree_create(NULL, type, NULL,
2753		    start, shift);
2754	}
2755	ms->ms_checkpointing =
2756	    range_tree_create(NULL, type, NULL, start, shift);
2757	ms->ms_unflushed_allocs =
2758	    range_tree_create(NULL, type, NULL, start, shift);
2759
2760	metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
2761	mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
2762	mrap->mra_floor_shift = metaslab_by_size_min_shift;
2763	ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
2764	    type, mrap, start, shift);
2765
2766	ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
2767
2768	metaslab_group_add(mg, ms);
2769	metaslab_set_fragmentation(ms, B_FALSE);
2770
2771	/*
2772	 * If we're opening an existing pool (txg == 0) or creating
2773	 * a new one (txg == TXG_INITIAL), all space is available now.
2774	 * If we're adding space to an existing pool, the new space
2775	 * does not become available until after this txg has synced.
2776	 * The metaslab's weight will also be initialized when we sync
2777	 * out this txg. This ensures that we don't attempt to allocate
2778	 * from it before we have initialized it completely.
2779	 */
2780	if (txg <= TXG_INITIAL) {
2781		metaslab_sync_done(ms, 0);
2782		metaslab_space_update(vd, mg->mg_class,
2783		    metaslab_allocated_space(ms), 0, 0);
2784	}
2785
2786	if (txg != 0) {
2787		vdev_dirty(vd, 0, NULL, txg);
2788		vdev_dirty(vd, VDD_METASLAB, ms, txg);
2789	}
2790
2791	*msp = ms;
2792
2793	return (0);
2794}
2795
2796static void
2797metaslab_fini_flush_data(metaslab_t *msp)
2798{
2799	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2800
2801	if (metaslab_unflushed_txg(msp) == 0) {
2802		ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
2803		    ==, NULL);
2804		return;
2805	}
2806	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
2807
2808	mutex_enter(&spa->spa_flushed_ms_lock);
2809	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
2810	mutex_exit(&spa->spa_flushed_ms_lock);
2811
2812	spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2813	spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp),
2814	    metaslab_unflushed_dirty(msp));
2815}
2816
2817uint64_t
2818metaslab_unflushed_changes_memused(metaslab_t *ms)
2819{
2820	return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
2821	    range_tree_numsegs(ms->ms_unflushed_frees)) *
2822	    ms->ms_unflushed_allocs->rt_root.bt_elem_size);
2823}
2824
2825void
2826metaslab_fini(metaslab_t *msp)
2827{
2828	metaslab_group_t *mg = msp->ms_group;
2829	vdev_t *vd = mg->mg_vd;
2830	spa_t *spa = vd->vdev_spa;
2831
2832	metaslab_fini_flush_data(msp);
2833
2834	metaslab_group_remove(mg, msp);
2835
2836	mutex_enter(&msp->ms_lock);
2837	VERIFY(msp->ms_group == NULL);
2838
2839	/*
2840	 * If this metaslab hasn't been through metaslab_sync_done() yet its
2841	 * space hasn't been accounted for in its vdev and doesn't need to be
2842	 * subtracted.
2843	 */
2844	if (!msp->ms_new) {
2845		metaslab_space_update(vd, mg->mg_class,
2846		    -metaslab_allocated_space(msp), 0, -msp->ms_size);
2847
2848	}
2849	space_map_close(msp->ms_sm);
2850	msp->ms_sm = NULL;
2851
2852	metaslab_unload(msp);
2853
2854	range_tree_destroy(msp->ms_allocatable);
2855	range_tree_destroy(msp->ms_freeing);
2856	range_tree_destroy(msp->ms_freed);
2857
2858	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
2859	    metaslab_unflushed_changes_memused(msp));
2860	spa->spa_unflushed_stats.sus_memused -=
2861	    metaslab_unflushed_changes_memused(msp);
2862	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
2863	range_tree_destroy(msp->ms_unflushed_allocs);
2864	range_tree_destroy(msp->ms_checkpointing);
2865	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
2866	range_tree_destroy(msp->ms_unflushed_frees);
2867
2868	for (int t = 0; t < TXG_SIZE; t++) {
2869		range_tree_destroy(msp->ms_allocating[t]);
2870	}
2871	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2872		range_tree_destroy(msp->ms_defer[t]);
2873	}
2874	ASSERT0(msp->ms_deferspace);
2875
2876	for (int t = 0; t < TXG_SIZE; t++)
2877		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
2878
2879	range_tree_vacate(msp->ms_trim, NULL, NULL);
2880	range_tree_destroy(msp->ms_trim);
2881
2882	mutex_exit(&msp->ms_lock);
2883	cv_destroy(&msp->ms_load_cv);
2884	cv_destroy(&msp->ms_flush_cv);
2885	mutex_destroy(&msp->ms_lock);
2886	mutex_destroy(&msp->ms_sync_lock);
2887	ASSERT3U(msp->ms_allocator, ==, -1);
2888
2889	kmem_free(msp, sizeof (metaslab_t));
2890}
2891
2892#define	FRAGMENTATION_TABLE_SIZE	17
2893
2894/*
2895 * This table defines a segment size based fragmentation metric that will
2896 * allow each metaslab to derive its own fragmentation value. This is done
2897 * by calculating the space in each bucket of the spacemap histogram and
2898 * multiplying that by the fragmentation metric in this table. Doing
2899 * this for all buckets and dividing it by the total amount of free
2900 * space in this metaslab (i.e. the total free space in all buckets) gives
2901 * us the fragmentation metric. This means that a high fragmentation metric
2902 * equates to most of the free space being comprised of small segments.
2903 * Conversely, if the metric is low, then most of the free space is in
2904 * large segments. A 10% change in fragmentation equates to approximately
2905 * double the number of segments.
2906 *
2907 * This table defines 0% fragmented space using 16MB segments. Testing has
2908 * shown that segments that are greater than or equal to 16MB do not suffer
2909 * from drastic performance problems. Using this value, we derive the rest
2910 * of the table. Since the fragmentation value is never stored on disk, it
2911 * is possible to change these calculations in the future.
2912 */
2913static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
2914	100,	/* 512B	*/
2915	100,	/* 1K	*/
2916	98,	/* 2K	*/
2917	95,	/* 4K	*/
2918	90,	/* 8K	*/
2919	80,	/* 16K	*/
2920	70,	/* 32K	*/
2921	60,	/* 64K	*/
2922	50,	/* 128K	*/
2923	40,	/* 256K	*/
2924	30,	/* 512K	*/
2925	20,	/* 1M	*/
2926	15,	/* 2M	*/
2927	10,	/* 4M	*/
2928	5,	/* 8M	*/
2929	0	/* 16M	*/
2930};
2931
2932/*
2933 * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
2934 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
2935 * been upgraded and does not support this metric. Otherwise, the return
2936 * value should be in the range [0, 100].
2937 */
2938static void
2939metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
2940{
2941	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2942	uint64_t fragmentation = 0;
2943	uint64_t total = 0;
2944	boolean_t feature_enabled = spa_feature_is_enabled(spa,
2945	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
2946
2947	if (!feature_enabled) {
2948		msp->ms_fragmentation = ZFS_FRAG_INVALID;
2949		return;
2950	}
2951
2952	/*
2953	 * A null space map means that the entire metaslab is free
2954	 * and thus is not fragmented.
2955	 */
2956	if (msp->ms_sm == NULL) {
2957		msp->ms_fragmentation = 0;
2958		return;
2959	}
2960
2961	/*
2962	 * If this metaslab's space map has not been upgraded, flag it
2963	 * so that we upgrade next time we encounter it.
2964	 */
2965	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
2966		uint64_t txg = spa_syncing_txg(spa);
2967		vdev_t *vd = msp->ms_group->mg_vd;
2968
2969		/*
2970		 * If we've reached the final dirty txg, then we must
2971		 * be shutting down the pool. We don't want to dirty
2972		 * any data past this point so skip setting the condense
2973		 * flag. We can retry this action the next time the pool
2974		 * is imported. We also skip marking this metaslab for
2975		 * condensing if the caller has explicitly set nodirty.
2976		 */
2977		if (!nodirty &&
2978		    spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
2979			msp->ms_condense_wanted = B_TRUE;
2980			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2981			zfs_dbgmsg("txg %llu, requesting force condense: "
2982			    "ms_id %llu, vdev_id %llu", (u_longlong_t)txg,
2983			    (u_longlong_t)msp->ms_id,
2984			    (u_longlong_t)vd->vdev_id);
2985		}
2986		msp->ms_fragmentation = ZFS_FRAG_INVALID;
2987		return;
2988	}
2989
2990	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2991		uint64_t space = 0;
2992		uint8_t shift = msp->ms_sm->sm_shift;
2993
2994		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
2995		    FRAGMENTATION_TABLE_SIZE - 1);
2996
2997		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
2998			continue;
2999
3000		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
3001		total += space;
3002
3003		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
3004		fragmentation += space * zfs_frag_table[idx];
3005	}
3006
3007	if (total > 0)
3008		fragmentation /= total;
3009	ASSERT3U(fragmentation, <=, 100);
3010
3011	msp->ms_fragmentation = fragmentation;
3012}
3013
3014/*
3015 * Compute a weight -- a selection preference value -- for the given metaslab.
3016 * This is based on the amount of free space, the level of fragmentation,
3017 * the LBA range, and whether the metaslab is loaded.
3018 */
3019static uint64_t
3020metaslab_space_weight(metaslab_t *msp)
3021{
3022	metaslab_group_t *mg = msp->ms_group;
3023	vdev_t *vd = mg->mg_vd;
3024	uint64_t weight, space;
3025
3026	ASSERT(MUTEX_HELD(&msp->ms_lock));
3027
3028	/*
3029	 * The baseline weight is the metaslab's free space.
3030	 */
3031	space = msp->ms_size - metaslab_allocated_space(msp);
3032
3033	if (metaslab_fragmentation_factor_enabled &&
3034	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
3035		/*
3036		 * Use the fragmentation information to inversely scale
3037		 * down the baseline weight. We need to ensure that we
3038		 * don't exclude this metaslab completely when it's 100%
3039		 * fragmented. To avoid this we reduce the fragmented value
3040		 * by 1.
3041		 */
3042		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
3043
3044		/*
3045		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
3046		 * this metaslab again. The fragmentation metric may have
3047		 * decreased the space to something smaller than
3048		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
3049		 * so that we can consume any remaining space.
3050		 */
3051		if (space > 0 && space < SPA_MINBLOCKSIZE)
3052			space = SPA_MINBLOCKSIZE;
3053	}
3054	weight = space;
3055
3056	/*
3057	 * Modern disks have uniform bit density and constant angular velocity.
3058	 * Therefore, the outer recording zones are faster (higher bandwidth)
3059	 * than the inner zones by the ratio of outer to inner track diameter,
3060	 * which is typically around 2:1.  We account for this by assigning
3061	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
3062	 * In effect, this means that we'll select the metaslab with the most
3063	 * free bandwidth rather than simply the one with the most free space.
3064	 */
3065	if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
3066		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
3067		ASSERT(weight >= space && weight <= 2 * space);
3068	}
3069
3070	/*
3071	 * If this metaslab is one we're actively using, adjust its
3072	 * weight to make it preferable to any inactive metaslab so
3073	 * we'll polish it off. If the fragmentation on this metaslab
3074	 * has exceed our threshold, then don't mark it active.
3075	 */
3076	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
3077	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
3078		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
3079	}
3080
3081	WEIGHT_SET_SPACEBASED(weight);
3082	return (weight);
3083}
3084
3085/*
3086 * Return the weight of the specified metaslab, according to the segment-based
3087 * weighting algorithm. The metaslab must be loaded. This function can
3088 * be called within a sync pass since it relies only on the metaslab's
3089 * range tree which is always accurate when the metaslab is loaded.
3090 */
3091static uint64_t
3092metaslab_weight_from_range_tree(metaslab_t *msp)
3093{
3094	uint64_t weight = 0;
3095	uint32_t segments = 0;
3096
3097	ASSERT(msp->ms_loaded);
3098
3099	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
3100	    i--) {
3101		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
3102		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
3103
3104		segments <<= 1;
3105		segments += msp->ms_allocatable->rt_histogram[i];
3106
3107		/*
3108		 * The range tree provides more precision than the space map
3109		 * and must be downgraded so that all values fit within the
3110		 * space map's histogram. This allows us to compare loaded
3111		 * vs. unloaded metaslabs to determine which metaslab is
3112		 * considered "best".
3113		 */
3114		if (i > max_idx)
3115			continue;
3116
3117		if (segments != 0) {
3118			WEIGHT_SET_COUNT(weight, segments);
3119			WEIGHT_SET_INDEX(weight, i);
3120			WEIGHT_SET_ACTIVE(weight, 0);
3121			break;
3122		}
3123	}
3124	return (weight);
3125}
3126
3127/*
3128 * Calculate the weight based on the on-disk histogram. Should be applied
3129 * only to unloaded metaslabs  (i.e no incoming allocations) in-order to
3130 * give results consistent with the on-disk state
3131 */
3132static uint64_t
3133metaslab_weight_from_spacemap(metaslab_t *msp)
3134{
3135	space_map_t *sm = msp->ms_sm;
3136	ASSERT(!msp->ms_loaded);
3137	ASSERT(sm != NULL);
3138	ASSERT3U(space_map_object(sm), !=, 0);
3139	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
3140
3141	/*
3142	 * Create a joint histogram from all the segments that have made
3143	 * it to the metaslab's space map histogram, that are not yet
3144	 * available for allocation because they are still in the freeing
3145	 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
3146	 * these segments from the space map's histogram to get a more
3147	 * accurate weight.
3148	 */
3149	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
3150	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
3151		deferspace_histogram[i] += msp->ms_synchist[i];
3152	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3153		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
3154			deferspace_histogram[i] += msp->ms_deferhist[t][i];
3155		}
3156	}
3157
3158	uint64_t weight = 0;
3159	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
3160		ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
3161		    deferspace_histogram[i]);
3162		uint64_t count =
3163		    sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
3164		if (count != 0) {
3165			WEIGHT_SET_COUNT(weight, count);
3166			WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
3167			WEIGHT_SET_ACTIVE(weight, 0);
3168			break;
3169		}
3170	}
3171	return (weight);
3172}
3173
3174/*
3175 * Compute a segment-based weight for the specified metaslab. The weight
3176 * is determined by highest bucket in the histogram. The information
3177 * for the highest bucket is encoded into the weight value.
3178 */
3179static uint64_t
3180metaslab_segment_weight(metaslab_t *msp)
3181{
3182	metaslab_group_t *mg = msp->ms_group;
3183	uint64_t weight = 0;
3184	uint8_t shift = mg->mg_vd->vdev_ashift;
3185
3186	ASSERT(MUTEX_HELD(&msp->ms_lock));
3187
3188	/*
3189	 * The metaslab is completely free.
3190	 */
3191	if (metaslab_allocated_space(msp) == 0) {
3192		int idx = highbit64(msp->ms_size) - 1;
3193		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
3194
3195		if (idx < max_idx) {
3196			WEIGHT_SET_COUNT(weight, 1ULL);
3197			WEIGHT_SET_INDEX(weight, idx);
3198		} else {
3199			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
3200			WEIGHT_SET_INDEX(weight, max_idx);
3201		}
3202		WEIGHT_SET_ACTIVE(weight, 0);
3203		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
3204		return (weight);
3205	}
3206
3207	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
3208
3209	/*
3210	 * If the metaslab is fully allocated then just make the weight 0.
3211	 */
3212	if (metaslab_allocated_space(msp) == msp->ms_size)
3213		return (0);
3214	/*
3215	 * If the metaslab is already loaded, then use the range tree to
3216	 * determine the weight. Otherwise, we rely on the space map information
3217	 * to generate the weight.
3218	 */
3219	if (msp->ms_loaded) {
3220		weight = metaslab_weight_from_range_tree(msp);
3221	} else {
3222		weight = metaslab_weight_from_spacemap(msp);
3223	}
3224
3225	/*
3226	 * If the metaslab was active the last time we calculated its weight
3227	 * then keep it active. We want to consume the entire region that
3228	 * is associated with this weight.
3229	 */
3230	if (msp->ms_activation_weight != 0 && weight != 0)
3231		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
3232	return (weight);
3233}
3234
3235/*
3236 * Determine if we should attempt to allocate from this metaslab. If the
3237 * metaslab is loaded, then we can determine if the desired allocation
3238 * can be satisfied by looking at the size of the maximum free segment
3239 * on that metaslab. Otherwise, we make our decision based on the metaslab's
3240 * weight. For segment-based weighting we can determine the maximum
3241 * allocation based on the index encoded in its value. For space-based
3242 * weights we rely on the entire weight (excluding the weight-type bit).
3243 */
3244static boolean_t
3245metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
3246{
3247	/*
3248	 * This case will usually but not always get caught by the checks below;
3249	 * metaslabs can be loaded by various means, including the trim and
3250	 * initialize code. Once that happens, without this check they are
3251	 * allocatable even before they finish their first txg sync.
3252	 */
3253	if (unlikely(msp->ms_new))
3254		return (B_FALSE);
3255
3256	/*
3257	 * If the metaslab is loaded, ms_max_size is definitive and we can use
3258	 * the fast check. If it's not, the ms_max_size is a lower bound (once
3259	 * set), and we should use the fast check as long as we're not in
3260	 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
3261	 * seconds since the metaslab was unloaded.
3262	 */
3263	if (msp->ms_loaded ||
3264	    (msp->ms_max_size != 0 && !try_hard && gethrtime() <
3265	    msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
3266		return (msp->ms_max_size >= asize);
3267
3268	boolean_t should_allocate;
3269	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
3270		/*
3271		 * The metaslab segment weight indicates segments in the
3272		 * range [2^i, 2^(i+1)), where i is the index in the weight.
3273		 * Since the asize might be in the middle of the range, we
3274		 * should attempt the allocation if asize < 2^(i+1).
3275		 */
3276		should_allocate = (asize <
3277		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
3278	} else {
3279		should_allocate = (asize <=
3280		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
3281	}
3282
3283	return (should_allocate);
3284}
3285
3286static uint64_t
3287metaslab_weight(metaslab_t *msp, boolean_t nodirty)
3288{
3289	vdev_t *vd = msp->ms_group->mg_vd;
3290	spa_t *spa = vd->vdev_spa;
3291	uint64_t weight;
3292
3293	ASSERT(MUTEX_HELD(&msp->ms_lock));
3294
3295	metaslab_set_fragmentation(msp, nodirty);
3296
3297	/*
3298	 * Update the maximum size. If the metaslab is loaded, this will
3299	 * ensure that we get an accurate maximum size if newly freed space
3300	 * has been added back into the free tree. If the metaslab is
3301	 * unloaded, we check if there's a larger free segment in the
3302	 * unflushed frees. This is a lower bound on the largest allocatable
3303	 * segment size. Coalescing of adjacent entries may reveal larger
3304	 * allocatable segments, but we aren't aware of those until loading
3305	 * the space map into a range tree.
3306	 */
3307	if (msp->ms_loaded) {
3308		msp->ms_max_size = metaslab_largest_allocatable(msp);
3309	} else {
3310		msp->ms_max_size = MAX(msp->ms_max_size,
3311		    metaslab_largest_unflushed_free(msp));
3312	}
3313
3314	/*
3315	 * Segment-based weighting requires space map histogram support.
3316	 */
3317	if (zfs_metaslab_segment_weight_enabled &&
3318	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
3319	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
3320	    sizeof (space_map_phys_t))) {
3321		weight = metaslab_segment_weight(msp);
3322	} else {
3323		weight = metaslab_space_weight(msp);
3324	}
3325	return (weight);
3326}
3327
3328void
3329metaslab_recalculate_weight_and_sort(metaslab_t *msp)
3330{
3331	ASSERT(MUTEX_HELD(&msp->ms_lock));
3332
3333	/* note: we preserve the mask (e.g. indication of primary, etc..) */
3334	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
3335	metaslab_group_sort(msp->ms_group, msp,
3336	    metaslab_weight(msp, B_FALSE) | was_active);
3337}
3338
3339static int
3340metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3341    int allocator, uint64_t activation_weight)
3342{
3343	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
3344	ASSERT(MUTEX_HELD(&msp->ms_lock));
3345
3346	/*
3347	 * If we're activating for the claim code, we don't want to actually
3348	 * set the metaslab up for a specific allocator.
3349	 */
3350	if (activation_weight == METASLAB_WEIGHT_CLAIM) {
3351		ASSERT0(msp->ms_activation_weight);
3352		msp->ms_activation_weight = msp->ms_weight;
3353		metaslab_group_sort(mg, msp, msp->ms_weight |
3354		    activation_weight);
3355		return (0);
3356	}
3357
3358	metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
3359	    &mga->mga_primary : &mga->mga_secondary);
3360
3361	mutex_enter(&mg->mg_lock);
3362	if (*mspp != NULL) {
3363		mutex_exit(&mg->mg_lock);
3364		return (EEXIST);
3365	}
3366
3367	*mspp = msp;
3368	ASSERT3S(msp->ms_allocator, ==, -1);
3369	msp->ms_allocator = allocator;
3370	msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
3371
3372	ASSERT0(msp->ms_activation_weight);
3373	msp->ms_activation_weight = msp->ms_weight;
3374	metaslab_group_sort_impl(mg, msp,
3375	    msp->ms_weight | activation_weight);
3376	mutex_exit(&mg->mg_lock);
3377
3378	return (0);
3379}
3380
3381static int
3382metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
3383{
3384	ASSERT(MUTEX_HELD(&msp->ms_lock));
3385
3386	/*
3387	 * The current metaslab is already activated for us so there
3388	 * is nothing to do. Already activated though, doesn't mean
3389	 * that this metaslab is activated for our allocator nor our
3390	 * requested activation weight. The metaslab could have started
3391	 * as an active one for our allocator but changed allocators
3392	 * while we were waiting to grab its ms_lock or we stole it
3393	 * [see find_valid_metaslab()]. This means that there is a
3394	 * possibility of passivating a metaslab of another allocator
3395	 * or from a different activation mask, from this thread.
3396	 */
3397	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3398		ASSERT(msp->ms_loaded);
3399		return (0);
3400	}
3401
3402	int error = metaslab_load(msp);
3403	if (error != 0) {
3404		metaslab_group_sort(msp->ms_group, msp, 0);
3405		return (error);
3406	}
3407
3408	/*
3409	 * When entering metaslab_load() we may have dropped the
3410	 * ms_lock because we were loading this metaslab, or we
3411	 * were waiting for another thread to load it for us. In
3412	 * that scenario, we recheck the weight of the metaslab
3413	 * to see if it was activated by another thread.
3414	 *
3415	 * If the metaslab was activated for another allocator or
3416	 * it was activated with a different activation weight (e.g.
3417	 * we wanted to make it a primary but it was activated as
3418	 * secondary) we return error (EBUSY).
3419	 *
3420	 * If the metaslab was activated for the same allocator
3421	 * and requested activation mask, skip activating it.
3422	 */
3423	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3424		if (msp->ms_allocator != allocator)
3425			return (EBUSY);
3426
3427		if ((msp->ms_weight & activation_weight) == 0)
3428			return (SET_ERROR(EBUSY));
3429
3430		EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
3431		    msp->ms_primary);
3432		return (0);
3433	}
3434
3435	/*
3436	 * If the metaslab has literally 0 space, it will have weight 0. In
3437	 * that case, don't bother activating it. This can happen if the
3438	 * metaslab had space during find_valid_metaslab, but another thread
3439	 * loaded it and used all that space while we were waiting to grab the
3440	 * lock.
3441	 */
3442	if (msp->ms_weight == 0) {
3443		ASSERT0(range_tree_space(msp->ms_allocatable));
3444		return (SET_ERROR(ENOSPC));
3445	}
3446
3447	if ((error = metaslab_activate_allocator(msp->ms_group, msp,
3448	    allocator, activation_weight)) != 0) {
3449		return (error);
3450	}
3451
3452	ASSERT(msp->ms_loaded);
3453	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
3454
3455	return (0);
3456}
3457
3458static void
3459metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3460    uint64_t weight)
3461{
3462	ASSERT(MUTEX_HELD(&msp->ms_lock));
3463	ASSERT(msp->ms_loaded);
3464
3465	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
3466		metaslab_group_sort(mg, msp, weight);
3467		return;
3468	}
3469
3470	mutex_enter(&mg->mg_lock);
3471	ASSERT3P(msp->ms_group, ==, mg);
3472	ASSERT3S(0, <=, msp->ms_allocator);
3473	ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
3474
3475	metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
3476	if (msp->ms_primary) {
3477		ASSERT3P(mga->mga_primary, ==, msp);
3478		ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
3479		mga->mga_primary = NULL;
3480	} else {
3481		ASSERT3P(mga->mga_secondary, ==, msp);
3482		ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
3483		mga->mga_secondary = NULL;
3484	}
3485	msp->ms_allocator = -1;
3486	metaslab_group_sort_impl(mg, msp, weight);
3487	mutex_exit(&mg->mg_lock);
3488}
3489
3490static void
3491metaslab_passivate(metaslab_t *msp, uint64_t weight)
3492{
3493	uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE;
3494
3495	/*
3496	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
3497	 * this metaslab again.  In that case, it had better be empty,
3498	 * or we would be leaving space on the table.
3499	 */
3500	ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
3501	    size >= SPA_MINBLOCKSIZE ||
3502	    range_tree_space(msp->ms_allocatable) == 0);
3503	ASSERT0(weight & METASLAB_ACTIVE_MASK);
3504
3505	ASSERT(msp->ms_activation_weight != 0);
3506	msp->ms_activation_weight = 0;
3507	metaslab_passivate_allocator(msp->ms_group, msp, weight);
3508	ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
3509}
3510
3511/*
3512 * Segment-based metaslabs are activated once and remain active until
3513 * we either fail an allocation attempt (similar to space-based metaslabs)
3514 * or have exhausted the free space in zfs_metaslab_switch_threshold
3515 * buckets since the metaslab was activated. This function checks to see
3516 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
3517 * metaslab and passivates it proactively. This will allow us to select a
3518 * metaslab with a larger contiguous region, if any, remaining within this
3519 * metaslab group. If we're in sync pass > 1, then we continue using this
3520 * metaslab so that we don't dirty more block and cause more sync passes.
3521 */
3522static void
3523metaslab_segment_may_passivate(metaslab_t *msp)
3524{
3525	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3526
3527	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
3528		return;
3529
3530	/*
3531	 * Since we are in the middle of a sync pass, the most accurate
3532	 * information that is accessible to us is the in-core range tree
3533	 * histogram; calculate the new weight based on that information.
3534	 */
3535	uint64_t weight = metaslab_weight_from_range_tree(msp);
3536	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
3537	int current_idx = WEIGHT_GET_INDEX(weight);
3538
3539	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
3540		metaslab_passivate(msp, weight);
3541}
3542
3543static void
3544metaslab_preload(void *arg)
3545{
3546	metaslab_t *msp = arg;
3547	metaslab_class_t *mc = msp->ms_group->mg_class;
3548	spa_t *spa = mc->mc_spa;
3549	fstrans_cookie_t cookie = spl_fstrans_mark();
3550
3551	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
3552
3553	mutex_enter(&msp->ms_lock);
3554	(void) metaslab_load(msp);
3555	metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
3556	mutex_exit(&msp->ms_lock);
3557	spl_fstrans_unmark(cookie);
3558}
3559
3560static void
3561metaslab_group_preload(metaslab_group_t *mg)
3562{
3563	spa_t *spa = mg->mg_vd->vdev_spa;
3564	metaslab_t *msp;
3565	avl_tree_t *t = &mg->mg_metaslab_tree;
3566	int m = 0;
3567
3568	if (spa_shutting_down(spa) || !metaslab_preload_enabled)
3569		return;
3570
3571	mutex_enter(&mg->mg_lock);
3572
3573	/*
3574	 * Load the next potential metaslabs
3575	 */
3576	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
3577		ASSERT3P(msp->ms_group, ==, mg);
3578
3579		/*
3580		 * We preload only the maximum number of metaslabs specified
3581		 * by metaslab_preload_limit. If a metaslab is being forced
3582		 * to condense then we preload it too. This will ensure
3583		 * that force condensing happens in the next txg.
3584		 */
3585		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
3586			continue;
3587		}
3588
3589		VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload,
3590		    msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0))
3591		    != TASKQID_INVALID);
3592	}
3593	mutex_exit(&mg->mg_lock);
3594}
3595
3596/*
3597 * Determine if the space map's on-disk footprint is past our tolerance for
3598 * inefficiency. We would like to use the following criteria to make our
3599 * decision:
3600 *
3601 * 1. Do not condense if the size of the space map object would dramatically
3602 *    increase as a result of writing out the free space range tree.
3603 *
3604 * 2. Condense if the on on-disk space map representation is at least
3605 *    zfs_condense_pct/100 times the size of the optimal representation
3606 *    (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
3607 *
3608 * 3. Do not condense if the on-disk size of the space map does not actually
3609 *    decrease.
3610 *
3611 * Unfortunately, we cannot compute the on-disk size of the space map in this
3612 * context because we cannot accurately compute the effects of compression, etc.
3613 * Instead, we apply the heuristic described in the block comment for
3614 * zfs_metaslab_condense_block_threshold - we only condense if the space used
3615 * is greater than a threshold number of blocks.
3616 */
3617static boolean_t
3618metaslab_should_condense(metaslab_t *msp)
3619{
3620	space_map_t *sm = msp->ms_sm;
3621	vdev_t *vd = msp->ms_group->mg_vd;
3622	uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift;
3623
3624	ASSERT(MUTEX_HELD(&msp->ms_lock));
3625	ASSERT(msp->ms_loaded);
3626	ASSERT(sm != NULL);
3627	ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
3628
3629	/*
3630	 * We always condense metaslabs that are empty and metaslabs for
3631	 * which a condense request has been made.
3632	 */
3633	if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
3634	    msp->ms_condense_wanted)
3635		return (B_TRUE);
3636
3637	uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
3638	uint64_t object_size = space_map_length(sm);
3639	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
3640	    msp->ms_allocatable, SM_NO_VDEVID);
3641
3642	return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
3643	    object_size > zfs_metaslab_condense_block_threshold * record_size);
3644}
3645
3646/*
3647 * Condense the on-disk space map representation to its minimized form.
3648 * The minimized form consists of a small number of allocations followed
3649 * by the entries of the free range tree (ms_allocatable). The condensed
3650 * spacemap contains all the entries of previous TXGs (including those in
3651 * the pool-wide log spacemaps; thus this is effectively a superset of
3652 * metaslab_flush()), but this TXG's entries still need to be written.
3653 */
3654static void
3655metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
3656{
3657	range_tree_t *condense_tree;
3658	space_map_t *sm = msp->ms_sm;
3659	uint64_t txg = dmu_tx_get_txg(tx);
3660	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3661
3662	ASSERT(MUTEX_HELD(&msp->ms_lock));
3663	ASSERT(msp->ms_loaded);
3664	ASSERT(msp->ms_sm != NULL);
3665
3666	/*
3667	 * In order to condense the space map, we need to change it so it
3668	 * only describes which segments are currently allocated and free.
3669	 *
3670	 * All the current free space resides in the ms_allocatable, all
3671	 * the ms_defer trees, and all the ms_allocating trees. We ignore
3672	 * ms_freed because it is empty because we're in sync pass 1. We
3673	 * ignore ms_freeing because these changes are not yet reflected
3674	 * in the spacemap (they will be written later this txg).
3675	 *
3676	 * So to truncate the space map to represent all the entries of
3677	 * previous TXGs we do the following:
3678	 *
3679	 * 1] We create a range tree (condense tree) that is 100% empty.
3680	 * 2] We add to it all segments found in the ms_defer trees
3681	 *    as those segments are marked as free in the original space
3682	 *    map. We do the same with the ms_allocating trees for the same
3683	 *    reason. Adding these segments should be a relatively
3684	 *    inexpensive operation since we expect these trees to have a
3685	 *    small number of nodes.
3686	 * 3] We vacate any unflushed allocs, since they are not frees we
3687	 *    need to add to the condense tree. Then we vacate any
3688	 *    unflushed frees as they should already be part of ms_allocatable.
3689	 * 4] At this point, we would ideally like to add all segments
3690	 *    in the ms_allocatable tree from the condense tree. This way
3691	 *    we would write all the entries of the condense tree as the
3692	 *    condensed space map, which would only contain freed
3693	 *    segments with everything else assumed to be allocated.
3694	 *
3695	 *    Doing so can be prohibitively expensive as ms_allocatable can
3696	 *    be large, and therefore computationally expensive to add to
3697	 *    the condense_tree. Instead we first sync out an entry marking
3698	 *    everything as allocated, then the condense_tree and then the
3699	 *    ms_allocatable, in the condensed space map. While this is not
3700	 *    optimal, it is typically close to optimal and more importantly
3701	 *    much cheaper to compute.
3702	 *
3703	 * 5] Finally, as both of the unflushed trees were written to our
3704	 *    new and condensed metaslab space map, we basically flushed
3705	 *    all the unflushed changes to disk, thus we call
3706	 *    metaslab_flush_update().
3707	 */
3708	ASSERT3U(spa_sync_pass(spa), ==, 1);
3709	ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
3710
3711	zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
3712	    "spa %s, smp size %llu, segments %llu, forcing condense=%s",
3713	    (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp,
3714	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
3715	    spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm),
3716	    (u_longlong_t)range_tree_numsegs(msp->ms_allocatable),
3717	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
3718
3719	msp->ms_condense_wanted = B_FALSE;
3720
3721	range_seg_type_t type;
3722	uint64_t shift, start;
3723	type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
3724	    &start, &shift);
3725
3726	condense_tree = range_tree_create(NULL, type, NULL, start, shift);
3727
3728	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3729		range_tree_walk(msp->ms_defer[t],
3730		    range_tree_add, condense_tree);
3731	}
3732
3733	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
3734		range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
3735		    range_tree_add, condense_tree);
3736	}
3737
3738	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3739	    metaslab_unflushed_changes_memused(msp));
3740	spa->spa_unflushed_stats.sus_memused -=
3741	    metaslab_unflushed_changes_memused(msp);
3742	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3743	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3744
3745	/*
3746	 * We're about to drop the metaslab's lock thus allowing other
3747	 * consumers to change it's content. Set the metaslab's ms_condensing
3748	 * flag to ensure that allocations on this metaslab do not occur
3749	 * while we're in the middle of committing it to disk. This is only
3750	 * critical for ms_allocatable as all other range trees use per TXG
3751	 * views of their content.
3752	 */
3753	msp->ms_condensing = B_TRUE;
3754
3755	mutex_exit(&msp->ms_lock);
3756	uint64_t object = space_map_object(msp->ms_sm);
3757	space_map_truncate(sm,
3758	    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3759	    zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
3760
3761	/*
3762	 * space_map_truncate() may have reallocated the spacemap object.
3763	 * If so, update the vdev_ms_array.
3764	 */
3765	if (space_map_object(msp->ms_sm) != object) {
3766		object = space_map_object(msp->ms_sm);
3767		dmu_write(spa->spa_meta_objset,
3768		    msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
3769		    msp->ms_id, sizeof (uint64_t), &object, tx);
3770	}
3771
3772	/*
3773	 * Note:
3774	 * When the log space map feature is enabled, each space map will
3775	 * always have ALLOCS followed by FREES for each sync pass. This is
3776	 * typically true even when the log space map feature is disabled,
3777	 * except from the case where a metaslab goes through metaslab_sync()
3778	 * and gets condensed. In that case the metaslab's space map will have
3779	 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
3780	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
3781	 * sync pass 1.
3782	 */
3783	range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
3784	    shift);
3785	range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
3786	space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
3787	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
3788	space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
3789
3790	range_tree_vacate(condense_tree, NULL, NULL);
3791	range_tree_destroy(condense_tree);
3792	range_tree_vacate(tmp_tree, NULL, NULL);
3793	range_tree_destroy(tmp_tree);
3794	mutex_enter(&msp->ms_lock);
3795
3796	msp->ms_condensing = B_FALSE;
3797	metaslab_flush_update(msp, tx);
3798}
3799
3800static void
3801metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
3802{
3803	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3804	ASSERT(spa_syncing_log_sm(spa) != NULL);
3805	ASSERT(msp->ms_sm != NULL);
3806	ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3807	ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3808
3809	mutex_enter(&spa->spa_flushed_ms_lock);
3810	metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3811	metaslab_set_unflushed_dirty(msp, B_TRUE);
3812	avl_add(&spa->spa_metaslabs_by_flushed, msp);
3813	mutex_exit(&spa->spa_flushed_ms_lock);
3814
3815	spa_log_sm_increment_current_mscount(spa);
3816	spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
3817}
3818
3819void
3820metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
3821{
3822	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3823	ASSERT(spa_syncing_log_sm(spa) != NULL);
3824	ASSERT(msp->ms_sm != NULL);
3825	ASSERT(metaslab_unflushed_txg(msp) != 0);
3826	ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
3827	ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3828	ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3829
3830	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
3831
3832	/* update metaslab's position in our flushing tree */
3833	uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
3834	boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
3835	mutex_enter(&spa->spa_flushed_ms_lock);
3836	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
3837	metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3838	metaslab_set_unflushed_dirty(msp, dirty);
3839	avl_add(&spa->spa_metaslabs_by_flushed, msp);
3840	mutex_exit(&spa->spa_flushed_ms_lock);
3841
3842	/* update metaslab counts of spa_log_sm_t nodes */
3843	spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
3844	spa_log_sm_increment_current_mscount(spa);
3845
3846	/* update log space map summary */
3847	spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
3848	    ms_prev_flushed_dirty);
3849	spa_log_summary_add_flushed_metaslab(spa, dirty);
3850
3851	/* cleanup obsolete logs if any */
3852	spa_cleanup_old_sm_logs(spa, tx);
3853}
3854
3855/*
3856 * Called when the metaslab has been flushed (its own spacemap now reflects
3857 * all the contents of the pool-wide spacemap log). Updates the metaslab's
3858 * metadata and any pool-wide related log space map data (e.g. summary,
3859 * obsolete logs, etc..) to reflect that.
3860 */
3861static void
3862metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
3863{
3864	metaslab_group_t *mg = msp->ms_group;
3865	spa_t *spa = mg->mg_vd->vdev_spa;
3866
3867	ASSERT(MUTEX_HELD(&msp->ms_lock));
3868
3869	ASSERT3U(spa_sync_pass(spa), ==, 1);
3870
3871	/*
3872	 * Just because a metaslab got flushed, that doesn't mean that
3873	 * it will pass through metaslab_sync_done(). Thus, make sure to
3874	 * update ms_synced_length here in case it doesn't.
3875	 */
3876	msp->ms_synced_length = space_map_length(msp->ms_sm);
3877
3878	/*
3879	 * We may end up here from metaslab_condense() without the
3880	 * feature being active. In that case this is a no-op.
3881	 */
3882	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) ||
3883	    metaslab_unflushed_txg(msp) == 0)
3884		return;
3885
3886	metaslab_unflushed_bump(msp, tx, B_FALSE);
3887}
3888
3889boolean_t
3890metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
3891{
3892	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3893
3894	ASSERT(MUTEX_HELD(&msp->ms_lock));
3895	ASSERT3U(spa_sync_pass(spa), ==, 1);
3896	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
3897
3898	ASSERT(msp->ms_sm != NULL);
3899	ASSERT(metaslab_unflushed_txg(msp) != 0);
3900	ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
3901
3902	/*
3903	 * There is nothing wrong with flushing the same metaslab twice, as
3904	 * this codepath should work on that case. However, the current
3905	 * flushing scheme makes sure to avoid this situation as we would be
3906	 * making all these calls without having anything meaningful to write
3907	 * to disk. We assert this behavior here.
3908	 */
3909	ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
3910
3911	/*
3912	 * We can not flush while loading, because then we would
3913	 * not load the ms_unflushed_{allocs,frees}.
3914	 */
3915	if (msp->ms_loading)
3916		return (B_FALSE);
3917
3918	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3919	metaslab_verify_weight_and_frag(msp);
3920
3921	/*
3922	 * Metaslab condensing is effectively flushing. Therefore if the
3923	 * metaslab can be condensed we can just condense it instead of
3924	 * flushing it.
3925	 *
3926	 * Note that metaslab_condense() does call metaslab_flush_update()
3927	 * so we can just return immediately after condensing. We also
3928	 * don't need to care about setting ms_flushing or broadcasting
3929	 * ms_flush_cv, even if we temporarily drop the ms_lock in
3930	 * metaslab_condense(), as the metaslab is already loaded.
3931	 */
3932	if (msp->ms_loaded && metaslab_should_condense(msp)) {
3933		metaslab_group_t *mg = msp->ms_group;
3934
3935		/*
3936		 * For all histogram operations below refer to the
3937		 * comments of metaslab_sync() where we follow a
3938		 * similar procedure.
3939		 */
3940		metaslab_group_histogram_verify(mg);
3941		metaslab_class_histogram_verify(mg->mg_class);
3942		metaslab_group_histogram_remove(mg, msp);
3943
3944		metaslab_condense(msp, tx);
3945
3946		space_map_histogram_clear(msp->ms_sm);
3947		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
3948		ASSERT(range_tree_is_empty(msp->ms_freed));
3949		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3950			space_map_histogram_add(msp->ms_sm,
3951			    msp->ms_defer[t], tx);
3952		}
3953		metaslab_aux_histograms_update(msp);
3954
3955		metaslab_group_histogram_add(mg, msp);
3956		metaslab_group_histogram_verify(mg);
3957		metaslab_class_histogram_verify(mg->mg_class);
3958
3959		metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3960
3961		/*
3962		 * Since we recreated the histogram (and potentially
3963		 * the ms_sm too while condensing) ensure that the
3964		 * weight is updated too because we are not guaranteed
3965		 * that this metaslab is dirty and will go through
3966		 * metaslab_sync_done().
3967		 */
3968		metaslab_recalculate_weight_and_sort(msp);
3969		return (B_TRUE);
3970	}
3971
3972	msp->ms_flushing = B_TRUE;
3973	uint64_t sm_len_before = space_map_length(msp->ms_sm);
3974
3975	mutex_exit(&msp->ms_lock);
3976	space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
3977	    SM_NO_VDEVID, tx);
3978	space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
3979	    SM_NO_VDEVID, tx);
3980	mutex_enter(&msp->ms_lock);
3981
3982	uint64_t sm_len_after = space_map_length(msp->ms_sm);
3983	if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
3984		zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
3985		    "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
3986		    "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx),
3987		    spa_name(spa),
3988		    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
3989		    (u_longlong_t)msp->ms_id,
3990		    (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
3991		    (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
3992		    (u_longlong_t)(sm_len_after - sm_len_before));
3993	}
3994
3995	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3996	    metaslab_unflushed_changes_memused(msp));
3997	spa->spa_unflushed_stats.sus_memused -=
3998	    metaslab_unflushed_changes_memused(msp);
3999	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
4000	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
4001
4002	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
4003	metaslab_verify_weight_and_frag(msp);
4004
4005	metaslab_flush_update(msp, tx);
4006
4007	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
4008	metaslab_verify_weight_and_frag(msp);
4009
4010	msp->ms_flushing = B_FALSE;
4011	cv_broadcast(&msp->ms_flush_cv);
4012	return (B_TRUE);
4013}
4014
4015/*
4016 * Write a metaslab to disk in the context of the specified transaction group.
4017 */
4018void
4019metaslab_sync(metaslab_t *msp, uint64_t txg)
4020{
4021	metaslab_group_t *mg = msp->ms_group;
4022	vdev_t *vd = mg->mg_vd;
4023	spa_t *spa = vd->vdev_spa;
4024	objset_t *mos = spa_meta_objset(spa);
4025	range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
4026	dmu_tx_t *tx;
4027
4028	ASSERT(!vd->vdev_ishole);
4029
4030	/*
4031	 * This metaslab has just been added so there's no work to do now.
4032	 */
4033	if (msp->ms_new) {
4034		ASSERT0(range_tree_space(alloctree));
4035		ASSERT0(range_tree_space(msp->ms_freeing));
4036		ASSERT0(range_tree_space(msp->ms_freed));
4037		ASSERT0(range_tree_space(msp->ms_checkpointing));
4038		ASSERT0(range_tree_space(msp->ms_trim));
4039		return;
4040	}
4041
4042	/*
4043	 * Normally, we don't want to process a metaslab if there are no
4044	 * allocations or frees to perform. However, if the metaslab is being
4045	 * forced to condense, it's loaded and we're not beyond the final
4046	 * dirty txg, we need to let it through. Not condensing beyond the
4047	 * final dirty txg prevents an issue where metaslabs that need to be
4048	 * condensed but were loaded for other reasons could cause a panic
4049	 * here. By only checking the txg in that branch of the conditional,
4050	 * we preserve the utility of the VERIFY statements in all other
4051	 * cases.
4052	 */
4053	if (range_tree_is_empty(alloctree) &&
4054	    range_tree_is_empty(msp->ms_freeing) &&
4055	    range_tree_is_empty(msp->ms_checkpointing) &&
4056	    !(msp->ms_loaded && msp->ms_condense_wanted &&
4057	    txg <= spa_final_dirty_txg(spa)))
4058		return;
4059
4060
4061	VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
4062
4063	/*
4064	 * The only state that can actually be changing concurrently
4065	 * with metaslab_sync() is the metaslab's ms_allocatable. No
4066	 * other thread can be modifying this txg's alloc, freeing,
4067	 * freed, or space_map_phys_t.  We drop ms_lock whenever we
4068	 * could call into the DMU, because the DMU can call down to
4069	 * us (e.g. via zio_free()) at any time.
4070	 *
4071	 * The spa_vdev_remove_thread() can be reading metaslab state
4072	 * concurrently, and it is locked out by the ms_sync_lock.
4073	 * Note that the ms_lock is insufficient for this, because it
4074	 * is dropped by space_map_write().
4075	 */
4076	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
4077
4078	/*
4079	 * Generate a log space map if one doesn't exist already.
4080	 */
4081	spa_generate_syncing_log_sm(spa, tx);
4082
4083	if (msp->ms_sm == NULL) {
4084		uint64_t new_object = space_map_alloc(mos,
4085		    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
4086		    zfs_metaslab_sm_blksz_with_log :
4087		    zfs_metaslab_sm_blksz_no_log, tx);
4088		VERIFY3U(new_object, !=, 0);
4089
4090		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
4091		    msp->ms_id, sizeof (uint64_t), &new_object, tx);
4092
4093		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
4094		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
4095		ASSERT(msp->ms_sm != NULL);
4096
4097		ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
4098		ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
4099		ASSERT0(metaslab_allocated_space(msp));
4100	}
4101
4102	if (!range_tree_is_empty(msp->ms_checkpointing) &&
4103	    vd->vdev_checkpoint_sm == NULL) {
4104		ASSERT(spa_has_checkpoint(spa));
4105
4106		uint64_t new_object = space_map_alloc(mos,
4107		    zfs_vdev_standard_sm_blksz, tx);
4108		VERIFY3U(new_object, !=, 0);
4109
4110		VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
4111		    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
4112		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
4113
4114		/*
4115		 * We save the space map object as an entry in vdev_top_zap
4116		 * so it can be retrieved when the pool is reopened after an
4117		 * export or through zdb.
4118		 */
4119		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
4120		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4121		    sizeof (new_object), 1, &new_object, tx));
4122	}
4123
4124	mutex_enter(&msp->ms_sync_lock);
4125	mutex_enter(&msp->ms_lock);
4126
4127	/*
4128	 * Note: metaslab_condense() clears the space map's histogram.
4129	 * Therefore we must verify and remove this histogram before
4130	 * condensing.
4131	 */
4132	metaslab_group_histogram_verify(mg);
4133	metaslab_class_histogram_verify(mg->mg_class);
4134	metaslab_group_histogram_remove(mg, msp);
4135
4136	if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
4137	    metaslab_should_condense(msp))
4138		metaslab_condense(msp, tx);
4139
4140	/*
4141	 * We'll be going to disk to sync our space accounting, thus we
4142	 * drop the ms_lock during that time so allocations coming from
4143	 * open-context (ZIL) for future TXGs do not block.
4144	 */
4145	mutex_exit(&msp->ms_lock);
4146	space_map_t *log_sm = spa_syncing_log_sm(spa);
4147	if (log_sm != NULL) {
4148		ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
4149		if (metaslab_unflushed_txg(msp) == 0)
4150			metaslab_unflushed_add(msp, tx);
4151		else if (!metaslab_unflushed_dirty(msp))
4152			metaslab_unflushed_bump(msp, tx, B_TRUE);
4153
4154		space_map_write(log_sm, alloctree, SM_ALLOC,
4155		    vd->vdev_id, tx);
4156		space_map_write(log_sm, msp->ms_freeing, SM_FREE,
4157		    vd->vdev_id, tx);
4158		mutex_enter(&msp->ms_lock);
4159
4160		ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
4161		    metaslab_unflushed_changes_memused(msp));
4162		spa->spa_unflushed_stats.sus_memused -=
4163		    metaslab_unflushed_changes_memused(msp);
4164		range_tree_remove_xor_add(alloctree,
4165		    msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
4166		range_tree_remove_xor_add(msp->ms_freeing,
4167		    msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
4168		spa->spa_unflushed_stats.sus_memused +=
4169		    metaslab_unflushed_changes_memused(msp);
4170	} else {
4171		ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
4172
4173		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
4174		    SM_NO_VDEVID, tx);
4175		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
4176		    SM_NO_VDEVID, tx);
4177		mutex_enter(&msp->ms_lock);
4178	}
4179
4180	msp->ms_allocated_space += range_tree_space(alloctree);
4181	ASSERT3U(msp->ms_allocated_space, >=,
4182	    range_tree_space(msp->ms_freeing));
4183	msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
4184
4185	if (!range_tree_is_empty(msp->ms_checkpointing)) {
4186		ASSERT(spa_has_checkpoint(spa));
4187		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
4188
4189		/*
4190		 * Since we are doing writes to disk and the ms_checkpointing
4191		 * tree won't be changing during that time, we drop the
4192		 * ms_lock while writing to the checkpoint space map, for the
4193		 * same reason mentioned above.
4194		 */
4195		mutex_exit(&msp->ms_lock);
4196		space_map_write(vd->vdev_checkpoint_sm,
4197		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
4198		mutex_enter(&msp->ms_lock);
4199
4200		spa->spa_checkpoint_info.sci_dspace +=
4201		    range_tree_space(msp->ms_checkpointing);
4202		vd->vdev_stat.vs_checkpoint_space +=
4203		    range_tree_space(msp->ms_checkpointing);
4204		ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
4205		    -space_map_allocated(vd->vdev_checkpoint_sm));
4206
4207		range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
4208	}
4209
4210	if (msp->ms_loaded) {
4211		/*
4212		 * When the space map is loaded, we have an accurate
4213		 * histogram in the range tree. This gives us an opportunity
4214		 * to bring the space map's histogram up-to-date so we clear
4215		 * it first before updating it.
4216		 */
4217		space_map_histogram_clear(msp->ms_sm);
4218		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
4219
4220		/*
4221		 * Since we've cleared the histogram we need to add back
4222		 * any free space that has already been processed, plus
4223		 * any deferred space. This allows the on-disk histogram
4224		 * to accurately reflect all free space even if some space
4225		 * is not yet available for allocation (i.e. deferred).
4226		 */
4227		space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
4228
4229		/*
4230		 * Add back any deferred free space that has not been
4231		 * added back into the in-core free tree yet. This will
4232		 * ensure that we don't end up with a space map histogram
4233		 * that is completely empty unless the metaslab is fully
4234		 * allocated.
4235		 */
4236		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
4237			space_map_histogram_add(msp->ms_sm,
4238			    msp->ms_defer[t], tx);
4239		}
4240	}
4241
4242	/*
4243	 * Always add the free space from this sync pass to the space
4244	 * map histogram. We want to make sure that the on-disk histogram
4245	 * accounts for all free space. If the space map is not loaded,
4246	 * then we will lose some accuracy but will correct it the next
4247	 * time we load the space map.
4248	 */
4249	space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
4250	metaslab_aux_histograms_update(msp);
4251
4252	metaslab_group_histogram_add(mg, msp);
4253	metaslab_group_histogram_verify(mg);
4254	metaslab_class_histogram_verify(mg->mg_class);
4255
4256	/*
4257	 * For sync pass 1, we avoid traversing this txg's free range tree
4258	 * and instead will just swap the pointers for freeing and freed.
4259	 * We can safely do this since the freed_tree is guaranteed to be
4260	 * empty on the initial pass.
4261	 *
4262	 * Keep in mind that even if we are currently using a log spacemap
4263	 * we want current frees to end up in the ms_allocatable (but not
4264	 * get appended to the ms_sm) so their ranges can be reused as usual.
4265	 */
4266	if (spa_sync_pass(spa) == 1) {
4267		range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
4268		ASSERT0(msp->ms_allocated_this_txg);
4269	} else {
4270		range_tree_vacate(msp->ms_freeing,
4271		    range_tree_add, msp->ms_freed);
4272	}
4273	msp->ms_allocated_this_txg += range_tree_space(alloctree);
4274	range_tree_vacate(alloctree, NULL, NULL);
4275
4276	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
4277	ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
4278	    & TXG_MASK]));
4279	ASSERT0(range_tree_space(msp->ms_freeing));
4280	ASSERT0(range_tree_space(msp->ms_checkpointing));
4281
4282	mutex_exit(&msp->ms_lock);
4283
4284	/*
4285	 * Verify that the space map object ID has been recorded in the
4286	 * vdev_ms_array.
4287	 */
4288	uint64_t object;
4289	VERIFY0(dmu_read(mos, vd->vdev_ms_array,
4290	    msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
4291	VERIFY3U(object, ==, space_map_object(msp->ms_sm));
4292
4293	mutex_exit(&msp->ms_sync_lock);
4294	dmu_tx_commit(tx);
4295}
4296
4297static void
4298metaslab_evict(metaslab_t *msp, uint64_t txg)
4299{
4300	if (!msp->ms_loaded || msp->ms_disabled != 0)
4301		return;
4302
4303	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
4304		VERIFY0(range_tree_space(
4305		    msp->ms_allocating[(txg + t) & TXG_MASK]));
4306	}
4307	if (msp->ms_allocator != -1)
4308		metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
4309
4310	if (!metaslab_debug_unload)
4311		metaslab_unload(msp);
4312}
4313
4314/*
4315 * Called after a transaction group has completely synced to mark
4316 * all of the metaslab's free space as usable.
4317 */
4318void
4319metaslab_sync_done(metaslab_t *msp, uint64_t txg)
4320{
4321	metaslab_group_t *mg = msp->ms_group;
4322	vdev_t *vd = mg->mg_vd;
4323	spa_t *spa = vd->vdev_spa;
4324	range_tree_t **defer_tree;
4325	int64_t alloc_delta, defer_delta;
4326	boolean_t defer_allowed = B_TRUE;
4327
4328	ASSERT(!vd->vdev_ishole);
4329
4330	mutex_enter(&msp->ms_lock);
4331
4332	if (msp->ms_new) {
4333		/* this is a new metaslab, add its capacity to the vdev */
4334		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
4335
4336		/* there should be no allocations nor frees at this point */
4337		VERIFY0(msp->ms_allocated_this_txg);
4338		VERIFY0(range_tree_space(msp->ms_freed));
4339	}
4340
4341	ASSERT0(range_tree_space(msp->ms_freeing));
4342	ASSERT0(range_tree_space(msp->ms_checkpointing));
4343
4344	defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
4345
4346	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
4347	    metaslab_class_get_alloc(spa_normal_class(spa));
4348	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing ||
4349	    vd->vdev_rz_expanding) {
4350		defer_allowed = B_FALSE;
4351	}
4352
4353	defer_delta = 0;
4354	alloc_delta = msp->ms_allocated_this_txg -
4355	    range_tree_space(msp->ms_freed);
4356
4357	if (defer_allowed) {
4358		defer_delta = range_tree_space(msp->ms_freed) -
4359		    range_tree_space(*defer_tree);
4360	} else {
4361		defer_delta -= range_tree_space(*defer_tree);
4362	}
4363	metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
4364	    defer_delta, 0);
4365
4366	if (spa_syncing_log_sm(spa) == NULL) {
4367		/*
4368		 * If there's a metaslab_load() in progress and we don't have
4369		 * a log space map, it means that we probably wrote to the
4370		 * metaslab's space map. If this is the case, we need to
4371		 * make sure that we wait for the load to complete so that we
4372		 * have a consistent view at the in-core side of the metaslab.
4373		 */
4374		metaslab_load_wait(msp);
4375	} else {
4376		ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
4377	}
4378
4379	/*
4380	 * When auto-trimming is enabled, free ranges which are added to
4381	 * ms_allocatable are also be added to ms_trim.  The ms_trim tree is
4382	 * periodically consumed by the vdev_autotrim_thread() which issues
4383	 * trims for all ranges and then vacates the tree.  The ms_trim tree
4384	 * can be discarded at any time with the sole consequence of recent
4385	 * frees not being trimmed.
4386	 */
4387	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
4388		range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
4389		if (!defer_allowed) {
4390			range_tree_walk(msp->ms_freed, range_tree_add,
4391			    msp->ms_trim);
4392		}
4393	} else {
4394		range_tree_vacate(msp->ms_trim, NULL, NULL);
4395	}
4396
4397	/*
4398	 * Move the frees from the defer_tree back to the free
4399	 * range tree (if it's loaded). Swap the freed_tree and
4400	 * the defer_tree -- this is safe to do because we've
4401	 * just emptied out the defer_tree.
4402	 */
4403	range_tree_vacate(*defer_tree,
4404	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
4405	if (defer_allowed) {
4406		range_tree_swap(&msp->ms_freed, defer_tree);
4407	} else {
4408		range_tree_vacate(msp->ms_freed,
4409		    msp->ms_loaded ? range_tree_add : NULL,
4410		    msp->ms_allocatable);
4411	}
4412
4413	msp->ms_synced_length = space_map_length(msp->ms_sm);
4414
4415	msp->ms_deferspace += defer_delta;
4416	ASSERT3S(msp->ms_deferspace, >=, 0);
4417	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
4418	if (msp->ms_deferspace != 0) {
4419		/*
4420		 * Keep syncing this metaslab until all deferred frees
4421		 * are back in circulation.
4422		 */
4423		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
4424	}
4425	metaslab_aux_histograms_update_done(msp, defer_allowed);
4426
4427	if (msp->ms_new) {
4428		msp->ms_new = B_FALSE;
4429		mutex_enter(&mg->mg_lock);
4430		mg->mg_ms_ready++;
4431		mutex_exit(&mg->mg_lock);
4432	}
4433
4434	/*
4435	 * Re-sort metaslab within its group now that we've adjusted
4436	 * its allocatable space.
4437	 */
4438	metaslab_recalculate_weight_and_sort(msp);
4439
4440	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
4441	ASSERT0(range_tree_space(msp->ms_freeing));
4442	ASSERT0(range_tree_space(msp->ms_freed));
4443	ASSERT0(range_tree_space(msp->ms_checkpointing));
4444	msp->ms_allocating_total -= msp->ms_allocated_this_txg;
4445	msp->ms_allocated_this_txg = 0;
4446	mutex_exit(&msp->ms_lock);
4447}
4448
4449void
4450metaslab_sync_reassess(metaslab_group_t *mg)
4451{
4452	spa_t *spa = mg->mg_class->mc_spa;
4453
4454	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4455	metaslab_group_alloc_update(mg);
4456	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
4457
4458	/*
4459	 * Preload the next potential metaslabs but only on active
4460	 * metaslab groups. We can get into a state where the metaslab
4461	 * is no longer active since we dirty metaslabs as we remove a
4462	 * a device, thus potentially making the metaslab group eligible
4463	 * for preloading.
4464	 */
4465	if (mg->mg_activation_count > 0) {
4466		metaslab_group_preload(mg);
4467	}
4468	spa_config_exit(spa, SCL_ALLOC, FTAG);
4469}
4470
4471/*
4472 * When writing a ditto block (i.e. more than one DVA for a given BP) on
4473 * the same vdev as an existing DVA of this BP, then try to allocate it
4474 * on a different metaslab than existing DVAs (i.e. a unique metaslab).
4475 */
4476static boolean_t
4477metaslab_is_unique(metaslab_t *msp, dva_t *dva)
4478{
4479	uint64_t dva_ms_id;
4480
4481	if (DVA_GET_ASIZE(dva) == 0)
4482		return (B_TRUE);
4483
4484	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
4485		return (B_TRUE);
4486
4487	dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
4488
4489	return (msp->ms_id != dva_ms_id);
4490}
4491
4492/*
4493 * ==========================================================================
4494 * Metaslab allocation tracing facility
4495 * ==========================================================================
4496 */
4497
4498/*
4499 * Add an allocation trace element to the allocation tracing list.
4500 */
4501static void
4502metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
4503    metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
4504    int allocator)
4505{
4506	metaslab_alloc_trace_t *mat;
4507
4508	if (!metaslab_trace_enabled)
4509		return;
4510
4511	/*
4512	 * When the tracing list reaches its maximum we remove
4513	 * the second element in the list before adding a new one.
4514	 * By removing the second element we preserve the original
4515	 * entry as a clue to what allocations steps have already been
4516	 * performed.
4517	 */
4518	if (zal->zal_size == metaslab_trace_max_entries) {
4519		metaslab_alloc_trace_t *mat_next;
4520#ifdef ZFS_DEBUG
4521		panic("too many entries in allocation list");
4522#endif
4523		METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
4524		zal->zal_size--;
4525		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
4526		list_remove(&zal->zal_list, mat_next);
4527		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
4528	}
4529
4530	mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
4531	list_link_init(&mat->mat_list_node);
4532	mat->mat_mg = mg;
4533	mat->mat_msp = msp;
4534	mat->mat_size = psize;
4535	mat->mat_dva_id = dva_id;
4536	mat->mat_offset = offset;
4537	mat->mat_weight = 0;
4538	mat->mat_allocator = allocator;
4539
4540	if (msp != NULL)
4541		mat->mat_weight = msp->ms_weight;
4542
4543	/*
4544	 * The list is part of the zio so locking is not required. Only
4545	 * a single thread will perform allocations for a given zio.
4546	 */
4547	list_insert_tail(&zal->zal_list, mat);
4548	zal->zal_size++;
4549
4550	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
4551}
4552
4553void
4554metaslab_trace_init(zio_alloc_list_t *zal)
4555{
4556	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
4557	    offsetof(metaslab_alloc_trace_t, mat_list_node));
4558	zal->zal_size = 0;
4559}
4560
4561void
4562metaslab_trace_fini(zio_alloc_list_t *zal)
4563{
4564	metaslab_alloc_trace_t *mat;
4565
4566	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
4567		kmem_cache_free(metaslab_alloc_trace_cache, mat);
4568	list_destroy(&zal->zal_list);
4569	zal->zal_size = 0;
4570}
4571
4572/*
4573 * ==========================================================================
4574 * Metaslab block operations
4575 * ==========================================================================
4576 */
4577
4578static void
4579metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag,
4580    int flags, int allocator)
4581{
4582	if (!(flags & METASLAB_ASYNC_ALLOC) ||
4583	    (flags & METASLAB_DONT_THROTTLE))
4584		return;
4585
4586	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4587	if (!mg->mg_class->mc_alloc_throttle_enabled)
4588		return;
4589
4590	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4591	(void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
4592}
4593
4594static void
4595metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
4596{
4597	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4598	metaslab_class_allocator_t *mca =
4599	    &mg->mg_class->mc_allocator[allocator];
4600	uint64_t max = mg->mg_max_alloc_queue_depth;
4601	uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
4602	while (cur < max) {
4603		if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
4604		    cur, cur + 1) == cur) {
4605			atomic_inc_64(&mca->mca_alloc_max_slots);
4606			return;
4607		}
4608		cur = mga->mga_cur_max_alloc_queue_depth;
4609	}
4610}
4611
4612void
4613metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag,
4614    int flags, int allocator, boolean_t io_complete)
4615{
4616	if (!(flags & METASLAB_ASYNC_ALLOC) ||
4617	    (flags & METASLAB_DONT_THROTTLE))
4618		return;
4619
4620	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4621	if (!mg->mg_class->mc_alloc_throttle_enabled)
4622		return;
4623
4624	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4625	(void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
4626	if (io_complete)
4627		metaslab_group_increment_qdepth(mg, allocator);
4628}
4629
4630void
4631metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag,
4632    int allocator)
4633{
4634#ifdef ZFS_DEBUG
4635	const dva_t *dva = bp->blk_dva;
4636	int ndvas = BP_GET_NDVAS(bp);
4637
4638	for (int d = 0; d < ndvas; d++) {
4639		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
4640		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4641		metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4642		VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
4643	}
4644#endif
4645}
4646
4647static uint64_t
4648metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
4649{
4650	uint64_t start;
4651	range_tree_t *rt = msp->ms_allocatable;
4652	metaslab_class_t *mc = msp->ms_group->mg_class;
4653
4654	ASSERT(MUTEX_HELD(&msp->ms_lock));
4655	VERIFY(!msp->ms_condensing);
4656	VERIFY0(msp->ms_disabled);
4657	VERIFY0(msp->ms_new);
4658
4659	start = mc->mc_ops->msop_alloc(msp, size);
4660	if (start != -1ULL) {
4661		metaslab_group_t *mg = msp->ms_group;
4662		vdev_t *vd = mg->mg_vd;
4663
4664		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
4665		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
4666		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
4667		range_tree_remove(rt, start, size);
4668		range_tree_clear(msp->ms_trim, start, size);
4669
4670		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
4671			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
4672
4673		range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
4674		msp->ms_allocating_total += size;
4675
4676		/* Track the last successful allocation */
4677		msp->ms_alloc_txg = txg;
4678		metaslab_verify_space(msp, txg);
4679	}
4680
4681	/*
4682	 * Now that we've attempted the allocation we need to update the
4683	 * metaslab's maximum block size since it may have changed.
4684	 */
4685	msp->ms_max_size = metaslab_largest_allocatable(msp);
4686	return (start);
4687}
4688
4689/*
4690 * Find the metaslab with the highest weight that is less than what we've
4691 * already tried.  In the common case, this means that we will examine each
4692 * metaslab at most once. Note that concurrent callers could reorder metaslabs
4693 * by activation/passivation once we have dropped the mg_lock. If a metaslab is
4694 * activated by another thread, and we fail to allocate from the metaslab we
4695 * have selected, we may not try the newly-activated metaslab, and instead
4696 * activate another metaslab.  This is not optimal, but generally does not cause
4697 * any problems (a possible exception being if every metaslab is completely full
4698 * except for the newly-activated metaslab which we fail to examine).
4699 */
4700static metaslab_t *
4701find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
4702    dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
4703    boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
4704    boolean_t *was_active)
4705{
4706	avl_index_t idx;
4707	avl_tree_t *t = &mg->mg_metaslab_tree;
4708	metaslab_t *msp = avl_find(t, search, &idx);
4709	if (msp == NULL)
4710		msp = avl_nearest(t, idx, AVL_AFTER);
4711
4712	uint_t tries = 0;
4713	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
4714		int i;
4715
4716		if (!try_hard && tries > zfs_metaslab_find_max_tries) {
4717			METASLABSTAT_BUMP(metaslabstat_too_many_tries);
4718			return (NULL);
4719		}
4720		tries++;
4721
4722		if (!metaslab_should_allocate(msp, asize, try_hard)) {
4723			metaslab_trace_add(zal, mg, msp, asize, d,
4724			    TRACE_TOO_SMALL, allocator);
4725			continue;
4726		}
4727
4728		/*
4729		 * If the selected metaslab is condensing or disabled, or
4730		 * hasn't gone through a metaslab_sync_done(), then skip it.
4731		 */
4732		if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new)
4733			continue;
4734
4735		*was_active = msp->ms_allocator != -1;
4736		/*
4737		 * If we're activating as primary, this is our first allocation
4738		 * from this disk, so we don't need to check how close we are.
4739		 * If the metaslab under consideration was already active,
4740		 * we're getting desperate enough to steal another allocator's
4741		 * metaslab, so we still don't care about distances.
4742		 */
4743		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
4744			break;
4745
4746		for (i = 0; i < d; i++) {
4747			if (want_unique &&
4748			    !metaslab_is_unique(msp, &dva[i]))
4749				break;  /* try another metaslab */
4750		}
4751		if (i == d)
4752			break;
4753	}
4754
4755	if (msp != NULL) {
4756		search->ms_weight = msp->ms_weight;
4757		search->ms_start = msp->ms_start + 1;
4758		search->ms_allocator = msp->ms_allocator;
4759		search->ms_primary = msp->ms_primary;
4760	}
4761	return (msp);
4762}
4763
4764static void
4765metaslab_active_mask_verify(metaslab_t *msp)
4766{
4767	ASSERT(MUTEX_HELD(&msp->ms_lock));
4768
4769	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
4770		return;
4771
4772	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
4773		return;
4774
4775	if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
4776		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4777		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4778		VERIFY3S(msp->ms_allocator, !=, -1);
4779		VERIFY(msp->ms_primary);
4780		return;
4781	}
4782
4783	if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
4784		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4785		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4786		VERIFY3S(msp->ms_allocator, !=, -1);
4787		VERIFY(!msp->ms_primary);
4788		return;
4789	}
4790
4791	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
4792		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4793		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4794		VERIFY3S(msp->ms_allocator, ==, -1);
4795		return;
4796	}
4797}
4798
4799static uint64_t
4800metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
4801    uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
4802    int allocator, boolean_t try_hard)
4803{
4804	metaslab_t *msp = NULL;
4805	uint64_t offset = -1ULL;
4806
4807	uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
4808	for (int i = 0; i < d; i++) {
4809		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4810		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
4811			activation_weight = METASLAB_WEIGHT_SECONDARY;
4812		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4813		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
4814			activation_weight = METASLAB_WEIGHT_CLAIM;
4815			break;
4816		}
4817	}
4818
4819	/*
4820	 * If we don't have enough metaslabs active to fill the entire array, we
4821	 * just use the 0th slot.
4822	 */
4823	if (mg->mg_ms_ready < mg->mg_allocators * 3)
4824		allocator = 0;
4825	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
4826
4827	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
4828
4829	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
4830	search->ms_weight = UINT64_MAX;
4831	search->ms_start = 0;
4832	/*
4833	 * At the end of the metaslab tree are the already-active metaslabs,
4834	 * first the primaries, then the secondaries. When we resume searching
4835	 * through the tree, we need to consider ms_allocator and ms_primary so
4836	 * we start in the location right after where we left off, and don't
4837	 * accidentally loop forever considering the same metaslabs.
4838	 */
4839	search->ms_allocator = -1;
4840	search->ms_primary = B_TRUE;
4841	for (;;) {
4842		boolean_t was_active = B_FALSE;
4843
4844		mutex_enter(&mg->mg_lock);
4845
4846		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4847		    mga->mga_primary != NULL) {
4848			msp = mga->mga_primary;
4849
4850			/*
4851			 * Even though we don't hold the ms_lock for the
4852			 * primary metaslab, those fields should not
4853			 * change while we hold the mg_lock. Thus it is
4854			 * safe to make assertions on them.
4855			 */
4856			ASSERT(msp->ms_primary);
4857			ASSERT3S(msp->ms_allocator, ==, allocator);
4858			ASSERT(msp->ms_loaded);
4859
4860			was_active = B_TRUE;
4861			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4862		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4863		    mga->mga_secondary != NULL) {
4864			msp = mga->mga_secondary;
4865
4866			/*
4867			 * See comment above about the similar assertions
4868			 * for the primary metaslab.
4869			 */
4870			ASSERT(!msp->ms_primary);
4871			ASSERT3S(msp->ms_allocator, ==, allocator);
4872			ASSERT(msp->ms_loaded);
4873
4874			was_active = B_TRUE;
4875			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4876		} else {
4877			msp = find_valid_metaslab(mg, activation_weight, dva, d,
4878			    want_unique, asize, allocator, try_hard, zal,
4879			    search, &was_active);
4880		}
4881
4882		mutex_exit(&mg->mg_lock);
4883		if (msp == NULL) {
4884			kmem_free(search, sizeof (*search));
4885			return (-1ULL);
4886		}
4887		mutex_enter(&msp->ms_lock);
4888
4889		metaslab_active_mask_verify(msp);
4890
4891		/*
4892		 * This code is disabled out because of issues with
4893		 * tracepoints in non-gpl kernel modules.
4894		 */
4895#if 0
4896		DTRACE_PROBE3(ms__activation__attempt,
4897		    metaslab_t *, msp, uint64_t, activation_weight,
4898		    boolean_t, was_active);
4899#endif
4900
4901		/*
4902		 * Ensure that the metaslab we have selected is still
4903		 * capable of handling our request. It's possible that
4904		 * another thread may have changed the weight while we
4905		 * were blocked on the metaslab lock. We check the
4906		 * active status first to see if we need to set_selected_txg
4907		 * a new metaslab.
4908		 */
4909		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
4910			ASSERT3S(msp->ms_allocator, ==, -1);
4911			mutex_exit(&msp->ms_lock);
4912			continue;
4913		}
4914
4915		/*
4916		 * If the metaslab was activated for another allocator
4917		 * while we were waiting in the ms_lock above, or it's
4918		 * a primary and we're seeking a secondary (or vice versa),
4919		 * we go back and select a new metaslab.
4920		 */
4921		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
4922		    (msp->ms_allocator != -1) &&
4923		    (msp->ms_allocator != allocator || ((activation_weight ==
4924		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
4925			ASSERT(msp->ms_loaded);
4926			ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
4927			    msp->ms_allocator != -1);
4928			mutex_exit(&msp->ms_lock);
4929			continue;
4930		}
4931
4932		/*
4933		 * This metaslab was used for claiming regions allocated
4934		 * by the ZIL during pool import. Once these regions are
4935		 * claimed we don't need to keep the CLAIM bit set
4936		 * anymore. Passivate this metaslab to zero its activation
4937		 * mask.
4938		 */
4939		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
4940		    activation_weight != METASLAB_WEIGHT_CLAIM) {
4941			ASSERT(msp->ms_loaded);
4942			ASSERT3S(msp->ms_allocator, ==, -1);
4943			metaslab_passivate(msp, msp->ms_weight &
4944			    ~METASLAB_WEIGHT_CLAIM);
4945			mutex_exit(&msp->ms_lock);
4946			continue;
4947		}
4948
4949		metaslab_set_selected_txg(msp, txg);
4950
4951		int activation_error =
4952		    metaslab_activate(msp, allocator, activation_weight);
4953		metaslab_active_mask_verify(msp);
4954
4955		/*
4956		 * If the metaslab was activated by another thread for
4957		 * another allocator or activation_weight (EBUSY), or it
4958		 * failed because another metaslab was assigned as primary
4959		 * for this allocator (EEXIST) we continue using this
4960		 * metaslab for our allocation, rather than going on to a
4961		 * worse metaslab (we waited for that metaslab to be loaded
4962		 * after all).
4963		 *
4964		 * If the activation failed due to an I/O error or ENOSPC we
4965		 * skip to the next metaslab.
4966		 */
4967		boolean_t activated;
4968		if (activation_error == 0) {
4969			activated = B_TRUE;
4970		} else if (activation_error == EBUSY ||
4971		    activation_error == EEXIST) {
4972			activated = B_FALSE;
4973		} else {
4974			mutex_exit(&msp->ms_lock);
4975			continue;
4976		}
4977		ASSERT(msp->ms_loaded);
4978
4979		/*
4980		 * Now that we have the lock, recheck to see if we should
4981		 * continue to use this metaslab for this allocation. The
4982		 * the metaslab is now loaded so metaslab_should_allocate()
4983		 * can accurately determine if the allocation attempt should
4984		 * proceed.
4985		 */
4986		if (!metaslab_should_allocate(msp, asize, try_hard)) {
4987			/* Passivate this metaslab and select a new one. */
4988			metaslab_trace_add(zal, mg, msp, asize, d,
4989			    TRACE_TOO_SMALL, allocator);
4990			goto next;
4991		}
4992
4993		/*
4994		 * If this metaslab is currently condensing then pick again
4995		 * as we can't manipulate this metaslab until it's committed
4996		 * to disk. If this metaslab is being initialized, we shouldn't
4997		 * allocate from it since the allocated region might be
4998		 * overwritten after allocation.
4999		 */
5000		if (msp->ms_condensing) {
5001			metaslab_trace_add(zal, mg, msp, asize, d,
5002			    TRACE_CONDENSING, allocator);
5003			if (activated) {
5004				metaslab_passivate(msp, msp->ms_weight &
5005				    ~METASLAB_ACTIVE_MASK);
5006			}
5007			mutex_exit(&msp->ms_lock);
5008			continue;
5009		} else if (msp->ms_disabled > 0) {
5010			metaslab_trace_add(zal, mg, msp, asize, d,
5011			    TRACE_DISABLED, allocator);
5012			if (activated) {
5013				metaslab_passivate(msp, msp->ms_weight &
5014				    ~METASLAB_ACTIVE_MASK);
5015			}
5016			mutex_exit(&msp->ms_lock);
5017			continue;
5018		}
5019
5020		offset = metaslab_block_alloc(msp, asize, txg);
5021		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
5022
5023		if (offset != -1ULL) {
5024			/* Proactively passivate the metaslab, if needed */
5025			if (activated)
5026				metaslab_segment_may_passivate(msp);
5027			break;
5028		}
5029next:
5030		ASSERT(msp->ms_loaded);
5031
5032		/*
5033		 * This code is disabled out because of issues with
5034		 * tracepoints in non-gpl kernel modules.
5035		 */
5036#if 0
5037		DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
5038		    uint64_t, asize);
5039#endif
5040
5041		/*
5042		 * We were unable to allocate from this metaslab so determine
5043		 * a new weight for this metaslab. Now that we have loaded
5044		 * the metaslab we can provide a better hint to the metaslab
5045		 * selector.
5046		 *
5047		 * For space-based metaslabs, we use the maximum block size.
5048		 * This information is only available when the metaslab
5049		 * is loaded and is more accurate than the generic free
5050		 * space weight that was calculated by metaslab_weight().
5051		 * This information allows us to quickly compare the maximum
5052		 * available allocation in the metaslab to the allocation
5053		 * size being requested.
5054		 *
5055		 * For segment-based metaslabs, determine the new weight
5056		 * based on the highest bucket in the range tree. We
5057		 * explicitly use the loaded segment weight (i.e. the range
5058		 * tree histogram) since it contains the space that is
5059		 * currently available for allocation and is accurate
5060		 * even within a sync pass.
5061		 */
5062		uint64_t weight;
5063		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
5064			weight = metaslab_largest_allocatable(msp);
5065			WEIGHT_SET_SPACEBASED(weight);
5066		} else {
5067			weight = metaslab_weight_from_range_tree(msp);
5068		}
5069
5070		if (activated) {
5071			metaslab_passivate(msp, weight);
5072		} else {
5073			/*
5074			 * For the case where we use the metaslab that is
5075			 * active for another allocator we want to make
5076			 * sure that we retain the activation mask.
5077			 *
5078			 * Note that we could attempt to use something like
5079			 * metaslab_recalculate_weight_and_sort() that
5080			 * retains the activation mask here. That function
5081			 * uses metaslab_weight() to set the weight though
5082			 * which is not as accurate as the calculations
5083			 * above.
5084			 */
5085			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
5086			metaslab_group_sort(mg, msp, weight);
5087		}
5088		metaslab_active_mask_verify(msp);
5089
5090		/*
5091		 * We have just failed an allocation attempt, check
5092		 * that metaslab_should_allocate() agrees. Otherwise,
5093		 * we may end up in an infinite loop retrying the same
5094		 * metaslab.
5095		 */
5096		ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
5097
5098		mutex_exit(&msp->ms_lock);
5099	}
5100	mutex_exit(&msp->ms_lock);
5101	kmem_free(search, sizeof (*search));
5102	return (offset);
5103}
5104
5105static uint64_t
5106metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
5107    uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
5108    int allocator, boolean_t try_hard)
5109{
5110	uint64_t offset;
5111
5112	offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
5113	    dva, d, allocator, try_hard);
5114
5115	mutex_enter(&mg->mg_lock);
5116	if (offset == -1ULL) {
5117		mg->mg_failed_allocations++;
5118		metaslab_trace_add(zal, mg, NULL, asize, d,
5119		    TRACE_GROUP_FAILURE, allocator);
5120		if (asize == SPA_GANGBLOCKSIZE) {
5121			/*
5122			 * This metaslab group was unable to allocate
5123			 * the minimum gang block size so it must be out of
5124			 * space. We must notify the allocation throttle
5125			 * to start skipping allocation attempts to this
5126			 * metaslab group until more space becomes available.
5127			 * Note: this failure cannot be caused by the
5128			 * allocation throttle since the allocation throttle
5129			 * is only responsible for skipping devices and
5130			 * not failing block allocations.
5131			 */
5132			mg->mg_no_free_space = B_TRUE;
5133		}
5134	}
5135	mg->mg_allocations++;
5136	mutex_exit(&mg->mg_lock);
5137	return (offset);
5138}
5139
5140/*
5141 * Allocate a block for the specified i/o.
5142 */
5143int
5144metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
5145    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
5146    zio_alloc_list_t *zal, int allocator)
5147{
5148	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5149	metaslab_group_t *mg, *rotor;
5150	vdev_t *vd;
5151	boolean_t try_hard = B_FALSE;
5152
5153	ASSERT(!DVA_IS_VALID(&dva[d]));
5154
5155	/*
5156	 * For testing, make some blocks above a certain size be gang blocks.
5157	 * This will result in more split blocks when using device removal,
5158	 * and a large number of split blocks coupled with ztest-induced
5159	 * damage can result in extremely long reconstruction times.  This
5160	 * will also test spilling from special to normal.
5161	 */
5162	if (psize >= metaslab_force_ganging &&
5163	    metaslab_force_ganging_pct > 0 &&
5164	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
5165		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
5166		    allocator);
5167		return (SET_ERROR(ENOSPC));
5168	}
5169
5170	/*
5171	 * Start at the rotor and loop through all mgs until we find something.
5172	 * Note that there's no locking on mca_rotor or mca_aliquot because
5173	 * nothing actually breaks if we miss a few updates -- we just won't
5174	 * allocate quite as evenly.  It all balances out over time.
5175	 *
5176	 * If we are doing ditto or log blocks, try to spread them across
5177	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
5178	 * allocated all of our ditto blocks, then try and spread them out on
5179	 * that vdev as much as possible.  If it turns out to not be possible,
5180	 * gradually lower our standards until anything becomes acceptable.
5181	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
5182	 * gives us hope of containing our fault domains to something we're
5183	 * able to reason about.  Otherwise, any two top-level vdev failures
5184	 * will guarantee the loss of data.  With consecutive allocation,
5185	 * only two adjacent top-level vdev failures will result in data loss.
5186	 *
5187	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
5188	 * ourselves on the same vdev as our gang block header.  That
5189	 * way, we can hope for locality in vdev_cache, plus it makes our
5190	 * fault domains something tractable.
5191	 */
5192	if (hintdva) {
5193		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
5194
5195		/*
5196		 * It's possible the vdev we're using as the hint no
5197		 * longer exists or its mg has been closed (e.g. by
5198		 * device removal).  Consult the rotor when
5199		 * all else fails.
5200		 */
5201		if (vd != NULL && vd->vdev_mg != NULL) {
5202			mg = vdev_get_mg(vd, mc);
5203
5204			if (flags & METASLAB_HINTBP_AVOID)
5205				mg = mg->mg_next;
5206		} else {
5207			mg = mca->mca_rotor;
5208		}
5209	} else if (d != 0) {
5210		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
5211		mg = vd->vdev_mg->mg_next;
5212	} else {
5213		ASSERT(mca->mca_rotor != NULL);
5214		mg = mca->mca_rotor;
5215	}
5216
5217	/*
5218	 * If the hint put us into the wrong metaslab class, or into a
5219	 * metaslab group that has been passivated, just follow the rotor.
5220	 */
5221	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
5222		mg = mca->mca_rotor;
5223
5224	rotor = mg;
5225top:
5226	do {
5227		boolean_t allocatable;
5228
5229		ASSERT(mg->mg_activation_count == 1);
5230		vd = mg->mg_vd;
5231
5232		/*
5233		 * Don't allocate from faulted devices.
5234		 */
5235		if (try_hard) {
5236			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
5237			allocatable = vdev_allocatable(vd);
5238			spa_config_exit(spa, SCL_ZIO, FTAG);
5239		} else {
5240			allocatable = vdev_allocatable(vd);
5241		}
5242
5243		/*
5244		 * Determine if the selected metaslab group is eligible
5245		 * for allocations. If we're ganging then don't allow
5246		 * this metaslab group to skip allocations since that would
5247		 * inadvertently return ENOSPC and suspend the pool
5248		 * even though space is still available.
5249		 */
5250		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
5251			allocatable = metaslab_group_allocatable(mg, rotor,
5252			    flags, psize, allocator, d);
5253		}
5254
5255		if (!allocatable) {
5256			metaslab_trace_add(zal, mg, NULL, psize, d,
5257			    TRACE_NOT_ALLOCATABLE, allocator);
5258			goto next;
5259		}
5260
5261		/*
5262		 * Avoid writing single-copy data to an unhealthy,
5263		 * non-redundant vdev, unless we've already tried all
5264		 * other vdevs.
5265		 */
5266		if (vd->vdev_state < VDEV_STATE_HEALTHY &&
5267		    d == 0 && !try_hard && vd->vdev_children == 0) {
5268			metaslab_trace_add(zal, mg, NULL, psize, d,
5269			    TRACE_VDEV_ERROR, allocator);
5270			goto next;
5271		}
5272
5273		ASSERT(mg->mg_class == mc);
5274
5275		uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
5276		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
5277
5278		/*
5279		 * If we don't need to try hard, then require that the
5280		 * block be on a different metaslab from any other DVAs
5281		 * in this BP (unique=true).  If we are trying hard, then
5282		 * allow any metaslab to be used (unique=false).
5283		 */
5284		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
5285		    !try_hard, dva, d, allocator, try_hard);
5286
5287		if (offset != -1ULL) {
5288			/*
5289			 * If we've just selected this metaslab group,
5290			 * figure out whether the corresponding vdev is
5291			 * over- or under-used relative to the pool,
5292			 * and set an allocation bias to even it out.
5293			 *
5294			 * Bias is also used to compensate for unequally
5295			 * sized vdevs so that space is allocated fairly.
5296			 */
5297			if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
5298				vdev_stat_t *vs = &vd->vdev_stat;
5299				int64_t vs_free = vs->vs_space - vs->vs_alloc;
5300				int64_t mc_free = mc->mc_space - mc->mc_alloc;
5301				int64_t ratio;
5302
5303				/*
5304				 * Calculate how much more or less we should
5305				 * try to allocate from this device during
5306				 * this iteration around the rotor.
5307				 *
5308				 * This basically introduces a zero-centered
5309				 * bias towards the devices with the most
5310				 * free space, while compensating for vdev
5311				 * size differences.
5312				 *
5313				 * Examples:
5314				 *  vdev V1 = 16M/128M
5315				 *  vdev V2 = 16M/128M
5316				 *  ratio(V1) = 100% ratio(V2) = 100%
5317				 *
5318				 *  vdev V1 = 16M/128M
5319				 *  vdev V2 = 64M/128M
5320				 *  ratio(V1) = 127% ratio(V2) =  72%
5321				 *
5322				 *  vdev V1 = 16M/128M
5323				 *  vdev V2 = 64M/512M
5324				 *  ratio(V1) =  40% ratio(V2) = 160%
5325				 */
5326				ratio = (vs_free * mc->mc_alloc_groups * 100) /
5327				    (mc_free + 1);
5328				mg->mg_bias = ((ratio - 100) *
5329				    (int64_t)mg->mg_aliquot) / 100;
5330			} else if (!metaslab_bias_enabled) {
5331				mg->mg_bias = 0;
5332			}
5333
5334			if ((flags & METASLAB_ZIL) ||
5335			    atomic_add_64_nv(&mca->mca_aliquot, asize) >=
5336			    mg->mg_aliquot + mg->mg_bias) {
5337				mca->mca_rotor = mg->mg_next;
5338				mca->mca_aliquot = 0;
5339			}
5340
5341			DVA_SET_VDEV(&dva[d], vd->vdev_id);
5342			DVA_SET_OFFSET(&dva[d], offset);
5343			DVA_SET_GANG(&dva[d],
5344			    ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
5345			DVA_SET_ASIZE(&dva[d], asize);
5346
5347			return (0);
5348		}
5349next:
5350		mca->mca_rotor = mg->mg_next;
5351		mca->mca_aliquot = 0;
5352	} while ((mg = mg->mg_next) != rotor);
5353
5354	/*
5355	 * If we haven't tried hard, perhaps do so now.
5356	 */
5357	if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
5358	    GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
5359	    psize <= 1 << spa->spa_min_ashift)) {
5360		METASLABSTAT_BUMP(metaslabstat_try_hard);
5361		try_hard = B_TRUE;
5362		goto top;
5363	}
5364
5365	memset(&dva[d], 0, sizeof (dva_t));
5366
5367	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
5368	return (SET_ERROR(ENOSPC));
5369}
5370
5371void
5372metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
5373    boolean_t checkpoint)
5374{
5375	metaslab_t *msp;
5376	spa_t *spa = vd->vdev_spa;
5377
5378	ASSERT(vdev_is_concrete(vd));
5379	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5380	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
5381
5382	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5383
5384	VERIFY(!msp->ms_condensing);
5385	VERIFY3U(offset, >=, msp->ms_start);
5386	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
5387	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5388	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
5389
5390	metaslab_check_free_impl(vd, offset, asize);
5391
5392	mutex_enter(&msp->ms_lock);
5393	if (range_tree_is_empty(msp->ms_freeing) &&
5394	    range_tree_is_empty(msp->ms_checkpointing)) {
5395		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
5396	}
5397
5398	if (checkpoint) {
5399		ASSERT(spa_has_checkpoint(spa));
5400		range_tree_add(msp->ms_checkpointing, offset, asize);
5401	} else {
5402		range_tree_add(msp->ms_freeing, offset, asize);
5403	}
5404	mutex_exit(&msp->ms_lock);
5405}
5406
5407void
5408metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5409    uint64_t size, void *arg)
5410{
5411	(void) inner_offset;
5412	boolean_t *checkpoint = arg;
5413
5414	ASSERT3P(checkpoint, !=, NULL);
5415
5416	if (vd->vdev_ops->vdev_op_remap != NULL)
5417		vdev_indirect_mark_obsolete(vd, offset, size);
5418	else
5419		metaslab_free_impl(vd, offset, size, *checkpoint);
5420}
5421
5422static void
5423metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
5424    boolean_t checkpoint)
5425{
5426	spa_t *spa = vd->vdev_spa;
5427
5428	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5429
5430	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
5431		return;
5432
5433	if (spa->spa_vdev_removal != NULL &&
5434	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
5435	    vdev_is_concrete(vd)) {
5436		/*
5437		 * Note: we check if the vdev is concrete because when
5438		 * we complete the removal, we first change the vdev to be
5439		 * an indirect vdev (in open context), and then (in syncing
5440		 * context) clear spa_vdev_removal.
5441		 */
5442		free_from_removing_vdev(vd, offset, size);
5443	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
5444		vdev_indirect_mark_obsolete(vd, offset, size);
5445		vd->vdev_ops->vdev_op_remap(vd, offset, size,
5446		    metaslab_free_impl_cb, &checkpoint);
5447	} else {
5448		metaslab_free_concrete(vd, offset, size, checkpoint);
5449	}
5450}
5451
5452typedef struct remap_blkptr_cb_arg {
5453	blkptr_t *rbca_bp;
5454	spa_remap_cb_t rbca_cb;
5455	vdev_t *rbca_remap_vd;
5456	uint64_t rbca_remap_offset;
5457	void *rbca_cb_arg;
5458} remap_blkptr_cb_arg_t;
5459
5460static void
5461remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5462    uint64_t size, void *arg)
5463{
5464	remap_blkptr_cb_arg_t *rbca = arg;
5465	blkptr_t *bp = rbca->rbca_bp;
5466
5467	/* We can not remap split blocks. */
5468	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
5469		return;
5470	ASSERT0(inner_offset);
5471
5472	if (rbca->rbca_cb != NULL) {
5473		/*
5474		 * At this point we know that we are not handling split
5475		 * blocks and we invoke the callback on the previous
5476		 * vdev which must be indirect.
5477		 */
5478		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
5479
5480		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
5481		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
5482
5483		/* set up remap_blkptr_cb_arg for the next call */
5484		rbca->rbca_remap_vd = vd;
5485		rbca->rbca_remap_offset = offset;
5486	}
5487
5488	/*
5489	 * The phys birth time is that of dva[0].  This ensures that we know
5490	 * when each dva was written, so that resilver can determine which
5491	 * blocks need to be scrubbed (i.e. those written during the time
5492	 * the vdev was offline).  It also ensures that the key used in
5493	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
5494	 * we didn't change the phys_birth, a lookup in the ARC for a
5495	 * remapped BP could find the data that was previously stored at
5496	 * this vdev + offset.
5497	 */
5498	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
5499	    DVA_GET_VDEV(&bp->blk_dva[0]));
5500	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
5501	uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
5502	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
5503	BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
5504
5505	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
5506	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
5507}
5508
5509/*
5510 * If the block pointer contains any indirect DVAs, modify them to refer to
5511 * concrete DVAs.  Note that this will sometimes not be possible, leaving
5512 * the indirect DVA in place.  This happens if the indirect DVA spans multiple
5513 * segments in the mapping (i.e. it is a "split block").
5514 *
5515 * If the BP was remapped, calls the callback on the original dva (note the
5516 * callback can be called multiple times if the original indirect DVA refers
5517 * to another indirect DVA, etc).
5518 *
5519 * Returns TRUE if the BP was remapped.
5520 */
5521boolean_t
5522spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
5523{
5524	remap_blkptr_cb_arg_t rbca;
5525
5526	if (!zfs_remap_blkptr_enable)
5527		return (B_FALSE);
5528
5529	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
5530		return (B_FALSE);
5531
5532	/*
5533	 * Dedup BP's can not be remapped, because ddt_phys_select() depends
5534	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
5535	 */
5536	if (BP_GET_DEDUP(bp))
5537		return (B_FALSE);
5538
5539	/*
5540	 * Gang blocks can not be remapped, because
5541	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
5542	 * the BP used to read the gang block header (GBH) being the same
5543	 * as the DVA[0] that we allocated for the GBH.
5544	 */
5545	if (BP_IS_GANG(bp))
5546		return (B_FALSE);
5547
5548	/*
5549	 * Embedded BP's have no DVA to remap.
5550	 */
5551	if (BP_GET_NDVAS(bp) < 1)
5552		return (B_FALSE);
5553
5554	/*
5555	 * Note: we only remap dva[0].  If we remapped other dvas, we
5556	 * would no longer know what their phys birth txg is.
5557	 */
5558	dva_t *dva = &bp->blk_dva[0];
5559
5560	uint64_t offset = DVA_GET_OFFSET(dva);
5561	uint64_t size = DVA_GET_ASIZE(dva);
5562	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
5563
5564	if (vd->vdev_ops->vdev_op_remap == NULL)
5565		return (B_FALSE);
5566
5567	rbca.rbca_bp = bp;
5568	rbca.rbca_cb = callback;
5569	rbca.rbca_remap_vd = vd;
5570	rbca.rbca_remap_offset = offset;
5571	rbca.rbca_cb_arg = arg;
5572
5573	/*
5574	 * remap_blkptr_cb() will be called in order for each level of
5575	 * indirection, until a concrete vdev is reached or a split block is
5576	 * encountered. old_vd and old_offset are updated within the callback
5577	 * as we go from the one indirect vdev to the next one (either concrete
5578	 * or indirect again) in that order.
5579	 */
5580	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
5581
5582	/* Check if the DVA wasn't remapped because it is a split block */
5583	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
5584		return (B_FALSE);
5585
5586	return (B_TRUE);
5587}
5588
5589/*
5590 * Undo the allocation of a DVA which happened in the given transaction group.
5591 */
5592void
5593metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5594{
5595	metaslab_t *msp;
5596	vdev_t *vd;
5597	uint64_t vdev = DVA_GET_VDEV(dva);
5598	uint64_t offset = DVA_GET_OFFSET(dva);
5599	uint64_t size = DVA_GET_ASIZE(dva);
5600
5601	ASSERT(DVA_IS_VALID(dva));
5602	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5603
5604	if (txg > spa_freeze_txg(spa))
5605		return;
5606
5607	if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
5608	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
5609		zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
5610		    (u_longlong_t)vdev, (u_longlong_t)offset,
5611		    (u_longlong_t)size);
5612		return;
5613	}
5614
5615	ASSERT(!vd->vdev_removing);
5616	ASSERT(vdev_is_concrete(vd));
5617	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
5618	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
5619
5620	if (DVA_GET_GANG(dva))
5621		size = vdev_gang_header_asize(vd);
5622
5623	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5624
5625	mutex_enter(&msp->ms_lock);
5626	range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
5627	    offset, size);
5628	msp->ms_allocating_total -= size;
5629
5630	VERIFY(!msp->ms_condensing);
5631	VERIFY3U(offset, >=, msp->ms_start);
5632	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
5633	VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
5634	    msp->ms_size);
5635	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5636	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5637	range_tree_add(msp->ms_allocatable, offset, size);
5638	mutex_exit(&msp->ms_lock);
5639}
5640
5641/*
5642 * Free the block represented by the given DVA.
5643 */
5644void
5645metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
5646{
5647	uint64_t vdev = DVA_GET_VDEV(dva);
5648	uint64_t offset = DVA_GET_OFFSET(dva);
5649	uint64_t size = DVA_GET_ASIZE(dva);
5650	vdev_t *vd = vdev_lookup_top(spa, vdev);
5651
5652	ASSERT(DVA_IS_VALID(dva));
5653	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5654
5655	if (DVA_GET_GANG(dva)) {
5656		size = vdev_gang_header_asize(vd);
5657	}
5658
5659	metaslab_free_impl(vd, offset, size, checkpoint);
5660}
5661
5662/*
5663 * Reserve some allocation slots. The reservation system must be called
5664 * before we call into the allocator. If there aren't any available slots
5665 * then the I/O will be throttled until an I/O completes and its slots are
5666 * freed up. The function returns true if it was successful in placing
5667 * the reservation.
5668 */
5669boolean_t
5670metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
5671    zio_t *zio, int flags)
5672{
5673	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5674	uint64_t max = mca->mca_alloc_max_slots;
5675
5676	ASSERT(mc->mc_alloc_throttle_enabled);
5677	if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
5678	    zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
5679		/*
5680		 * The potential race between _count() and _add() is covered
5681		 * by the allocator lock in most cases, or irrelevant due to
5682		 * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others.
5683		 * But even if we assume some other non-existing scenario, the
5684		 * worst that can happen is few more I/Os get to allocation
5685		 * earlier, that is not a problem.
5686		 *
5687		 * We reserve the slots individually so that we can unreserve
5688		 * them individually when an I/O completes.
5689		 */
5690		zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
5691		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
5692		return (B_TRUE);
5693	}
5694	return (B_FALSE);
5695}
5696
5697void
5698metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
5699    int allocator, zio_t *zio)
5700{
5701	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
5702
5703	ASSERT(mc->mc_alloc_throttle_enabled);
5704	zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio);
5705}
5706
5707static int
5708metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
5709    uint64_t txg)
5710{
5711	metaslab_t *msp;
5712	spa_t *spa = vd->vdev_spa;
5713	int error = 0;
5714
5715	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
5716		return (SET_ERROR(ENXIO));
5717
5718	ASSERT3P(vd->vdev_ms, !=, NULL);
5719	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5720
5721	mutex_enter(&msp->ms_lock);
5722
5723	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
5724		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
5725		if (error == EBUSY) {
5726			ASSERT(msp->ms_loaded);
5727			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
5728			error = 0;
5729		}
5730	}
5731
5732	if (error == 0 &&
5733	    !range_tree_contains(msp->ms_allocatable, offset, size))
5734		error = SET_ERROR(ENOENT);
5735
5736	if (error || txg == 0) {	/* txg == 0 indicates dry run */
5737		mutex_exit(&msp->ms_lock);
5738		return (error);
5739	}
5740
5741	VERIFY(!msp->ms_condensing);
5742	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5743	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5744	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
5745	    msp->ms_size);
5746	range_tree_remove(msp->ms_allocatable, offset, size);
5747	range_tree_clear(msp->ms_trim, offset, size);
5748
5749	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(8) */
5750		metaslab_class_t *mc = msp->ms_group->mg_class;
5751		multilist_sublist_t *mls =
5752		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
5753		if (!multilist_link_active(&msp->ms_class_txg_node)) {
5754			msp->ms_selected_txg = txg;
5755			multilist_sublist_insert_head(mls, msp);
5756		}
5757		multilist_sublist_unlock(mls);
5758
5759		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
5760			vdev_dirty(vd, VDD_METASLAB, msp, txg);
5761		range_tree_add(msp->ms_allocating[txg & TXG_MASK],
5762		    offset, size);
5763		msp->ms_allocating_total += size;
5764	}
5765
5766	mutex_exit(&msp->ms_lock);
5767
5768	return (0);
5769}
5770
5771typedef struct metaslab_claim_cb_arg_t {
5772	uint64_t	mcca_txg;
5773	int		mcca_error;
5774} metaslab_claim_cb_arg_t;
5775
5776static void
5777metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5778    uint64_t size, void *arg)
5779{
5780	(void) inner_offset;
5781	metaslab_claim_cb_arg_t *mcca_arg = arg;
5782
5783	if (mcca_arg->mcca_error == 0) {
5784		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
5785		    size, mcca_arg->mcca_txg);
5786	}
5787}
5788
5789int
5790metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
5791{
5792	if (vd->vdev_ops->vdev_op_remap != NULL) {
5793		metaslab_claim_cb_arg_t arg;
5794
5795		/*
5796		 * Only zdb(8) can claim on indirect vdevs.  This is used
5797		 * to detect leaks of mapped space (that are not accounted
5798		 * for in the obsolete counts, spacemap, or bpobj).
5799		 */
5800		ASSERT(!spa_writeable(vd->vdev_spa));
5801		arg.mcca_error = 0;
5802		arg.mcca_txg = txg;
5803
5804		vd->vdev_ops->vdev_op_remap(vd, offset, size,
5805		    metaslab_claim_impl_cb, &arg);
5806
5807		if (arg.mcca_error == 0) {
5808			arg.mcca_error = metaslab_claim_concrete(vd,
5809			    offset, size, txg);
5810		}
5811		return (arg.mcca_error);
5812	} else {
5813		return (metaslab_claim_concrete(vd, offset, size, txg));
5814	}
5815}
5816
5817/*
5818 * Intent log support: upon opening the pool after a crash, notify the SPA
5819 * of blocks that the intent log has allocated for immediate write, but
5820 * which are still considered free by the SPA because the last transaction
5821 * group didn't commit yet.
5822 */
5823static int
5824metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5825{
5826	uint64_t vdev = DVA_GET_VDEV(dva);
5827	uint64_t offset = DVA_GET_OFFSET(dva);
5828	uint64_t size = DVA_GET_ASIZE(dva);
5829	vdev_t *vd;
5830
5831	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
5832		return (SET_ERROR(ENXIO));
5833	}
5834
5835	ASSERT(DVA_IS_VALID(dva));
5836
5837	if (DVA_GET_GANG(dva))
5838		size = vdev_gang_header_asize(vd);
5839
5840	return (metaslab_claim_impl(vd, offset, size, txg));
5841}
5842
5843int
5844metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
5845    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
5846    zio_alloc_list_t *zal, zio_t *zio, int allocator)
5847{
5848	dva_t *dva = bp->blk_dva;
5849	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
5850	int error = 0;
5851
5852	ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
5853	ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
5854
5855	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5856
5857	if (mc->mc_allocator[allocator].mca_rotor == NULL) {
5858		/* no vdevs in this class */
5859		spa_config_exit(spa, SCL_ALLOC, FTAG);
5860		return (SET_ERROR(ENOSPC));
5861	}
5862
5863	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
5864	ASSERT(BP_GET_NDVAS(bp) == 0);
5865	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
5866	ASSERT3P(zal, !=, NULL);
5867
5868	for (int d = 0; d < ndvas; d++) {
5869		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
5870		    txg, flags, zal, allocator);
5871		if (error != 0) {
5872			for (d--; d >= 0; d--) {
5873				metaslab_unalloc_dva(spa, &dva[d], txg);
5874				metaslab_group_alloc_decrement(spa,
5875				    DVA_GET_VDEV(&dva[d]), zio, flags,
5876				    allocator, B_FALSE);
5877				memset(&dva[d], 0, sizeof (dva_t));
5878			}
5879			spa_config_exit(spa, SCL_ALLOC, FTAG);
5880			return (error);
5881		} else {
5882			/*
5883			 * Update the metaslab group's queue depth
5884			 * based on the newly allocated dva.
5885			 */
5886			metaslab_group_alloc_increment(spa,
5887			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
5888		}
5889	}
5890	ASSERT(error == 0);
5891	ASSERT(BP_GET_NDVAS(bp) == ndvas);
5892
5893	spa_config_exit(spa, SCL_ALLOC, FTAG);
5894
5895	BP_SET_BIRTH(bp, txg, 0);
5896
5897	return (0);
5898}
5899
5900void
5901metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
5902{
5903	const dva_t *dva = bp->blk_dva;
5904	int ndvas = BP_GET_NDVAS(bp);
5905
5906	ASSERT(!BP_IS_HOLE(bp));
5907	ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
5908
5909	/*
5910	 * If we have a checkpoint for the pool we need to make sure that
5911	 * the blocks that we free that are part of the checkpoint won't be
5912	 * reused until the checkpoint is discarded or we revert to it.
5913	 *
5914	 * The checkpoint flag is passed down the metaslab_free code path
5915	 * and is set whenever we want to add a block to the checkpoint's
5916	 * accounting. That is, we "checkpoint" blocks that existed at the
5917	 * time the checkpoint was created and are therefore referenced by
5918	 * the checkpointed uberblock.
5919	 *
5920	 * Note that, we don't checkpoint any blocks if the current
5921	 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
5922	 * normally as they will be referenced by the checkpointed uberblock.
5923	 */
5924	boolean_t checkpoint = B_FALSE;
5925	if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
5926	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
5927		/*
5928		 * At this point, if the block is part of the checkpoint
5929		 * there is no way it was created in the current txg.
5930		 */
5931		ASSERT(!now);
5932		ASSERT3U(spa_syncing_txg(spa), ==, txg);
5933		checkpoint = B_TRUE;
5934	}
5935
5936	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
5937
5938	for (int d = 0; d < ndvas; d++) {
5939		if (now) {
5940			metaslab_unalloc_dva(spa, &dva[d], txg);
5941		} else {
5942			ASSERT3U(txg, ==, spa_syncing_txg(spa));
5943			metaslab_free_dva(spa, &dva[d], checkpoint);
5944		}
5945	}
5946
5947	spa_config_exit(spa, SCL_FREE, FTAG);
5948}
5949
5950int
5951metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
5952{
5953	const dva_t *dva = bp->blk_dva;
5954	int ndvas = BP_GET_NDVAS(bp);
5955	int error = 0;
5956
5957	ASSERT(!BP_IS_HOLE(bp));
5958
5959	if (txg != 0) {
5960		/*
5961		 * First do a dry run to make sure all DVAs are claimable,
5962		 * so we don't have to unwind from partial failures below.
5963		 */
5964		if ((error = metaslab_claim(spa, bp, 0)) != 0)
5965			return (error);
5966	}
5967
5968	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5969
5970	for (int d = 0; d < ndvas; d++) {
5971		error = metaslab_claim_dva(spa, &dva[d], txg);
5972		if (error != 0)
5973			break;
5974	}
5975
5976	spa_config_exit(spa, SCL_ALLOC, FTAG);
5977
5978	ASSERT(error == 0 || txg == 0);
5979
5980	return (error);
5981}
5982
5983static void
5984metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
5985    uint64_t size, void *arg)
5986{
5987	(void) inner, (void) arg;
5988
5989	if (vd->vdev_ops == &vdev_indirect_ops)
5990		return;
5991
5992	metaslab_check_free_impl(vd, offset, size);
5993}
5994
5995static void
5996metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
5997{
5998	metaslab_t *msp;
5999	spa_t *spa __maybe_unused = vd->vdev_spa;
6000
6001	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
6002		return;
6003
6004	if (vd->vdev_ops->vdev_op_remap != NULL) {
6005		vd->vdev_ops->vdev_op_remap(vd, offset, size,
6006		    metaslab_check_free_impl_cb, NULL);
6007		return;
6008	}
6009
6010	ASSERT(vdev_is_concrete(vd));
6011	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
6012	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
6013
6014	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
6015
6016	mutex_enter(&msp->ms_lock);
6017	if (msp->ms_loaded) {
6018		range_tree_verify_not_present(msp->ms_allocatable,
6019		    offset, size);
6020	}
6021
6022	/*
6023	 * Check all segments that currently exist in the freeing pipeline.
6024	 *
6025	 * It would intuitively make sense to also check the current allocating
6026	 * tree since metaslab_unalloc_dva() exists for extents that are
6027	 * allocated and freed in the same sync pass within the same txg.
6028	 * Unfortunately there are places (e.g. the ZIL) where we allocate a
6029	 * segment but then we free part of it within the same txg
6030	 * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
6031	 * current allocating tree.
6032	 */
6033	range_tree_verify_not_present(msp->ms_freeing, offset, size);
6034	range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
6035	range_tree_verify_not_present(msp->ms_freed, offset, size);
6036	for (int j = 0; j < TXG_DEFER_SIZE; j++)
6037		range_tree_verify_not_present(msp->ms_defer[j], offset, size);
6038	range_tree_verify_not_present(msp->ms_trim, offset, size);
6039	mutex_exit(&msp->ms_lock);
6040}
6041
6042void
6043metaslab_check_free(spa_t *spa, const blkptr_t *bp)
6044{
6045	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
6046		return;
6047
6048	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
6049	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
6050		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
6051		vdev_t *vd = vdev_lookup_top(spa, vdev);
6052		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
6053		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
6054
6055		if (DVA_GET_GANG(&bp->blk_dva[i]))
6056			size = vdev_gang_header_asize(vd);
6057
6058		ASSERT3P(vd, !=, NULL);
6059
6060		metaslab_check_free_impl(vd, offset, size);
6061	}
6062	spa_config_exit(spa, SCL_VDEV, FTAG);
6063}
6064
6065static void
6066metaslab_group_disable_wait(metaslab_group_t *mg)
6067{
6068	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
6069	while (mg->mg_disabled_updating) {
6070		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
6071	}
6072}
6073
6074static void
6075metaslab_group_disabled_increment(metaslab_group_t *mg)
6076{
6077	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
6078	ASSERT(mg->mg_disabled_updating);
6079
6080	while (mg->mg_ms_disabled >= max_disabled_ms) {
6081		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
6082	}
6083	mg->mg_ms_disabled++;
6084	ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
6085}
6086
6087/*
6088 * Mark the metaslab as disabled to prevent any allocations on this metaslab.
6089 * We must also track how many metaslabs are currently disabled within a
6090 * metaslab group and limit them to prevent allocation failures from
6091 * occurring because all metaslabs are disabled.
6092 */
6093void
6094metaslab_disable(metaslab_t *msp)
6095{
6096	ASSERT(!MUTEX_HELD(&msp->ms_lock));
6097	metaslab_group_t *mg = msp->ms_group;
6098
6099	mutex_enter(&mg->mg_ms_disabled_lock);
6100
6101	/*
6102	 * To keep an accurate count of how many threads have disabled
6103	 * a specific metaslab group, we only allow one thread to mark
6104	 * the metaslab group at a time. This ensures that the value of
6105	 * ms_disabled will be accurate when we decide to mark a metaslab
6106	 * group as disabled. To do this we force all other threads
6107	 * to wait till the metaslab's mg_disabled_updating flag is no
6108	 * longer set.
6109	 */
6110	metaslab_group_disable_wait(mg);
6111	mg->mg_disabled_updating = B_TRUE;
6112	if (msp->ms_disabled == 0) {
6113		metaslab_group_disabled_increment(mg);
6114	}
6115	mutex_enter(&msp->ms_lock);
6116	msp->ms_disabled++;
6117	mutex_exit(&msp->ms_lock);
6118
6119	mg->mg_disabled_updating = B_FALSE;
6120	cv_broadcast(&mg->mg_ms_disabled_cv);
6121	mutex_exit(&mg->mg_ms_disabled_lock);
6122}
6123
6124void
6125metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
6126{
6127	metaslab_group_t *mg = msp->ms_group;
6128	spa_t *spa = mg->mg_vd->vdev_spa;
6129
6130	/*
6131	 * Wait for the outstanding IO to be synced to prevent newly
6132	 * allocated blocks from being overwritten.  This used by
6133	 * initialize and TRIM which are modifying unallocated space.
6134	 */
6135	if (sync)
6136		txg_wait_synced(spa_get_dsl(spa), 0);
6137
6138	mutex_enter(&mg->mg_ms_disabled_lock);
6139	mutex_enter(&msp->ms_lock);
6140	if (--msp->ms_disabled == 0) {
6141		mg->mg_ms_disabled--;
6142		cv_broadcast(&mg->mg_ms_disabled_cv);
6143		if (unload)
6144			metaslab_unload(msp);
6145	}
6146	mutex_exit(&msp->ms_lock);
6147	mutex_exit(&mg->mg_ms_disabled_lock);
6148}
6149
6150void
6151metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
6152{
6153	ms->ms_unflushed_dirty = dirty;
6154}
6155
6156static void
6157metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
6158{
6159	vdev_t *vd = ms->ms_group->mg_vd;
6160	spa_t *spa = vd->vdev_spa;
6161	objset_t *mos = spa_meta_objset(spa);
6162
6163	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
6164
6165	metaslab_unflushed_phys_t entry = {
6166		.msp_unflushed_txg = metaslab_unflushed_txg(ms),
6167	};
6168	uint64_t entry_size = sizeof (entry);
6169	uint64_t entry_offset = ms->ms_id * entry_size;
6170
6171	uint64_t object = 0;
6172	int err = zap_lookup(mos, vd->vdev_top_zap,
6173	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
6174	    &object);
6175	if (err == ENOENT) {
6176		object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
6177		    SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
6178		VERIFY0(zap_add(mos, vd->vdev_top_zap,
6179		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
6180		    &object, tx));
6181	} else {
6182		VERIFY0(err);
6183	}
6184
6185	dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
6186	    &entry, tx);
6187}
6188
6189void
6190metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
6191{
6192	ms->ms_unflushed_txg = txg;
6193	metaslab_update_ondisk_flush_data(ms, tx);
6194}
6195
6196boolean_t
6197metaslab_unflushed_dirty(metaslab_t *ms)
6198{
6199	return (ms->ms_unflushed_dirty);
6200}
6201
6202uint64_t
6203metaslab_unflushed_txg(metaslab_t *ms)
6204{
6205	return (ms->ms_unflushed_txg);
6206}
6207
6208ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW,
6209	"Allocation granularity (a.k.a. stripe size)");
6210
6211ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
6212	"Load all metaslabs when pool is first opened");
6213
6214ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
6215	"Prevent metaslabs from being unloaded");
6216
6217ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
6218	"Preload potential metaslabs during reassessment");
6219
6220ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW,
6221	"Max number of metaslabs per group to preload");
6222
6223ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
6224	"Delay in txgs after metaslab was last used before unloading");
6225
6226ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW,
6227	"Delay in milliseconds after metaslab was last used before unloading");
6228
6229/* BEGIN CSTYLED */
6230ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW,
6231	"Percentage of metaslab group size that should be free to make it "
6232	"eligible for allocation");
6233
6234ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW,
6235	"Percentage of metaslab group size that should be considered eligible "
6236	"for allocations unless all metaslab groups within the metaslab class "
6237	"have also crossed this threshold");
6238
6239ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT,
6240	ZMOD_RW,
6241	"Use the fragmentation metric to prefer less fragmented metaslabs");
6242/* END CSTYLED */
6243
6244ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT,
6245	ZMOD_RW, "Fragmentation for metaslab to allow allocation");
6246
6247ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
6248	"Prefer metaslabs with lower LBAs");
6249
6250ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
6251	"Enable metaslab group biasing");
6252
6253ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
6254	ZMOD_RW, "Enable segment-based metaslab selection");
6255
6256ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
6257	"Segment-based metaslab selection maximum buckets before switching");
6258
6259ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
6260	"Blocks larger than this size are sometimes forced to be gang blocks");
6261
6262ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
6263	"Percentage of large blocks that will be forced to be gang blocks");
6264
6265ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
6266	"Max distance (bytes) to search forward before using size tree");
6267
6268ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
6269	"When looking in size tree, use largest segment instead of exact fit");
6270
6271ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
6272	ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");
6273
6274ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW,
6275	"Percentage of memory that can be used to store metaslab range trees");
6276
6277ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
6278	ZMOD_RW, "Try hard to allocate before ganging");
6279
6280ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
6281	"Normally only consider this many of the best metaslabs in each vdev");
6282
6283/* BEGIN CSTYLED */
6284ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator,
6285	param_set_active_allocator, param_get_charp, ZMOD_RW,
6286	"SPA active allocator");
6287/* END CSTYLED */
6288