/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");

#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
    &metaslab_gang_bang, 0,
    "Force gang block allocation for blocks larger than or equal to this value");

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space_map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;
TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct);
SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
    &zfs_condense_pct, 0,
    "Condense on-disk spacemap when it is more than this percentage"
    " of the in-memory counterpart");
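/*
 * Illustrative example: with the default zfs_condense_pct of 200, a space
 * map whose on-disk representation has grown to more than twice the size
 * of its in-core range tree representation becomes a candidate for
 * condensing, subject to the block threshold described below.
 */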

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;
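/*
 * For example, assuming ashift = 12 and a 4 KB space_map_blksize, the
 * block threshold of 4 means a space map is only condensed once its
 * uncondensed on-disk size exceeds roughly 16 KB; smaller maps would
 * likely occupy the same number of blocks either way.
 */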

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;
TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
    &zfs_mg_noalloc_threshold, 0,
    "Percentage of metaslab group size that should be free"
    " to make it eligible for allocation");

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;
TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_mg_fragmentation_threshold, 0,
    "Maximum fragmentation percentage for a metaslab group to remain eligible "
    "for allocations; a group above it is skipped unless all metaslab groups "
    "within the metaslab class have also crossed this threshold");

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;
TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold",
    &zfs_metaslab_fragmentation_threshold);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_metaslab_fragmentation_threshold, 0,
    "Maximum fragmentation percentage at which a metaslab keeps its active state");

/*
 * When set, all metaslabs are loaded when the pool is first opened.
 */
int metaslab_debug_load = 0;
TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
    &metaslab_debug_load, 0,
    "Load all metaslabs when pool is first opened");

/*
 * When set, metaslabs are never unloaded.
 */
int metaslab_debug_unload = 0;
TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
    &metaslab_debug_unload, 0,
    "Prevent metaslabs from being unloaded");

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold",
    &metaslab_df_alloc_threshold);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
    &metaslab_df_alloc_threshold, 0,
    "Minimum size which forces the dynamic allocator to change its allocation strategy");

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;
TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
    &metaslab_df_free_pct, 0,
    "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion");
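/*
 * For example, with the defaults above, metaslab_df_alloc() keeps using
 * first-fit while the metaslab's largest free segment is at least
 * SPA_OLD_MAXBLOCKSIZE and at least 4% of the metaslab is free; once
 * either condition no longer holds it falls back to best-fit from the
 * size-ordered tree.
 */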

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size",
    &metaslab_min_alloc_size);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
    &metaslab_min_alloc_size, 0,
    "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size");

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;
TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
    &metaslab_load_pct, 0,
    "Percentage of cpus that can be used by the metaslab taskq");

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;
TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
    &metaslab_unload_delay, 0,
    "Number of TXGs that an unused metaslab can be kept in memory");

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;
TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
    &metaslab_preload_limit, 0,
    "Max number of metaslabs per group to preload");

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
    &metaslab_preload_enabled, 0,
    "Enable/disable preloading of metaslabs");

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled",
    &metaslab_fragmentation_factor_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
    &metaslab_fragmentation_factor_enabled, 0,
    "Enable fragmentation weighting on metaslabs");

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled",
    &metaslab_lba_weighting_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
    &metaslab_lba_weighting_enabled, 0,
    "Enable LBA weighting (i.e. outer tracks are given preference)");

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.bias_enabled",
    &metaslab_bias_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
    &metaslab_bias_enabled, 0,
    "Enable metaslab group biasing");

static uint64_t metaslab_fragmentation(metaslab_t *);

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	refcount_create_tracked(&mc->mc_alloc_slots);

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	refcount_destroy(&mc->mc_alloc_slots);
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

void
metaslab_class_minblocksize_update(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;
	uint64_t minashift = UINT64_MAX;

	if ((mg = mc->mc_rotor) == NULL) {
		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
		return;
	}

	do {
		vd = mg->mg_vd;
		if (vd->vdev_ashift < minashift)
			minashift = vd->vdev_ashift;
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	mc->mc_minblocksize = 1ULL << minashift;
}
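/*
 * For example, a class whose rotor contains vdevs with ashift 9 and
 * ashift 12 ends up with mc_minblocksize = 512, since the smallest ashift
 * in the rotation determines the minimum block size reported for the
 * class.
 */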

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

uint64_t
metaslab_class_get_minblocksize(metaslab_class_t *mc)
{
	return (mc->mc_minblocksize);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}
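/*
 * For example, a class made up of two groups, one 1 TB group at 80%
 * fragmentation and one 3 TB group at 20%, yields a class-wide metric of
 * (80 * 1 + 20 * 3) / 4 = 35, reflecting the larger group's weight.
 */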

/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 */
		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
		    1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}
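/*
 * For example, a top-level vdev with 3.5 GB of not-yet-used asize and a
 * 1 GB metaslab size (vdev_ms_shift == 30) contributes 3 GB here; the
 * P2ALIGN() rounds the slack down to whole metaslabs because that is the
 * granularity at which the vdev can actually grow.
 */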

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_start < m2->ms_start)
		return (-1);
	if (m1->ms_start > m2->ms_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}
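/*
 * Note that the weight comparisons above are intentionally inverted, so
 * the AVL tree orders metaslabs from heaviest to lightest; avl_first()
 * on mg_metaslab_tree therefore yields the most desirable metaslab.
 */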

/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the capacity is above
 * the zfs_mg_noalloc_threshold and the fragmentation value is less than
 * or equal to zfs_mg_fragmentation_threshold. If a metaslab group
 * transitions from allocatable to non-allocatable or vice versa then the
 * metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations.  We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space and is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	refcount_create_tracked(&mg->mg_alloc_queue_depth);

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	refcount_destroy(&mg->mg_alloc_queue_depth);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	metaslab_class_minblocksize_update(mc);
}

void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	taskq_wait(mg->mg_taskq);
	metaslab_group_alloc_update(mg);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	metaslab_class_minblocksize_update(mc);
}

boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_sm == NULL)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
 * group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}
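/*
 * For example, a group of 16 metaslabs in which 10 report fragmentation
 * and the other 6 are still ZFS_FRAG_INVALID averages only the 10 valid
 * values; with 8 or fewer valid metaslabs the whole group would report
 * ZFS_FRAG_INVALID instead.
 */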

/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_max_alloc_queue_depth;

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		qdepth = refcount_count(&mg->mg_alloc_queue_depth);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_max_alloc_queue_depth;

			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;
	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	if (rs_size1 < rs_size2)
		return (-1);
	if (rs_size1 > rs_size2)
		return (1);

	if (r1->rs_start < r2->rs_start)
		return (-1);

	if (r1->rs_start > r2->rs_start)
		return (1);

	return (0);
}

/*
 * Create any block allocator specific components. The current allocators
 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 */
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT(msp->ms_tree == NULL);

	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

/*
 * Destroy the block allocator specific components.
 */
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	ASSERT0(avl_numnodes(&msp->ms_size_tree));

	avl_destroy(&msp->ms_size_tree);
}

static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_add(&msp->ms_size_tree, rs);
}

static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_remove(&msp->ms_size_tree, rs);
}

static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);

	/*
	 * Normally one would walk the tree freeing nodes along the way.
	 * Since the nodes are shared with the range trees we can avoid
	 * walking all nodes and just reinitialize the avl tree. The nodes
	 * will be freed by the range tree, so we don't want to free them here.
	 */
	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

static range_tree_ops_t metaslab_rt_ops = {
	metaslab_rt_create,
	metaslab_rt_destroy,
	metaslab_rt_add,
	metaslab_rt_remove,
	metaslab_rt_vacate
};

/*
 * ==========================================================================
 * Metaslab block operations
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
	avl_tree_t *t = &msp->ms_size_tree;
	range_seg_t *rs;

	if (t == NULL || (rs = avl_last(t)) == NULL)
		return (0ULL);

	return (rs->rs_end - rs->rs_start);
}

uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size)
{
	uint64_t start;
	range_tree_t *rt = msp->ms_tree;

	VERIFY(!msp->ms_condensing);

	start = msp->ms_ops->msop_alloc(msp, size);
	if (start != -1ULL) {
		vdev_t *vd = msp->ms_group->mg_vd;

		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
		range_tree_remove(rt, start, size);
	}
	return (start);
}

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	range_seg_t *rs, rsearch;
	avl_index_t where;

	rsearch.rs_start = *cursor;
	rsearch.rs_end = *cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL)
		rs = avl_nearest(t, where, AVL_AFTER);

	while (rs != NULL) {
		uint64_t offset = P2ROUNDUP(rs->rs_start, align);

		if (offset + size <= rs->rs_end) {
			*cursor = offset + size;
			return (offset);
		}
		rs = AVL_NEXT(t, rs);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}
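/*
 * For example, with size = 0x2000 and align = 0x2000, the walk starts at
 * the cursor position, rounds each candidate segment start up to a
 * 0x2000 boundary, and returns the first rounded offset whose end still
 * fits inside its segment, advancing the cursor past the allocation so
 * that the next call continues from there.
 */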

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket), but it does not guarantee that allocations of other sizes
	 * will not occur in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	avl_tree_t *t = &msp->ms_tree->rt_root;

	return (metaslab_block_picker(t, cursor, size, align));
}
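/*
 * For example, a request for size = 0x6000 (24 KB) has
 * align = 0x6000 & -0x6000 = 0x2000, so it shares the 8 KB-aligned cursor
 * bucket (ms_lbas[highbit64(0x2000) - 1]) with other requests whose sizes
 * are odd multiples of 8 KB.
 */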

static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket), but it does not guarantee that allocations of other sizes
	 * will not occur in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_tree;
	avl_tree_t *t = &rt->rt_root;
	uint64_t max_size = metaslab_block_maxsize(msp);
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = &msp->ms_size_tree;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}
1256209962Smm
1257262093Savgstatic metaslab_ops_t metaslab_df_ops = {
1258269773Sdelphij	metaslab_df_alloc
1259209962Smm};
1260209962Smm
1261211931Smm/*
1262211931Smm * ==========================================================================
1263262093Savg * Cursor fit block allocator -
1264262093Savg * Select the largest region in the metaslab, set the cursor to the beginning
1265262093Savg * of the range and the cursor_end to the end of the range. As allocations
1266262093Savg * are made advance the cursor. Continue allocating from the cursor until
1267262093Savg * the range is exhausted and then find a new range.
1268211931Smm * ==========================================================================
1269211931Smm */
1270211931Smmstatic uint64_t
1271262093Savgmetaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1272211931Smm{
1273262093Savg	range_tree_t *rt = msp->ms_tree;
1274262093Savg	avl_tree_t *t = &msp->ms_size_tree;
1275262093Savg	uint64_t *cursor = &msp->ms_lbas[0];
1276262093Savg	uint64_t *cursor_end = &msp->ms_lbas[1];
1277211931Smm	uint64_t offset = 0;
1278209962Smm
1279262093Savg	ASSERT(MUTEX_HELD(&msp->ms_lock));
1280262093Savg	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1281211931Smm
1282262093Savg	ASSERT3U(*cursor_end, >=, *cursor);
1283211931Smm
1284262093Savg	if ((*cursor + size) > *cursor_end) {
1285262093Savg		range_seg_t *rs;
1286211931Smm
1287262093Savg		rs = avl_last(&msp->ms_size_tree);
1288262093Savg		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1289262093Savg			return (-1ULL);
1290211931Smm
1291262093Savg		*cursor = rs->rs_start;
1292262093Savg		*cursor_end = rs->rs_end;
1293262093Savg	}
1294211931Smm
1295262093Savg	offset = *cursor;
1296262093Savg	*cursor += size;
1297262093Savg
1298211931Smm	return (offset);
1299211931Smm}
1300211931Smm
1301262093Savgstatic metaslab_ops_t metaslab_cf_ops = {
1302269773Sdelphij	metaslab_cf_alloc
1303211931Smm};
1304211931Smm
1305262093Savg/*
1306262093Savg * ==========================================================================
1307262093Savg * New dynamic fit allocator -
1308262093Savg * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1309262093Savg * contiguous blocks. If no region is found then just use the largest segment
1310262093Savg * that remains.
1311262093Savg * ==========================================================================
1312262093Savg */
1313262093Savg
1314262093Savg/*
1315262093Savg * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1316262093Savg * to request from the allocator.
1317262093Savg */
1318211931Smmuint64_t metaslab_ndf_clump_shift = 4;
1319211931Smm
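/*
 * Worked example (assuming the default clump shift of 4): for an 8K request,
 * hbit = highbit64(8192) = 14, so when the cursor lookup fails the fallback
 * search below looks in the size-sorted tree for a segment of about
 * MIN(max_size, 1ULL << (14 + 4)) = 256K, i.e. on the order of
 * 2^metaslab_ndf_clump_shift contiguous blocks of this size.
 */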
1320211931Smmstatic uint64_t
1321262093Savgmetaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1322211931Smm{
1323262093Savg	avl_tree_t *t = &msp->ms_tree->rt_root;
1324211931Smm	avl_index_t where;
1325262093Savg	range_seg_t *rs, rsearch;
1326265740Sdelphij	uint64_t hbit = highbit64(size);
1327262093Savg	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1328262093Savg	uint64_t max_size = metaslab_block_maxsize(msp);
1329211931Smm
1330262093Savg	ASSERT(MUTEX_HELD(&msp->ms_lock));
1331262093Savg	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1332211931Smm
1333211931Smm	if (max_size < size)
1334211931Smm		return (-1ULL);
1335211931Smm
1336262093Savg	rsearch.rs_start = *cursor;
1337262093Savg	rsearch.rs_end = *cursor + size;
1338211931Smm
1339262093Savg	rs = avl_find(t, &rsearch, &where);
1340262093Savg	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1341262093Savg		t = &msp->ms_size_tree;
1342211931Smm
1343262093Savg		rsearch.rs_start = 0;
1344262093Savg		rsearch.rs_end = MIN(max_size,
1345211931Smm		    1ULL << (hbit + metaslab_ndf_clump_shift));
1346262093Savg		rs = avl_find(t, &rsearch, &where);
1347262093Savg		if (rs == NULL)
1348262093Savg			rs = avl_nearest(t, where, AVL_AFTER);
1349262093Savg		ASSERT(rs != NULL);
1350211931Smm	}
1351211931Smm
1352262093Savg	if ((rs->rs_end - rs->rs_start) >= size) {
1353262093Savg		*cursor = rs->rs_start + size;
1354262093Savg		return (rs->rs_start);
1355211931Smm	}
1356211931Smm	return (-1ULL);
1357211931Smm}
1358211931Smm
1359262093Savgstatic metaslab_ops_t metaslab_ndf_ops = {
1360269773Sdelphij	metaslab_ndf_alloc
1361211931Smm};
1362211931Smm
1363262093Savgmetaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1364211931Smm
1365209962Smm/*
1366168404Spjd * ==========================================================================
1367168404Spjd * Metaslabs
1368168404Spjd * ==========================================================================
1369168404Spjd */
1370262093Savg
1371262093Savg/*
1372262093Savg * Wait for any in-progress metaslab loads to complete.
1373262093Savg */
1374262093Savgvoid
1375262093Savgmetaslab_load_wait(metaslab_t *msp)
1376262093Savg{
1377262093Savg	ASSERT(MUTEX_HELD(&msp->ms_lock));
1378262093Savg
1379262093Savg	while (msp->ms_loading) {
1380262093Savg		ASSERT(!msp->ms_loaded);
1381262093Savg		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1382262093Savg	}
1383262093Savg}
1384262093Savg
1385262093Savgint
1386262093Savgmetaslab_load(metaslab_t *msp)
1387262093Savg{
1388262093Savg	int error = 0;
1389262093Savg
1390262093Savg	ASSERT(MUTEX_HELD(&msp->ms_lock));
1391262093Savg	ASSERT(!msp->ms_loaded);
1392262093Savg	ASSERT(!msp->ms_loading);
1393262093Savg
1394262093Savg	msp->ms_loading = B_TRUE;
1395262093Savg
1396262093Savg	/*
1397262093Savg	 * If the space map has not been allocated yet, then treat
1398262093Savg	 * all the space in the metaslab as free and add it to the
1399262093Savg	 * ms_tree.
1400262093Savg	 */
1401262093Savg	if (msp->ms_sm != NULL)
1402262093Savg		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1403262093Savg	else
1404262093Savg		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1405262093Savg
1406262093Savg	msp->ms_loaded = (error == 0);
1407262093Savg	msp->ms_loading = B_FALSE;
1408262093Savg
1409262093Savg	if (msp->ms_loaded) {
1410262093Savg		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1411262093Savg			range_tree_walk(msp->ms_defertree[t],
1412262093Savg			    range_tree_remove, msp->ms_tree);
1413262093Savg		}
1414262093Savg	}
1415262093Savg	cv_broadcast(&msp->ms_load_cv);
1416262093Savg	return (error);
1417262093Savg}
1418262093Savg
1419262093Savgvoid
1420262093Savgmetaslab_unload(metaslab_t *msp)
1421262093Savg{
1422262093Savg	ASSERT(MUTEX_HELD(&msp->ms_lock));
1423262093Savg	range_tree_vacate(msp->ms_tree, NULL, NULL);
1424262093Savg	msp->ms_loaded = B_FALSE;
1425262093Savg	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1426262093Savg}
1427262093Savg
1428277553Sdelphijint
1429277553Sdelphijmetaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1430277553Sdelphij    metaslab_t **msp)
1431168404Spjd{
1432168404Spjd	vdev_t *vd = mg->mg_vd;
1433262093Savg	objset_t *mos = vd->vdev_spa->spa_meta_objset;
1434277553Sdelphij	metaslab_t *ms;
1435277553Sdelphij	int error;
1436168404Spjd
1437277553Sdelphij	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1438277553Sdelphij	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1439277553Sdelphij	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1440277553Sdelphij	ms->ms_id = id;
1441277553Sdelphij	ms->ms_start = id << vd->vdev_ms_shift;
1442277553Sdelphij	ms->ms_size = 1ULL << vd->vdev_ms_shift;
1443168404Spjd
1444262093Savg	/*
1445262093Savg	 * We only open space map objects that already exist. All others
1446262093Savg	 * will be opened when we finally allocate an object for them.
1447262093Savg	 */
1448262093Savg	if (object != 0) {
1449277553Sdelphij		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1450277553Sdelphij		    ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
1451277553Sdelphij
1452277553Sdelphij		if (error != 0) {
1453277553Sdelphij			kmem_free(ms, sizeof (metaslab_t));
1454277553Sdelphij			return (error);
1455277553Sdelphij		}
1456277553Sdelphij
1457277553Sdelphij		ASSERT(ms->ms_sm != NULL);
1458262093Savg	}
1459168404Spjd
1460168404Spjd	/*
1461262093Savg	 * We create the main range tree here, but we don't create the
1462262093Savg	 * alloctree and freetree until metaslab_sync_done().  This serves
1463168404Spjd	 * two purposes: it allows metaslab_sync_done() to detect the
1464168404Spjd	 * addition of new space; and for debugging, it ensures that we'd
1465168404Spjd	 * data fault on any attempt to use this metaslab before it's ready.
1466168404Spjd	 */
1467277553Sdelphij	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
1468277553Sdelphij	metaslab_group_add(mg, ms);
1469168404Spjd
1470277553Sdelphij	ms->ms_fragmentation = metaslab_fragmentation(ms);
1471277553Sdelphij	ms->ms_ops = mg->mg_class->mc_ops;
1472219089Spjd
1473168404Spjd	/*
1474168404Spjd	 * If we're opening an existing pool (txg == 0) or creating
1475168404Spjd	 * a new one (txg == TXG_INITIAL), all space is available now.
1476168404Spjd	 * If we're adding space to an existing pool, the new space
1477168404Spjd	 * does not become available until after this txg has synced.
1478168404Spjd	 */
1479168404Spjd	if (txg <= TXG_INITIAL)
1480277553Sdelphij		metaslab_sync_done(ms, 0);
1481168404Spjd
1482262093Savg	/*
1483262093Savg	 * If metaslab_debug_load is set and we're initializing a metaslab
1484262093Savg	 * that has an allocated space_map object, then load its space
1485262093Savg	 * map so that we can verify frees.
1486262093Savg	 */
1487277553Sdelphij	if (metaslab_debug_load && ms->ms_sm != NULL) {
1488277553Sdelphij		mutex_enter(&ms->ms_lock);
1489277553Sdelphij		VERIFY0(metaslab_load(ms));
1490277553Sdelphij		mutex_exit(&ms->ms_lock);
1491262093Savg	}
1492262093Savg
1493168404Spjd	if (txg != 0) {
1494168404Spjd		vdev_dirty(vd, 0, NULL, txg);
1495277553Sdelphij		vdev_dirty(vd, VDD_METASLAB, ms, txg);
1496168404Spjd	}
1497168404Spjd
1498277553Sdelphij	*msp = ms;
1499277553Sdelphij
1500277553Sdelphij	return (0);
1501168404Spjd}
1502168404Spjd
1503168404Spjdvoid
1504168404Spjdmetaslab_fini(metaslab_t *msp)
1505168404Spjd{
1506168404Spjd	metaslab_group_t *mg = msp->ms_group;
1507168404Spjd
1508168404Spjd	metaslab_group_remove(mg, msp);
1509168404Spjd
1510168404Spjd	mutex_enter(&msp->ms_lock);
1511168404Spjd
1512262093Savg	VERIFY(msp->ms_group == NULL);
1513262093Savg	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1514262093Savg	    0, -msp->ms_size);
1515262093Savg	space_map_close(msp->ms_sm);
1516168404Spjd
1517262093Savg	metaslab_unload(msp);
1518262093Savg	range_tree_destroy(msp->ms_tree);
1519262093Savg
1520219089Spjd	for (int t = 0; t < TXG_SIZE; t++) {
1521262093Savg		range_tree_destroy(msp->ms_alloctree[t]);
1522262093Savg		range_tree_destroy(msp->ms_freetree[t]);
1523168404Spjd	}
1524168404Spjd
1525247398Smm	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1526262093Savg		range_tree_destroy(msp->ms_defertree[t]);
1527247398Smm	}
1528219089Spjd
1529240415Smm	ASSERT0(msp->ms_deferspace);
1530219089Spjd
1531168404Spjd	mutex_exit(&msp->ms_lock);
1532262093Savg	cv_destroy(&msp->ms_load_cv);
1533168404Spjd	mutex_destroy(&msp->ms_lock);
1534168404Spjd
1535168404Spjd	kmem_free(msp, sizeof (metaslab_t));
1536168404Spjd}
1537168404Spjd
1538269773Sdelphij#define	FRAGMENTATION_TABLE_SIZE	17
1539269773Sdelphij
1540262093Savg/*
1541269773Sdelphij * This table defines a segment size based fragmentation metric that will
1542269773Sdelphij * allow each metaslab to derive its own fragmentation value. This is done
1543269773Sdelphij * by calculating the space in each bucket of the spacemap histogram and
1544269773Sdelphij * multiplying that by the fragmentation metric in this table. Doing
1545269773Sdelphij * this for all buckets and dividing it by the total amount of free
1546269773Sdelphij * space in this metaslab (i.e. the total free space in all buckets) gives
1547269773Sdelphij * us the fragmentation metric. This means that a high fragmentation metric
1548269773Sdelphij * equates to most of the free space being comprised of small segments.
1549269773Sdelphij * Conversely, if the metric is low, then most of the free space is in
1550269773Sdelphij * large segments. A 10% change in fragmentation equates to approximately
1551269773Sdelphij * double the number of segments.
1552262093Savg *
1553269773Sdelphij * This table defines 0% fragmented space using 16MB segments. Testing has
1554269773Sdelphij * shown that segments that are greater than or equal to 16MB do not suffer
1555269773Sdelphij * from drastic performance problems. Using this value, we derive the rest
1556269773Sdelphij * of the table. Since the fragmentation value is never stored on disk, it
1557269773Sdelphij * is possible to change these calculations in the future.
1558262093Savg */
1559269773Sdelphijint zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1560269773Sdelphij	100,	/* 512B	*/
1561269773Sdelphij	100,	/* 1K	*/
1562269773Sdelphij	98,	/* 2K	*/
1563269773Sdelphij	95,	/* 4K	*/
1564269773Sdelphij	90,	/* 8K	*/
1565269773Sdelphij	80,	/* 16K	*/
1566269773Sdelphij	70,	/* 32K	*/
1567269773Sdelphij	60,	/* 64K	*/
1568269773Sdelphij	50,	/* 128K	*/
1569269773Sdelphij	40,	/* 256K	*/
1570269773Sdelphij	30,	/* 512K	*/
1571269773Sdelphij	20,	/* 1M	*/
1572269773Sdelphij	15,	/* 2M	*/
1573269773Sdelphij	10,	/* 4M	*/
1574269773Sdelphij	5,	/* 8M	*/
1575269773Sdelphij	0	/* 16M	*/
1576269773Sdelphij};
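
/*
 * Worked example (hypothetical histogram, not measured data): suppose a
 * metaslab's free space consists of 100MB in 8K segments and 100MB in 1M
 * segments. Using the table above, the weighted metric is
 * (100MB * 90 + 100MB * 20) / 200MB = 55, i.e. 55% fragmented. If all 200MB
 * were instead in segments of 16MB or larger, the metric would be 0.
 */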
1577269773Sdelphij
1578269773Sdelphij/*
1579269773Sdelphij * Calculate the metaslab's fragmentation metric. A return value
1580269773Sdelphij * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1581269773Sdelphij * not support this metric. Otherwise, the return value should be in the
1582269773Sdelphij * range [0, 100].
1583269773Sdelphij */
1584262093Savgstatic uint64_t
1585269773Sdelphijmetaslab_fragmentation(metaslab_t *msp)
1586262093Savg{
1587269773Sdelphij	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1588269773Sdelphij	uint64_t fragmentation = 0;
1589269773Sdelphij	uint64_t total = 0;
1590269773Sdelphij	boolean_t feature_enabled = spa_feature_is_enabled(spa,
1591269773Sdelphij	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
1592168404Spjd
1593269773Sdelphij	if (!feature_enabled)
1594269773Sdelphij		return (ZFS_FRAG_INVALID);
1595269773Sdelphij
1596262093Savg	/*
1597269773Sdelphij	 * A null space map means that the entire metaslab is free
1598269773Sdelphij	 * and thus is not fragmented.
1599262093Savg	 */
1600269773Sdelphij	if (msp->ms_sm == NULL)
1601269773Sdelphij		return (0);
1602269773Sdelphij
1603269773Sdelphij	/*
1604269773Sdelphij	 * If this metaslab's space_map has not been upgraded, flag it
1605269773Sdelphij	 * so that we upgrade next time we encounter it.
1606269773Sdelphij	 */
1607269773Sdelphij	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1608269773Sdelphij		uint64_t txg = spa_syncing_txg(spa);
1609262093Savg		vdev_t *vd = msp->ms_group->mg_vd;
1610262093Savg
1611273341Sdelphij		if (spa_writeable(spa)) {
1612273341Sdelphij			msp->ms_condense_wanted = B_TRUE;
1613273341Sdelphij			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1614273341Sdelphij			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1615273341Sdelphij			    "msp %p, vd %p", txg, msp, vd);
1616273341Sdelphij		}
1617269773Sdelphij		return (ZFS_FRAG_INVALID);
1618262093Savg	}
1619262093Savg
1620269773Sdelphij	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1621269773Sdelphij		uint64_t space = 0;
1622269773Sdelphij		uint8_t shift = msp->ms_sm->sm_shift;
1623269773Sdelphij		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1624269773Sdelphij		    FRAGMENTATION_TABLE_SIZE - 1);
1625262093Savg
1626262093Savg		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1627262093Savg			continue;
1628262093Savg
1629269773Sdelphij		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1630269773Sdelphij		total += space;
1631269773Sdelphij
1632269773Sdelphij		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1633269773Sdelphij		fragmentation += space * zfs_frag_table[idx];
1634262093Savg	}
1635269773Sdelphij
1636269773Sdelphij	if (total > 0)
1637269773Sdelphij		fragmentation /= total;
1638269773Sdelphij	ASSERT3U(fragmentation, <=, 100);
1639269773Sdelphij	return (fragmentation);
1640262093Savg}
1641262093Savg
1642269773Sdelphij/*
1643269773Sdelphij * Compute a weight -- a selection preference value -- for the given metaslab.
1644269773Sdelphij * This is based on the amount of free space, the level of fragmentation,
1645269773Sdelphij * the LBA range, and whether the metaslab is loaded.
1646269773Sdelphij */
1647168404Spjdstatic uint64_t
1648168404Spjdmetaslab_weight(metaslab_t *msp)
1649168404Spjd{
1650168404Spjd	metaslab_group_t *mg = msp->ms_group;
1651168404Spjd	vdev_t *vd = mg->mg_vd;
1652168404Spjd	uint64_t weight, space;
1653168404Spjd
1654168404Spjd	ASSERT(MUTEX_HELD(&msp->ms_lock));
1655168404Spjd
1656168404Spjd	/*
1657247398Smm	 * This vdev is in the process of being removed so there is nothing
1658247398Smm	 * for us to do here.
1659247398Smm	 */
1660247398Smm	if (vd->vdev_removing) {
1661262093Savg		ASSERT0(space_map_allocated(msp->ms_sm));
1662247398Smm		ASSERT0(vd->vdev_ms_shift);
1663247398Smm		return (0);
1664247398Smm	}
1665247398Smm
1666247398Smm	/*
1667168404Spjd	 * The baseline weight is the metaslab's free space.
1668168404Spjd	 */
1669262093Savg	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1670269773Sdelphij
1671269773Sdelphij	msp->ms_fragmentation = metaslab_fragmentation(msp);
1672269773Sdelphij	if (metaslab_fragmentation_factor_enabled &&
1673269773Sdelphij	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1674269773Sdelphij		/*
1675269773Sdelphij		 * Use the fragmentation information to inversely scale
1676269773Sdelphij		 * down the baseline weight. We need to ensure that we
1677269773Sdelphij		 * don't exclude this metaslab completely when it's 100%
1678269773Sdelphij		 * fragmented. To avoid this we reduce the fragmented value
1679269773Sdelphij		 * by 1.
1680269773Sdelphij		 */
1681269773Sdelphij		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1682269773Sdelphij
1683269773Sdelphij		/*
1684269773Sdelphij		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1685269773Sdelphij		 * this metaslab again. The fragmentation metric may have
1686269773Sdelphij		 * decreased the space to something smaller than
1687269773Sdelphij		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1688269773Sdelphij		 * so that we can consume any remaining space.
1689269773Sdelphij		 */
1690269773Sdelphij		if (space > 0 && space < SPA_MINBLOCKSIZE)
1691269773Sdelphij			space = SPA_MINBLOCKSIZE;
1692269773Sdelphij	}
1693168404Spjd	weight = space;
1694168404Spjd
1695168404Spjd	/*
1696168404Spjd	 * Modern disks have uniform bit density and constant angular velocity.
1697168404Spjd	 * Therefore, the outer recording zones are faster (higher bandwidth)
1698168404Spjd	 * than the inner zones by the ratio of outer to inner track diameter,
1699168404Spjd	 * which is typically around 2:1.  We account for this by assigning
1700168404Spjd	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1701168404Spjd	 * In effect, this means that we'll select the metaslab with the most
1702168404Spjd	 * free bandwidth rather than simply the one with the most free space.
1703168404Spjd	 */
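	/*
	 * For example (illustrative numbers): with vdev_ms_count = 200,
	 * metaslab 0 gets weight = 2 * space - (0 * space) / 200 = 2x its
	 * free space, metaslab 100 gets roughly 1.5x, and metaslab 199 gets
	 * just over 1x, matching the 2:1 outer-to-inner bandwidth ratio
	 * described above.
	 */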
1704269773Sdelphij	if (metaslab_lba_weighting_enabled) {
1705269773Sdelphij		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1706269773Sdelphij		ASSERT(weight >= space && weight <= 2 * space);
1707269773Sdelphij	}
1708168404Spjd
1709269773Sdelphij	/*
1710269773Sdelphij	 * If this metaslab is one we're actively using, adjust its
1711269773Sdelphij	 * weight to make it preferable to any inactive metaslab so
1712269773Sdelphij	 * we'll polish it off. If the fragmentation on this metaslab
1713269773Sdelphij	 * has exceeded our threshold, then don't mark it active.
1714269773Sdelphij	 */
1715269773Sdelphij	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1716269773Sdelphij	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1717211931Smm		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1718211931Smm	}
1719262093Savg
1720211931Smm	return (weight);
1721211931Smm}
1722211931Smm
1723168404Spjdstatic int
1724224177Smmmetaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1725168404Spjd{
1726168404Spjd	ASSERT(MUTEX_HELD(&msp->ms_lock));
1727168404Spjd
1728168404Spjd	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1729262093Savg		metaslab_load_wait(msp);
1730262093Savg		if (!msp->ms_loaded) {
1731262093Savg			int error = metaslab_load(msp);
1732262093Savg			if (error) {
1733219089Spjd				metaslab_group_sort(msp->ms_group, msp, 0);
1734219089Spjd				return (error);
1735219089Spjd			}
1736168404Spjd		}
1737209962Smm
1738168404Spjd		metaslab_group_sort(msp->ms_group, msp,
1739168404Spjd		    msp->ms_weight | activation_weight);
1740168404Spjd	}
1741262093Savg	ASSERT(msp->ms_loaded);
1742168404Spjd	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1743168404Spjd
1744168404Spjd	return (0);
1745168404Spjd}
1746168404Spjd
1747168404Spjdstatic void
1748168404Spjdmetaslab_passivate(metaslab_t *msp, uint64_t size)
1749168404Spjd{
1750168404Spjd	/*
1751168404Spjd	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1752168404Spjd	 * this metaslab again.  In that case, it had better be empty,
1753168404Spjd	 * or we would be leaving space on the table.
1754168404Spjd	 */
1755262093Savg	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1756168404Spjd	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1757168404Spjd	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1758168404Spjd}
1759168404Spjd
1760262093Savgstatic void
1761262093Savgmetaslab_preload(void *arg)
1762262093Savg{
1763262093Savg	metaslab_t *msp = arg;
1764262093Savg	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1765262093Savg
1766268656Sdelphij	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1767268656Sdelphij
1768262093Savg	mutex_enter(&msp->ms_lock);
1769262093Savg	metaslab_load_wait(msp);
1770262093Savg	if (!msp->ms_loaded)
1771262093Savg		(void) metaslab_load(msp);
1772262093Savg
1773262093Savg	/*
1774262093Savg	 * Set the ms_access_txg value so that we don't unload it right away.
1775262093Savg	 */
1776262093Savg	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1777262093Savg	mutex_exit(&msp->ms_lock);
1778262093Savg}
1779262093Savg
1780262093Savgstatic void
1781262093Savgmetaslab_group_preload(metaslab_group_t *mg)
1782262093Savg{
1783262093Savg	spa_t *spa = mg->mg_vd->vdev_spa;
1784262093Savg	metaslab_t *msp;
1785262093Savg	avl_tree_t *t = &mg->mg_metaslab_tree;
1786262093Savg	int m = 0;
1787262093Savg
1788262093Savg	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1789262093Savg		taskq_wait(mg->mg_taskq);
1790262093Savg		return;
1791262093Savg	}
1792268656Sdelphij
1793262093Savg	mutex_enter(&mg->mg_lock);
1794262093Savg	/*
1795268656Sdelphij	 * Load the next potential metaslabs
1796262093Savg	 */
1797268656Sdelphij	msp = avl_first(t);
1798268656Sdelphij	while (msp != NULL) {
1799268656Sdelphij		metaslab_t *msp_next = AVL_NEXT(t, msp);
1800262093Savg
1801269773Sdelphij		/*
1802269773Sdelphij		 * We preload only the maximum number of metaslabs specified
1803269773Sdelphij		 * by metaslab_preload_limit. If a metaslab is being forced
1804269773Sdelphij		 * to condense then we preload it too. This will ensure
1805269773Sdelphij		 * that force condensing happens in the next txg.
1806269773Sdelphij		 */
1807269773Sdelphij		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1808269773Sdelphij			msp = msp_next;
1809269773Sdelphij			continue;
1810269773Sdelphij		}
1811262093Savg
1812268656Sdelphij		/*
1813268656Sdelphij		 * We must drop the metaslab group lock here to preserve
1814268656Sdelphij		 * lock ordering with the ms_lock (when grabbing both
1815268656Sdelphij		 * the mg_lock and the ms_lock, the ms_lock must be taken
1816268656Sdelphij		 * first).  As a result, it is possible that the ordering
1817268656Sdelphij		 * of the metaslabs within the avl tree may change before
1818268656Sdelphij		 * we reacquire the lock. The metaslab cannot be removed from
1819268656Sdelphij		 * the tree while we're in syncing context so it is safe to
1820268656Sdelphij		 * drop the mg_lock here. If the metaslabs are reordered
1821268656Sdelphij		 * nothing will break -- we just may end up loading a
1822268656Sdelphij		 * less than optimal one.
1823268656Sdelphij		 */
1824268656Sdelphij		mutex_exit(&mg->mg_lock);
1825262093Savg		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1826262093Savg		    msp, TQ_SLEEP) != 0);
1827268656Sdelphij		mutex_enter(&mg->mg_lock);
1828268656Sdelphij		msp = msp_next;
1829262093Savg	}
1830262093Savg	mutex_exit(&mg->mg_lock);
1831262093Savg}
1832262093Savg
1833168404Spjd/*
1834262093Savg * Determine if the space map's on-disk footprint is past our tolerance
1835262093Savg * for inefficiency. We would like to use the following criteria to make
1836262093Savg * our decision:
1837247398Smm *
1838247398Smm * 1. The size of the space map object should not dramatically increase as a
1839262093Savg * result of writing out the free space range tree.
1840247398Smm *
1841247398Smm * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1842262093Savg * times the size of the free space range tree representation
1843262093Savg * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
1844247398Smm *
1845269416Sdelphij * 3. The on-disk size of the space map should actually decrease.
1846269416Sdelphij *
1847247398Smm * Checking the first condition is tricky since we don't want to walk
1848247398Smm * the entire AVL tree calculating the estimated on-disk size. Instead we
1849262093Savg * use the size-ordered range tree in the metaslab and calculate the
1850262093Savg * size required to write out the largest segment in our free tree. If the
1851247398Smm * size required to represent that segment on disk is larger than the space
1852247398Smm * map object then we avoid condensing this map.
1853247398Smm *
1854247398Smm * To determine the second criterion we use a best-case estimate and assume
1855247398Smm * each segment can be represented on-disk as a single 64-bit entry. We refer
1856247398Smm * to this best-case estimate as the space map's minimal form.
1857269416Sdelphij *
1858269416Sdelphij * Unfortunately, we cannot compute the on-disk size of the space map in this
1859269416Sdelphij * context because we cannot accurately compute the effects of compression, etc.
1860269416Sdelphij * Instead, we apply the heuristic described in the block comment for
1861269416Sdelphij * zfs_metaslab_condense_block_threshold - we only condense if the space used
1862269416Sdelphij * is greater than a threshold number of blocks.
1863247398Smm */
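/*
 * Worked example (hypothetical values): with zfs_condense_pct = 200, a
 * metaslab whose free range tree holds 1000 segments has an optimal
 * (minimal) size of 1000 * 8 bytes = 8000 bytes. If the current space map
 * object is 20000 bytes long, it exceeds 2x the minimal form (16000 bytes),
 * so the metaslab is condensed, provided the largest-segment and
 * block-threshold checks below also pass.
 */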
1864247398Smmstatic boolean_t
1865247398Smmmetaslab_should_condense(metaslab_t *msp)
1866247398Smm{
1867262093Savg	space_map_t *sm = msp->ms_sm;
1868262093Savg	range_seg_t *rs;
1869269416Sdelphij	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
1870269416Sdelphij	dmu_object_info_t doi;
1871269416Sdelphij	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1872247398Smm
1873247398Smm	ASSERT(MUTEX_HELD(&msp->ms_lock));
1874262093Savg	ASSERT(msp->ms_loaded);
1875247398Smm
1876247398Smm	/*
1877262093Savg	 * Use the ms_size_tree range tree, which is ordered by size, to
1878269773Sdelphij	 * obtain the largest segment in the free tree. We always condense
1879269773Sdelphij	 * metaslabs that are empty and metaslabs for which a condense
1880269773Sdelphij	 * request has been made.
1881247398Smm	 */
1882262093Savg	rs = avl_last(&msp->ms_size_tree);
1883269773Sdelphij	if (rs == NULL || msp->ms_condense_wanted)
1884247398Smm		return (B_TRUE);
1885247398Smm
1886247398Smm	/*
1887247398Smm	 * Calculate the number of 64-bit entries this segment would
1888247398Smm	 * require when written to disk. If this single segment would be
1889247398Smm	 * larger on-disk than the entire current on-disk structure, then
1890247398Smm	 * clearly condensing will increase the on-disk structure size.
1891247398Smm	 */
1892262093Savg	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1893247398Smm	entries = size / (MIN(size, SM_RUN_MAX));
1894247398Smm	segsz = entries * sizeof (uint64_t);
1895247398Smm
1896269416Sdelphij	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
1897269416Sdelphij	object_size = space_map_length(msp->ms_sm);
1898269416Sdelphij
1899269416Sdelphij	dmu_object_info_from_db(sm->sm_dbuf, &doi);
1900269416Sdelphij	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
1901269416Sdelphij
1902269416Sdelphij	return (segsz <= object_size &&
1903269416Sdelphij	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
1904269416Sdelphij	    object_size > zfs_metaslab_condense_block_threshold * record_size);
1905247398Smm}
1906247398Smm
1907247398Smm/*
1908247398Smm * Condense the on-disk space map representation to its minimized form.
1909247398Smm * The minimized form consists of a small number of allocations followed by
1910262093Savg * the entries of the free range tree.
1911247398Smm */
1912247398Smmstatic void
1913247398Smmmetaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1914247398Smm{
1915247398Smm	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1916262093Savg	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1917262093Savg	range_tree_t *condense_tree;
1918262093Savg	space_map_t *sm = msp->ms_sm;
1919247398Smm
1920247398Smm	ASSERT(MUTEX_HELD(&msp->ms_lock));
1921247398Smm	ASSERT3U(spa_sync_pass(spa), ==, 1);
1922262093Savg	ASSERT(msp->ms_loaded);
1923247398Smm
1925290753Smav	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
1926290753Smav	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
1927290753Smav	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
1928290753Smav	    msp->ms_group->mg_vd->vdev_spa->spa_name,
1929290753Smav	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
1930269773Sdelphij	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
1931247398Smm
1932269773Sdelphij	msp->ms_condense_wanted = B_FALSE;
1933269773Sdelphij
1934247398Smm	/*
1935262093Savg	 * Create a range tree that is 100% allocated. We remove segments
1936247398Smm	 * that have been freed in this txg, any deferred frees that exist,
1937247398Smm	 * and any allocation in the future. Removing segments should be
1938262093Savg	 * a relatively inexpensive operation since we expect these trees to
1939262093Savg	 * have a small number of nodes.
1940247398Smm	 */
1941262093Savg	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1942262093Savg	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1943247398Smm
1944247398Smm	/*
1945262093Savg	 * Remove what's been freed in this txg from the condense_tree.
1946247398Smm	 * Since we're in sync_pass 1, we know that all the frees from
1947262093Savg	 * this txg are in the freetree.
1948247398Smm	 */
1949262093Savg	range_tree_walk(freetree, range_tree_remove, condense_tree);
1950247398Smm
1951262093Savg	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1952262093Savg		range_tree_walk(msp->ms_defertree[t],
1953262093Savg		    range_tree_remove, condense_tree);
1954262093Savg	}
1955247398Smm
1956262093Savg	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1957262093Savg		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1958262093Savg		    range_tree_remove, condense_tree);
1959262093Savg	}
1960247398Smm
1961247398Smm	/*
1962247398Smm	 * We're about to drop the metaslab's lock, thus allowing
1963247398Smm	 * other consumers to change its content. Set the
1964262093Savg	 * metaslab's ms_condensing flag to ensure that
1965247398Smm	 * allocations on this metaslab do not occur while we're
1966247398Smm	 * in the middle of committing it to disk. This is only critical
1967262093Savg	 * for the ms_tree as all other range trees use per txg
1968247398Smm	 * views of their content.
1969247398Smm	 */
1970262093Savg	msp->ms_condensing = B_TRUE;
1971247398Smm
1972247398Smm	mutex_exit(&msp->ms_lock);
1973262093Savg	space_map_truncate(sm, tx);
1974247398Smm	mutex_enter(&msp->ms_lock);
1975247398Smm
1976247398Smm	/*
1977247398Smm	 * While we would ideally like to create a space_map representation
1978247398Smm	 * that consists only of allocation records, doing so can be
1979262093Savg	 * prohibitively expensive because the in-core free tree can be
1980247398Smm	 * large, and therefore computationally expensive to subtract
1981262093Savg	 * from the condense_tree. Instead we sync out two trees, a cheap
1982262093Savg	 * allocation only tree followed by the in-core free tree. While not
1983247398Smm	 * optimal, this is typically close to optimal, and much cheaper to
1984247398Smm	 * compute.
1985247398Smm	 */
1986262093Savg	space_map_write(sm, condense_tree, SM_ALLOC, tx);
1987262093Savg	range_tree_vacate(condense_tree, NULL, NULL);
1988262093Savg	range_tree_destroy(condense_tree);
1989247398Smm
1990262093Savg	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1991262093Savg	msp->ms_condensing = B_FALSE;
1992247398Smm}
1993247398Smm
1994247398Smm/*
1995168404Spjd * Write a metaslab to disk in the context of the specified transaction group.
1996168404Spjd */
1997168404Spjdvoid
1998168404Spjdmetaslab_sync(metaslab_t *msp, uint64_t txg)
1999168404Spjd{
2000262093Savg	metaslab_group_t *mg = msp->ms_group;
2001262093Savg	vdev_t *vd = mg->mg_vd;
2002168404Spjd	spa_t *spa = vd->vdev_spa;
2003219089Spjd	objset_t *mos = spa_meta_objset(spa);
2004262093Savg	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2005262093Savg	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
2006262093Savg	range_tree_t **freed_tree =
2007262093Savg	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
2008168404Spjd	dmu_tx_t *tx;
2009262093Savg	uint64_t object = space_map_object(msp->ms_sm);
2010168404Spjd
2011219089Spjd	ASSERT(!vd->vdev_ishole);
2012168404Spjd
2013247398Smm	/*
2014247398Smm	 * This metaslab has just been added so there's no work to do now.
2015247398Smm	 */
2016262093Savg	if (*freetree == NULL) {
2017262093Savg		ASSERT3P(alloctree, ==, NULL);
2018219089Spjd		return;
2019247398Smm	}
2020219089Spjd
2021262093Savg	ASSERT3P(alloctree, !=, NULL);
2022262093Savg	ASSERT3P(*freetree, !=, NULL);
2023262093Savg	ASSERT3P(*freed_tree, !=, NULL);
2024247398Smm
2025269773Sdelphij	/*
2026269773Sdelphij	 * Normally, we don't want to process a metaslab if there
2027269773Sdelphij	 * are no allocations or frees to perform. However, if the metaslab
2028269773Sdelphij	 * is being forced to condense we need to let it through.
2029269773Sdelphij	 */
2030262093Savg	if (range_tree_space(alloctree) == 0 &&
2031269773Sdelphij	    range_tree_space(*freetree) == 0 &&
2032269773Sdelphij	    !msp->ms_condense_wanted)
2033247398Smm		return;
2034247398Smm
2035168404Spjd	/*
2036168404Spjd	 * The only state that can actually be changing concurrently with
2037262093Savg	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
2038262093Savg	 * be modifying this txg's alloctree, freetree, freed_tree, or
2039262093Savg	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
2040262093Savg	 * space_map ASSERTs. We drop it whenever we call into the DMU,
2041262093Savg	 * because the DMU can call down to us (e.g. via zio_free()) at
2042262093Savg	 * any time.
2043168404Spjd	 */
2044168404Spjd
2045219089Spjd	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2046219089Spjd
2047262093Savg	if (msp->ms_sm == NULL) {
2048262093Savg		uint64_t new_object;
2049262093Savg
2050262093Savg		new_object = space_map_alloc(mos, tx);
2051262093Savg		VERIFY3U(new_object, !=, 0);
2052262093Savg
2053262093Savg		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2054262093Savg		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
2055262093Savg		    &msp->ms_lock));
2056262093Savg		ASSERT(msp->ms_sm != NULL);
2057168404Spjd	}
2058168404Spjd
2059219089Spjd	mutex_enter(&msp->ms_lock);
2060219089Spjd
2061273341Sdelphij	/*
2062273341Sdelphij	 * Note: metaslab_condense() clears the space_map's histogram.
2063273341Sdelphij	 * Therefore we must verify and remove this histogram before
2064273341Sdelphij	 * condensing.
2065273341Sdelphij	 */
2066273341Sdelphij	metaslab_group_histogram_verify(mg);
2067273341Sdelphij	metaslab_class_histogram_verify(mg->mg_class);
2068273341Sdelphij	metaslab_group_histogram_remove(mg, msp);
2069273341Sdelphij
2070262093Savg	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2071247398Smm	    metaslab_should_condense(msp)) {
2072247398Smm		metaslab_condense(msp, txg, tx);
2073247398Smm	} else {
2074262093Savg		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2075262093Savg		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
2076247398Smm	}
2077168404Spjd
2078262093Savg	if (msp->ms_loaded) {
2079262093Savg		/*
2080262093Savg		 * When the space map is loaded, we have an accurate
2081262093Savg		 * histogram in the range tree. This gives us an opportunity
2082262093Savg		 * to bring the space map's histogram up-to-date so we clear
2083262093Savg		 * it first before updating it.
2084262093Savg		 */
2085262093Savg		space_map_histogram_clear(msp->ms_sm);
2086262093Savg		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2087262093Savg	} else {
2088262093Savg		/*
2089262093Savg		 * Since the space map is not loaded we simply update the
2090262093Savg		 * existing histogram with what was freed in this txg. This
2091262093Savg		 * means that the on-disk histogram may not have an accurate
2092262093Savg		 * view of the free space but it's close enough to allow
2093262093Savg		 * us to make allocation decisions.
2094262093Savg		 */
2095262093Savg		space_map_histogram_add(msp->ms_sm, *freetree, tx);
2096262093Savg	}
2097269773Sdelphij	metaslab_group_histogram_add(mg, msp);
2098269773Sdelphij	metaslab_group_histogram_verify(mg);
2099269773Sdelphij	metaslab_class_histogram_verify(mg->mg_class);
2100262093Savg
2101247398Smm	/*
2102262093Savg	 * For sync pass 1, we avoid traversing this txg's free range tree
2103262093Savg	 * and instead will just swap the pointers for freetree and
2104262093Savg	 * freed_tree. We can safely do this since the freed_tree is
2105247398Smm	 * guaranteed to be empty on the initial pass.
2106247398Smm	 */
2107247398Smm	if (spa_sync_pass(spa) == 1) {
2108262093Savg		range_tree_swap(freetree, freed_tree);
2109247398Smm	} else {
2110262093Savg		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
2111168404Spjd	}
2112269773Sdelphij	range_tree_vacate(alloctree, NULL, NULL);
2113168404Spjd
2114262093Savg	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2115262093Savg	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2116168404Spjd
2117168404Spjd	mutex_exit(&msp->ms_lock);
2118168404Spjd
2119262093Savg	if (object != space_map_object(msp->ms_sm)) {
2120262093Savg		object = space_map_object(msp->ms_sm);
2121262093Savg		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2122262093Savg		    msp->ms_id, sizeof (uint64_t), &object, tx);
2123262093Savg	}
2124168404Spjd	dmu_tx_commit(tx);
2125168404Spjd}
2126168404Spjd
2127168404Spjd/*
2128168404Spjd * Called after a transaction group has completely synced to mark
2129168404Spjd * all of the metaslab's free space as usable.
2130168404Spjd */
2131168404Spjdvoid
2132168404Spjdmetaslab_sync_done(metaslab_t *msp, uint64_t txg)
2133168404Spjd{
2134168404Spjd	metaslab_group_t *mg = msp->ms_group;
2135168404Spjd	vdev_t *vd = mg->mg_vd;
2136262093Savg	range_tree_t **freed_tree;
2137262093Savg	range_tree_t **defer_tree;
2138219089Spjd	int64_t alloc_delta, defer_delta;
2139168404Spjd
2140219089Spjd	ASSERT(!vd->vdev_ishole);
2141219089Spjd
2142168404Spjd	mutex_enter(&msp->ms_lock);
2143168404Spjd
2144168404Spjd	/*
2145168404Spjd	 * If this metaslab is just becoming available, initialize its
2146262093Savg	 * alloctrees, freetrees, and defertree and add its capacity to
2147262093Savg	 * the vdev.
2148168404Spjd	 */
2149262093Savg	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
2150219089Spjd		for (int t = 0; t < TXG_SIZE; t++) {
2151262093Savg			ASSERT(msp->ms_alloctree[t] == NULL);
2152262093Savg			ASSERT(msp->ms_freetree[t] == NULL);
2153262093Savg
2154262093Savg			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
2155262093Savg			    &msp->ms_lock);
2156262093Savg			msp->ms_freetree[t] = range_tree_create(NULL, msp,
2157262093Savg			    &msp->ms_lock);
2158168404Spjd		}
2159219089Spjd
2160247398Smm		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2161262093Savg			ASSERT(msp->ms_defertree[t] == NULL);
2162262093Savg
2163262093Savg			msp->ms_defertree[t] = range_tree_create(NULL, msp,
2164262093Savg			    &msp->ms_lock);
2165247398Smm		}
2166219089Spjd
2167262093Savg		vdev_space_update(vd, 0, 0, msp->ms_size);
2168168404Spjd	}
2169168404Spjd
2170262093Savg	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
2171262093Savg	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2172168404Spjd
2173262093Savg	alloc_delta = space_map_alloc_delta(msp->ms_sm);
2174262093Savg	defer_delta = range_tree_space(*freed_tree) -
2175262093Savg	    range_tree_space(*defer_tree);
2176262093Savg
2177219089Spjd	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2178219089Spjd
2179262093Savg	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2180262093Savg	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2181168404Spjd
2182168404Spjd	/*
2183262093Savg	 * If there's a metaslab_load() in progress, wait for it to complete
2184168404Spjd	 * so that we have a consistent view of the in-core space map.
2185168404Spjd	 */
2186262093Savg	metaslab_load_wait(msp);
2187168404Spjd
2188247398Smm	/*
2189262093Savg	 * Move the frees from the defer_tree back to the free
2190262093Savg	 * range tree (if it's loaded). Swap the freed_tree and the
2191262093Savg	 * defer_tree -- this is safe to do because we've just emptied out
2192262093Savg	 * the defer_tree.
2193247398Smm	 */
2194262093Savg	range_tree_vacate(*defer_tree,
2195262093Savg	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2196262093Savg	range_tree_swap(freed_tree, defer_tree);
2197247398Smm
2198262093Savg	space_map_update(msp->ms_sm);
2199168404Spjd
2200219089Spjd	msp->ms_deferspace += defer_delta;
2201219089Spjd	ASSERT3S(msp->ms_deferspace, >=, 0);
2202262093Savg	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2203219089Spjd	if (msp->ms_deferspace != 0) {
2204219089Spjd		/*
2205219089Spjd		 * Keep syncing this metaslab until all deferred frees
2206219089Spjd		 * are back in circulation.
2207219089Spjd		 */
2208219089Spjd		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2209219089Spjd	}
2210219089Spjd
2211262093Savg	if (msp->ms_loaded && msp->ms_access_txg < txg) {
2212262093Savg		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2213262093Savg			VERIFY0(range_tree_space(
2214262093Savg			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
2215262093Savg		}
2216168404Spjd
2217262093Savg		if (!metaslab_debug_unload)
2218262093Savg			metaslab_unload(msp);
2219168404Spjd	}
2220168404Spjd
2221168404Spjd	metaslab_group_sort(mg, msp, metaslab_weight(msp));
2222262093Savg	mutex_exit(&msp->ms_lock);
2223168404Spjd}
2224168404Spjd
2225211931Smmvoid
2226211931Smmmetaslab_sync_reassess(metaslab_group_t *mg)
2227211931Smm{
2228260768Savg	metaslab_group_alloc_update(mg);
2229269773Sdelphij	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2230224177Smm
2231211931Smm	/*
2232262093Savg	 * Preload the next potential metaslabs
2233211931Smm	 */
2234262093Savg	metaslab_group_preload(mg);
2235211931Smm}
2236211931Smm
2237168404Spjdstatic uint64_t
2238168404Spjdmetaslab_distance(metaslab_t *msp, dva_t *dva)
2239168404Spjd{
2240168404Spjd	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2241168404Spjd	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2242262093Savg	uint64_t start = msp->ms_id;
2243168404Spjd
2244168404Spjd	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2245168404Spjd		return (1ULL << 63);
2246168404Spjd
2247168404Spjd	if (offset < start)
2248168404Spjd		return ((start - offset) << ms_shift);
2249168404Spjd	if (offset > start)
2250168404Spjd		return ((offset - start) << ms_shift);
2251168404Spjd	return (0);
2252168404Spjd}
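
/*
 * Worked example (hypothetical geometry): with vdev_ms_shift = 30 (1GB
 * metaslabs), a DVA at offset 5GB on the same vdev maps to metaslab index 5.
 * For metaslab id 8, the distance is (8 - 5) << 30 = 3GB; a DVA on a
 * different vdev is treated as maximally distant (1 << 63).
 */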
2253168404Spjd
2254307279Smav/*
2255307279Smav * ==========================================================================
2256307279Smav * Metaslab block operations
2257307279Smav * ==========================================================================
2258307279Smav */
2259307279Smav
2260307279Smavstatic void
2261307279Smavmetaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2262307279Smav{
2263307279Smav	if (!(flags & METASLAB_ASYNC_ALLOC) ||
2264307279Smav	    flags & METASLAB_DONT_THROTTLE)
2265307279Smav		return;
2266307279Smav
2267307279Smav	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2268307279Smav	if (!mg->mg_class->mc_alloc_throttle_enabled)
2269307279Smav		return;
2270307279Smav
2271307279Smav	(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2272307279Smav}
2273307279Smav
2274307279Smavvoid
2275307279Smavmetaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2276307279Smav{
2277307279Smav	if (!(flags & METASLAB_ASYNC_ALLOC) ||
2278307279Smav	    flags & METASLAB_DONT_THROTTLE)
2279307279Smav		return;
2280307279Smav
2281307279Smav	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2282307279Smav	if (!mg->mg_class->mc_alloc_throttle_enabled)
2283307279Smav		return;
2284307279Smav
2285307279Smav	(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2286307279Smav}
2287307279Smav
2288307279Smavvoid
2289307279Smavmetaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2290307279Smav{
2291307279Smav#ifdef ZFS_DEBUG
2292307279Smav	const dva_t *dva = bp->blk_dva;
2293307279Smav	int ndvas = BP_GET_NDVAS(bp);
2294307279Smav
2295307279Smav	for (int d = 0; d < ndvas; d++) {
2296307279Smav		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2297307279Smav		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2298307279Smav		VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2299307279Smav	}
2300307279Smav#endif
2301307279Smav}
2302307279Smav
2303168404Spjdstatic uint64_t
2304307279Smavmetaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
2305265741Sdelphij    uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2306168404Spjd{
2307224177Smm	spa_t *spa = mg->mg_vd->vdev_spa;
2308168404Spjd	metaslab_t *msp = NULL;
2309168404Spjd	uint64_t offset = -1ULL;
2310168404Spjd	avl_tree_t *t = &mg->mg_metaslab_tree;
2311168404Spjd	uint64_t activation_weight;
2312168404Spjd	uint64_t target_distance;
2313168404Spjd	int i;
2314168404Spjd
2315168404Spjd	activation_weight = METASLAB_WEIGHT_PRIMARY;
2316209962Smm	for (i = 0; i < d; i++) {
2317209962Smm		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2318168404Spjd			activation_weight = METASLAB_WEIGHT_SECONDARY;
2319209962Smm			break;
2320209962Smm		}
2321209962Smm	}
2322168404Spjd
2323168404Spjd	for (;;) {
2324209962Smm		boolean_t was_active;
2325209962Smm
2326168404Spjd		mutex_enter(&mg->mg_lock);
2327168404Spjd		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
2328224177Smm			if (msp->ms_weight < asize) {
2329224177Smm				spa_dbgmsg(spa, "%s: failed to meet weight "
2330224177Smm				    "requirement: vdev %llu, txg %llu, mg %p, "
2331307279Smav				    "msp %p, asize %llu, "
2332265741Sdelphij				    "weight %llu", spa_name(spa),
2333265741Sdelphij				    mg->mg_vd->vdev_id, txg,
2334307279Smav				    mg, msp, asize, msp->ms_weight);
2335168404Spjd				mutex_exit(&mg->mg_lock);
2336168404Spjd				return (-1ULL);
2337168404Spjd			}
2338247398Smm
2339247398Smm			/*
2340247398Smm			 * If the selected metaslab is condensing, skip it.
2341247398Smm			 */
2342262093Savg			if (msp->ms_condensing)
2343247398Smm				continue;
2344247398Smm
2345209962Smm			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2346168404Spjd			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2347168404Spjd				break;
2348168404Spjd
2349168404Spjd			target_distance = min_distance +
2350262093Savg			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2351262093Savg			    min_distance >> 1);
2352168404Spjd
2353168404Spjd			for (i = 0; i < d; i++)
2354168404Spjd				if (metaslab_distance(msp, &dva[i]) <
2355168404Spjd				    target_distance)
2356168404Spjd					break;
2357168404Spjd			if (i == d)
2358168404Spjd				break;
2359168404Spjd		}
2360168404Spjd		mutex_exit(&mg->mg_lock);
2361168404Spjd		if (msp == NULL)
2362168404Spjd			return (-1ULL);
2363168404Spjd
2364260768Savg		mutex_enter(&msp->ms_lock);
2365260768Savg
2366224177Smm		/*
2367168404Spjd		 * Ensure that the metaslab we have selected is still
2368168404Spjd		 * capable of handling our request. It's possible that
2369168404Spjd		 * another thread may have changed the weight while we
2370168404Spjd		 * were blocked on the metaslab lock.
2371168404Spjd		 */
2372224177Smm		if (msp->ms_weight < asize || (was_active &&
2373209962Smm		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
2374209962Smm		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
2375168404Spjd			mutex_exit(&msp->ms_lock);
2376168404Spjd			continue;
2377168404Spjd		}
2378168404Spjd
2379168404Spjd		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2380168404Spjd		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
2381168404Spjd			metaslab_passivate(msp,
2382168404Spjd			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2383168404Spjd			mutex_exit(&msp->ms_lock);
2384168404Spjd			continue;
2385168404Spjd		}
2386168404Spjd
2387224177Smm		if (metaslab_activate(msp, activation_weight) != 0) {
2388168404Spjd			mutex_exit(&msp->ms_lock);
2389168404Spjd			continue;
2390168404Spjd		}
2391168404Spjd
2392247398Smm		/*
2393247398Smm		 * If this metaslab is currently condensing then pick again as
2394247398Smm		 * we can't manipulate this metaslab until it's committed
2395247398Smm		 * to disk.
2396247398Smm		 */
2397262093Savg		if (msp->ms_condensing) {
2398247398Smm			mutex_exit(&msp->ms_lock);
2399247398Smm			continue;
2400247398Smm		}
2401247398Smm
2402262093Savg		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
2403168404Spjd			break;
2404168404Spjd
2405262093Savg		metaslab_passivate(msp, metaslab_block_maxsize(msp));
2406168404Spjd		mutex_exit(&msp->ms_lock);
2407168404Spjd	}
2408168404Spjd
2409262093Savg	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2410168404Spjd		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2411168404Spjd
2412262093Savg	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
2413262093Savg	msp->ms_access_txg = txg + metaslab_unload_delay;
2414168404Spjd
2415168404Spjd	mutex_exit(&msp->ms_lock);
2416168404Spjd	return (offset);
2417168404Spjd}
2418168404Spjd
2419168404Spjd/*
2420168404Spjd * Allocate a block for the specified i/o.
2421168404Spjd */
2422168404Spjdstatic int
2423185029Spjdmetaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2424185029Spjd    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
2425168404Spjd{
2426168404Spjd	metaslab_group_t *mg, *rotor;
2427168404Spjd	vdev_t *vd;
2428168404Spjd	int dshift = 3;
2429168404Spjd	int all_zero;
2430209962Smm	int zio_lock = B_FALSE;
2431209962Smm	boolean_t allocatable;
2432168404Spjd	uint64_t asize;
2433168404Spjd	uint64_t distance;
2434168404Spjd
2435168404Spjd	ASSERT(!DVA_IS_VALID(&dva[d]));
2436168404Spjd
2437185029Spjd	/*
2438185029Spjd	 * For testing, make some blocks above a certain size be gang blocks.
2439185029Spjd	 */
2440219089Spjd	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
2441249195Smm		return (SET_ERROR(ENOSPC));
2442168404Spjd
2443168404Spjd	/*
2444168404Spjd	 * Start at the rotor and loop through all mgs until we find something.
2445219089Spjd	 * Note that there's no locking on mc_rotor or mc_aliquot because
2446168404Spjd	 * nothing actually breaks if we miss a few updates -- we just won't
2447168404Spjd	 * allocate quite as evenly.  It all balances out over time.
2448168404Spjd	 *
2449168404Spjd	 * If we are doing ditto or log blocks, try to spread them across
2450168404Spjd	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
2451168404Spjd	 * allocated all of our ditto blocks, then try and spread them out on
2452168404Spjd	 * that vdev as much as possible.  If it turns out to not be possible,
2453168404Spjd	 * gradually lower our standards until anything becomes acceptable.
2454168404Spjd	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
2455168404Spjd	 * gives us hope of containing our fault domains to something we're
2456168404Spjd	 * able to reason about.  Otherwise, any two top-level vdev failures
2457168404Spjd	 * will guarantee the loss of data.  With consecutive allocation,
2458168404Spjd	 * only two adjacent top-level vdev failures will result in data loss.
2459168404Spjd	 *
2460168404Spjd	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
2461168404Spjd	 * ourselves on the same vdev as our gang block header.  That
2462168404Spjd	 * way, we can hope for locality in vdev_cache, plus it makes our
2463168404Spjd	 * fault domains something tractable.
2464168404Spjd	 */
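	/*
	 * For example (illustrative numbers): on a 1TB top-level vdev the
	 * initial dshift of 3 asks for ditto copies at least 1TB >> 3 =
	 * 128GB apart within the vdev; each time the rotor loop below fails
	 * to place the block, dshift is incremented and the required
	 * separation halves (64GB, 32GB, ...) until an allocation succeeds
	 * or every distance requirement has effectively been waived
	 * (distance drops to 0 once it is no larger than a metaslab).
	 */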
2465168404Spjd	if (hintdva) {
2466168404Spjd		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2467219089Spjd
2468219089Spjd		/*
2469219089Spjd		 * It's possible the vdev we're using as the hint no
2470219089Spjd		 * longer exists (i.e. removed). Consult the rotor when
2471219089Spjd		 * all else fails.
2472219089Spjd		 */
2473219089Spjd		if (vd != NULL) {
2474168404Spjd			mg = vd->vdev_mg;
2475219089Spjd
2476219089Spjd			if (flags & METASLAB_HINTBP_AVOID &&
2477219089Spjd			    mg->mg_next != NULL)
2478219089Spjd				mg = mg->mg_next;
2479219089Spjd		} else {
2480219089Spjd			mg = mc->mc_rotor;
2481219089Spjd		}
2482168404Spjd	} else if (d != 0) {
2483168404Spjd		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
2484168404Spjd		mg = vd->vdev_mg->mg_next;
2485168404Spjd	} else {
2486168404Spjd		mg = mc->mc_rotor;
2487168404Spjd	}
2488185029Spjd
2489185029Spjd	/*
2490219089Spjd	 * If the hint put us into the wrong metaslab class, or into a
2491219089Spjd	 * metaslab group that has been passivated, just follow the rotor.
2492185029Spjd	 */
2493219089Spjd	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2494185029Spjd		mg = mc->mc_rotor;
2495185029Spjd
2496168404Spjd	rotor = mg;
2497168404Spjdtop:
2498168404Spjd	all_zero = B_TRUE;
2499168404Spjd	do {
2500219089Spjd		ASSERT(mg->mg_activation_count == 1);
2501168404Spjd		vd = mg->mg_vd;
2502209962Smm
2503185029Spjd		/*
2504185029Spjd		 * Don't allocate from faulted devices.
2505185029Spjd		 */
2506209962Smm		if (zio_lock) {
2507209962Smm			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2508209962Smm			allocatable = vdev_allocatable(vd);
2509209962Smm			spa_config_exit(spa, SCL_ZIO, FTAG);
2510209962Smm		} else {
2511209962Smm			allocatable = vdev_allocatable(vd);
2512209962Smm		}
2513260768Savg
2514260768Savg		/*
2515260768Savg		 * Determine if the selected metaslab group is eligible
2516307279Smav		 * for allocations. If we're ganging then don't allow
2517307279Smav		 * this metaslab group to skip allocations since that would
2518307279Smav		 * inadvertently return ENOSPC and suspend the pool
2519260768Savg		 * even though space is still available.
2520260768Savg		 */
2521307279Smav		if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) {
2522307279Smav			allocatable = metaslab_group_allocatable(mg, rotor,
2523307279Smav			    psize);
2524307279Smav		}
2525260768Savg
2526209962Smm		if (!allocatable)
2527185029Spjd			goto next;
2528209962Smm
2529307279Smav		ASSERT(mg->mg_initialized);
2530307279Smav
2531185029Spjd		/*
2532307279Smav		 * Avoid writing single-copy data to a failing vdev.
2533185029Spjd		 */
2534185029Spjd		if ((vd->vdev_stat.vs_write_errors > 0 ||
2535185029Spjd		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
2536269773Sdelphij		    d == 0 && dshift == 3 && vd->vdev_children == 0) {
2537185029Spjd			all_zero = B_FALSE;
2538185029Spjd			goto next;
2539185029Spjd		}
2540168404Spjd
2541185029Spjd		ASSERT(mg->mg_class == mc);
2542185029Spjd
2543168404Spjd		distance = vd->vdev_asize >> dshift;
2544168404Spjd		if (distance <= (1ULL << vd->vdev_ms_shift))
2545168404Spjd			distance = 0;
2546168404Spjd		else
2547168404Spjd			all_zero = B_FALSE;
2548168404Spjd
2549168404Spjd		asize = vdev_psize_to_asize(vd, psize);
2550168404Spjd		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
2551168404Spjd
2552307279Smav		uint64_t offset = metaslab_group_alloc(mg, asize, txg,
2553307279Smav		    distance, dva, d);
2554307279Smav
2555307279Smav		mutex_enter(&mg->mg_lock);
2556307279Smav		if (offset == -1ULL) {
2557307279Smav			mg->mg_failed_allocations++;
2558307279Smav			if (asize == SPA_GANGBLOCKSIZE) {
2559307279Smav				/*
2560307279Smav				 * This metaslab group was unable to allocate
2561307279Smav				 * the minimum gang block size so it must be
2562307279Smav				 * out of space. We must notify the allocation
2563307279Smav				 * throttle to start skipping allocation
2564307279Smav				 * attempts to this metaslab group until more
2565307279Smav				 * space becomes available.
2566307279Smav				 *
2567307279Smav				 * Note: this failure cannot be caused by the
2568307279Smav				 * allocation throttle since the allocation
2569307279Smav				 * throttle is only responsible for skipping
2570307279Smav				 * devices and not failing block allocations.
2571307279Smav				 */
2572307279Smav				mg->mg_no_free_space = B_TRUE;
2573307279Smav			}
2574307279Smav		}
2575307279Smav		mg->mg_allocations++;
2576307279Smav		mutex_exit(&mg->mg_lock);
2577307279Smav
2578168404Spjd		if (offset != -1ULL) {
2579168404Spjd			/*
2580168404Spjd			 * If we've just selected this metaslab group,
2581168404Spjd			 * figure out whether the corresponding vdev is
2582168404Spjd			 * over- or under-used relative to the pool,
2583168404Spjd			 * and set an allocation bias to even it out.
2584168404Spjd			 */
2585269773Sdelphij			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
2586168404Spjd				vdev_stat_t *vs = &vd->vdev_stat;
2587219089Spjd				int64_t vu, cu;
2588168404Spjd
2589224177Smm				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
2590224177Smm				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
2591168404Spjd
2592168404Spjd				/*
2593224177Smm				 * Calculate how much more or less we should
2594224177Smm				 * try to allocate from this device during
2595224177Smm				 * this iteration around the rotor.
2596224177Smm				 * For example, if a device is 80% full
2597224177Smm				 * and the pool is 20% full then we should
2598224177Smm				 * reduce allocations by 60% on this device.
2599224177Smm				 *
2600224177Smm				 * mg_bias = (20 - 80) * 512K / 100 = -307K
2601224177Smm				 *
2602224177Smm				 * This reduces allocations by 307K for this
2603224177Smm				 * iteration.
2604168404Spjd				 */
2605219089Spjd				mg->mg_bias = ((cu - vu) *
2606224177Smm				    (int64_t)mg->mg_aliquot) / 100;
2607269773Sdelphij			} else if (!metaslab_bias_enabled) {
2608269773Sdelphij				mg->mg_bias = 0;
2609168404Spjd			}
2610168404Spjd
2611219089Spjd			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
2612168404Spjd			    mg->mg_aliquot + mg->mg_bias) {
2613168404Spjd				mc->mc_rotor = mg->mg_next;
2614219089Spjd				mc->mc_aliquot = 0;
2615168404Spjd			}
2616168404Spjd
2617168404Spjd			DVA_SET_VDEV(&dva[d], vd->vdev_id);
2618168404Spjd			DVA_SET_OFFSET(&dva[d], offset);
2619185029Spjd			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
2620168404Spjd			DVA_SET_ASIZE(&dva[d], asize);
2621168404Spjd
2622168404Spjd			return (0);
2623168404Spjd		}
2624185029Spjdnext:
2625168404Spjd		mc->mc_rotor = mg->mg_next;
2626219089Spjd		mc->mc_aliquot = 0;
2627168404Spjd	} while ((mg = mg->mg_next) != rotor);
2628168404Spjd
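	/*
	 * If any group was skipped by the failing-vdev heuristic or still
	 * had a non-zero distance requirement, relax the requirement
	 * (increase dshift) and walk the rotor again.
	 */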
2629168404Spjd	if (!all_zero) {
2630168404Spjd		dshift++;
2631168404Spjd		ASSERT(dshift < 64);
2632168404Spjd		goto top;
2633168404Spjd	}
2634168404Spjd
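	/*
	 * If the last group examined was not allocatable and we have not
	 * yet tried it, make one final pass with zio_lock set: reset the
	 * distance requirement and re-check allocatability under the
	 * SCL_ZIO config lock, bypassing the group eligibility check.
	 */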
2635209962Smm	if (!allocatable && !zio_lock) {
2636209962Smm		dshift = 3;
2637209962Smm		zio_lock = B_TRUE;
2638209962Smm		goto top;
2639209962Smm	}
2640209962Smm
2641168404Spjd	bzero(&dva[d], sizeof (dva_t));
2642168404Spjd
2643249195Smm	return (SET_ERROR(ENOSPC));
2644168404Spjd}
2645168404Spjd
2646168404Spjd/*
2647168404Spjd * Free the block represented by DVA in the context of the specified
2648168404Spjd * transaction group.
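 * If 'now' is set, the block is removed from this txg's alloctree and
 * returned directly to the metaslab's in-core free tree; otherwise the
 * free is recorded in this txg's freetree and applied when the
 * transaction group syncs.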
2649168404Spjd */
2650168404Spjdstatic void
2651168404Spjdmetaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
2652168404Spjd{
2653168404Spjd	uint64_t vdev = DVA_GET_VDEV(dva);
2654168404Spjd	uint64_t offset = DVA_GET_OFFSET(dva);
2655168404Spjd	uint64_t size = DVA_GET_ASIZE(dva);
2656168404Spjd	vdev_t *vd;
2657168404Spjd	metaslab_t *msp;
2658168404Spjd
2659168404Spjd	ASSERT(DVA_IS_VALID(dva));
2660168404Spjd
2661168404Spjd	if (txg > spa_freeze_txg(spa))
2662168404Spjd		return;
2663168404Spjd
2664168404Spjd	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2665168404Spjd	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
2666168404Spjd		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
2667168404Spjd		    (u_longlong_t)vdev, (u_longlong_t)offset);
2668168404Spjd		ASSERT(0);
2669168404Spjd		return;
2670168404Spjd	}
2671168404Spjd
2672168404Spjd	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2673168404Spjd
2674168404Spjd	if (DVA_GET_GANG(dva))
2675168404Spjd		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2676168404Spjd
2677168404Spjd	mutex_enter(&msp->ms_lock);
2678168404Spjd
2679168404Spjd	if (now) {
2680262093Savg		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
2681168404Spjd		    offset, size);
2682262093Savg
2683262093Savg		VERIFY(!msp->ms_condensing);
2684262093Savg		VERIFY3U(offset, >=, msp->ms_start);
2685262093Savg		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2686262093Savg		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2687262093Savg		    msp->ms_size);
2688262093Savg		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2689262093Savg		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2690262093Savg		range_tree_add(msp->ms_tree, offset, size);
2691168404Spjd	} else {
2692262093Savg		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2693168404Spjd			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2694262093Savg		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2695262093Savg		    offset, size);
2696168404Spjd	}
2697168404Spjd
2698168404Spjd	mutex_exit(&msp->ms_lock);
2699168404Spjd}
2700168404Spjd
2701168404Spjd/*
2702168404Spjd * Intent log support: upon opening the pool after a crash, notify the SPA
2703168404Spjd * of blocks that the intent log has allocated for immediate write, but
2704168404Spjd * which are still considered free by the SPA because the last transaction
2705168404Spjd * group didn't commit yet.
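 * A claim with txg == 0 is a dry run: it verifies that the block can be
 * claimed without actually removing it from the metaslab's free tree.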
2706168404Spjd */
2707168404Spjdstatic int
2708168404Spjdmetaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2709168404Spjd{
2710168404Spjd	uint64_t vdev = DVA_GET_VDEV(dva);
2711168404Spjd	uint64_t offset = DVA_GET_OFFSET(dva);
2712168404Spjd	uint64_t size = DVA_GET_ASIZE(dva);
2713168404Spjd	vdev_t *vd;
2714168404Spjd	metaslab_t *msp;
2715219089Spjd	int error = 0;
2716168404Spjd
2717168404Spjd	ASSERT(DVA_IS_VALID(dva));
2718168404Spjd
2719168404Spjd	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2720168404Spjd	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2721249195Smm		return (SET_ERROR(ENXIO));
2722168404Spjd
2723168404Spjd	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2724168404Spjd
2725168404Spjd	if (DVA_GET_GANG(dva))
2726168404Spjd		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2727168404Spjd
2728168404Spjd	mutex_enter(&msp->ms_lock);
2729168404Spjd
2730262093Savg	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2731224177Smm		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2732219089Spjd
2733262093Savg	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2734249195Smm		error = SET_ERROR(ENOENT);
2735219089Spjd
2736185029Spjd	if (error || txg == 0) {	/* txg == 0 indicates dry run */
2737168404Spjd		mutex_exit(&msp->ms_lock);
2738168404Spjd		return (error);
2739168404Spjd	}
2740168404Spjd
2741262093Savg	VERIFY(!msp->ms_condensing);
2742262093Savg	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2743262093Savg	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2744262093Savg	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2745262093Savg	range_tree_remove(msp->ms_tree, offset, size);
2746168404Spjd
2747209962Smm	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
2748262093Savg		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2749185029Spjd			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2750262093Savg		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2751185029Spjd	}
2752185029Spjd
2753168404Spjd	mutex_exit(&msp->ms_lock);
2754168404Spjd
2755168404Spjd	return (0);
2756168404Spjd}
2757168404Spjd
2758307279Smav/*
2759307279Smav * Reserve some allocation slots. A reservation must be made before
2760307279Smav * calling into the allocator. If no slots are available, the I/O is
2761307279Smav * throttled until an in-flight I/O completes and frees its slots.
2762307279Smav * Returns B_TRUE if the reservation was placed successfully and
2763307279Smav * B_FALSE otherwise.
2764307279Smav */
2765307279Smavboolean_t
2766307279Smavmetaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
2767307279Smav    int flags)
2768307279Smav{
2769307279Smav	uint64_t available_slots = 0;
2770307279Smav	boolean_t slot_reserved = B_FALSE;
2771307279Smav
2772307279Smav	ASSERT(mc->mc_alloc_throttle_enabled);
2773307279Smav	mutex_enter(&mc->mc_lock);
2774307279Smav
2775307279Smav	uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
2776307279Smav	if (reserved_slots < mc->mc_alloc_max_slots)
2777307279Smav		available_slots = mc->mc_alloc_max_slots - reserved_slots;
2778307279Smav
2779307279Smav	if (slots <= available_slots || GANG_ALLOCATION(flags)) {
2780307279Smav		/*
2781307279Smav		 * We reserve the slots individually so that we can unreserve
2782307279Smav		 * them individually when an I/O completes.
2783307279Smav		 */
2784307279Smav		for (int d = 0; d < slots; d++) {
2785307279Smav			reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
2786307279Smav		}
2787307279Smav		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
2788307279Smav		slot_reserved = B_TRUE;
2789307279Smav	}
2790307279Smav
2791307279Smav	mutex_exit(&mc->mc_lock);
2792307279Smav	return (slot_reserved);
2793307279Smav}
2794307279Smav
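/*
 * Release 'slots' reservation slots previously obtained for 'zio' via
 * metaslab_class_throttle_reserve().
 */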
2795307279Smavvoid
2796307279Smavmetaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
2797307279Smav{
2798307279Smav	ASSERT(mc->mc_alloc_throttle_enabled);
2799307279Smav	mutex_enter(&mc->mc_lock);
2800307279Smav	for (int d = 0; d < slots; d++) {
2801307279Smav		(void) refcount_remove(&mc->mc_alloc_slots, zio);
2802307279Smav	}
2803307279Smav	mutex_exit(&mc->mc_lock);
2804307279Smav}
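
/*
 * Illustrative sketch (not part of this file): a caller is expected to
 * pair a successful reservation with a matching unreserve once the
 * allocation-side work for the zio is finished (in the real callers,
 * in zio.c, the unreserve happens later, when the zio completes).  The
 * slot count shown here (zio->io_prop.zp_copies) and the zero flags are
 * assumptions made for the example only.
 *
 *	metaslab_class_t *mc = spa_normal_class(spa);
 *	int slots = zio->io_prop.zp_copies;
 *
 *	if (metaslab_class_throttle_reserve(mc, slots, zio, 0)) {
 *		error = metaslab_alloc(spa, mc, zio->io_size, zio->io_bp,
 *		    slots, zio->io_txg, NULL, 0, zio);
 *		...
 *		metaslab_class_throttle_unreserve(mc, slots, zio);
 *	} else {
 *		(defer the zio until an allocating I/O completes and
 *		frees up its slots)
 *	}
 */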
2805307279Smav
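/*
 * Allocate 'ndvas' DVAs of 'psize' bytes for 'bp' in transaction group
 * 'txg'.  On failure, any DVAs already allocated for 'bp' are freed
 * again and the corresponding group queue-depth accounting is undone
 * before the error is returned.
 */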
2806168404Spjdint
2807185029Spjdmetaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2808307279Smav    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio)
2809168404Spjd{
2810168404Spjd	dva_t *dva = bp->blk_dva;
2811168404Spjd	dva_t *hintdva = hintbp->blk_dva;
2812168404Spjd	int error = 0;
2813168404Spjd
2814185029Spjd	ASSERT(bp->blk_birth == 0);
2815219089Spjd	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2816185029Spjd
2817185029Spjd	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2818185029Spjd
2819185029Spjd	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
2820185029Spjd		spa_config_exit(spa, SCL_ALLOC, FTAG);
2821249195Smm		return (SET_ERROR(ENOSPC));
2822185029Spjd	}
2823185029Spjd
2824168404Spjd	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2825168404Spjd	ASSERT(BP_GET_NDVAS(bp) == 0);
2826168404Spjd	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2827168404Spjd
2828185029Spjd	for (int d = 0; d < ndvas; d++) {
2829185029Spjd		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2830185029Spjd		    txg, flags);
2831262093Savg		if (error != 0) {
2832168404Spjd			for (d--; d >= 0; d--) {
2833168404Spjd				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2834307279Smav				metaslab_group_alloc_decrement(spa,
2835307279Smav				    DVA_GET_VDEV(&dva[d]), zio, flags);
2836168404Spjd				bzero(&dva[d], sizeof (dva_t));
2837168404Spjd			}
2838185029Spjd			spa_config_exit(spa, SCL_ALLOC, FTAG);
2839168404Spjd			return (error);
2840307279Smav		} else {
2841307279Smav			/*
2842307279Smav			 * Update the metaslab group's queue depth
2843307279Smav			 * based on the newly allocated dva.
2844307279Smav			 */
2845307279Smav			metaslab_group_alloc_increment(spa,
2846307279Smav			    DVA_GET_VDEV(&dva[d]), zio, flags);
2847168404Spjd		}
2849168404Spjd	}
2850168404Spjd	ASSERT(error == 0);
2851168404Spjd	ASSERT(BP_GET_NDVAS(bp) == ndvas);
2852168404Spjd
2853185029Spjd	spa_config_exit(spa, SCL_ALLOC, FTAG);
2854185029Spjd
2855219089Spjd	BP_SET_BIRTH(bp, txg, txg);
2856185029Spjd
2857168404Spjd	return (0);
2858168404Spjd}
2859168404Spjd
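/*
 * Free every DVA of 'bp' in transaction group 'txg'.  See
 * metaslab_free_dva() for the meaning of 'now'.
 */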
2860168404Spjdvoid
2861168404Spjdmetaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2862168404Spjd{
2863168404Spjd	const dva_t *dva = bp->blk_dva;
2864168404Spjd	int ndvas = BP_GET_NDVAS(bp);
2865168404Spjd
2866168404Spjd	ASSERT(!BP_IS_HOLE(bp));
2867219089Spjd	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2868168404Spjd
2869185029Spjd	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2870185029Spjd
2871185029Spjd	for (int d = 0; d < ndvas; d++)
2872168404Spjd		metaslab_free_dva(spa, &dva[d], txg, now);
2873185029Spjd
2874185029Spjd	spa_config_exit(spa, SCL_FREE, FTAG);
2875168404Spjd}
2876168404Spjd
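/*
 * Claim every DVA of 'bp' in transaction group 'txg'.  A dry run is
 * performed first so that a partial failure leaves nothing to unwind.
 */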
2877168404Spjdint
2878168404Spjdmetaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2879168404Spjd{
2880168404Spjd	const dva_t *dva = bp->blk_dva;
2881168404Spjd	int ndvas = BP_GET_NDVAS(bp);
2882185029Spjd	int error = 0;
2883168404Spjd
2884168404Spjd	ASSERT(!BP_IS_HOLE(bp));
2885168404Spjd
2886185029Spjd	if (txg != 0) {
2887185029Spjd		/*
2888185029Spjd		 * First do a dry run to make sure all DVAs are claimable,
2889185029Spjd		 * so we don't have to unwind from partial failures below.
2890185029Spjd		 */
2891185029Spjd		if ((error = metaslab_claim(spa, bp, 0)) != 0)
2892185029Spjd			return (error);
2893185029Spjd	}
2894185029Spjd
2895185029Spjd	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2896185029Spjd
2897185029Spjd	for (int d = 0; d < ndvas; d++)
2898168404Spjd		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2899185029Spjd			break;
2900168404Spjd
2901185029Spjd	spa_config_exit(spa, SCL_ALLOC, FTAG);
2902185029Spjd
2903185029Spjd	ASSERT(error == 0 || txg == 0);
2904185029Spjd
2905185029Spjd	return (error);
2906168404Spjd}
2907248571Smm
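/*
 * Debug check (enabled with ZFS_DEBUG_ZIO_FREE): verify that none of
 * the DVAs of 'bp' are already present in their metaslab's free tree,
 * freetrees or defertrees, which would indicate a double free.
 */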
2908248571Smmvoid
2909248571Smmmetaslab_check_free(spa_t *spa, const blkptr_t *bp)
2910248571Smm{
2911248571Smm	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2912248571Smm		return;
2913248571Smm
2914248571Smm	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2915248571Smm	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2916262093Savg		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2917262093Savg		vdev_t *vd = vdev_lookup_top(spa, vdev);
2918262093Savg		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2919248571Smm		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2920262093Savg		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2921248571Smm
2922262093Savg		if (msp->ms_loaded)
2923262093Savg			range_tree_verify(msp->ms_tree, offset, size);
2924248571Smm
2925248571Smm		for (int j = 0; j < TXG_SIZE; j++)
2926262093Savg			range_tree_verify(msp->ms_freetree[j], offset, size);
2927248571Smm		for (int j = 0; j < TXG_DEFER_SIZE; j++)
2928262093Savg			range_tree_verify(msp->ms_defertree[j], offset, size);
2929248571Smm	}
2930248571Smm	spa_config_exit(spa, SCL_VDEV, FTAG);
2931248571Smm}
2932