metaslab.c (268855) metaslab.c (269118)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 18 unchanged lines hidden (view full) ---

27#include <sys/zfs_context.h>
28#include <sys/dmu.h>
29#include <sys/dmu_tx.h>
30#include <sys/space_map.h>
31#include <sys/metaslab_impl.h>
32#include <sys/vdev_impl.h>
33#include <sys/zio.h>
34#include <sys/spa_impl.h>
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 18 unchanged lines hidden (view full) ---

27#include <sys/zfs_context.h>
28#include <sys/dmu.h>
29#include <sys/dmu_tx.h>
30#include <sys/space_map.h>
31#include <sys/metaslab_impl.h>
32#include <sys/vdev_impl.h>
33#include <sys/zio.h>
34#include <sys/spa_impl.h>
35#include <sys/zfeature.h>
35
36SYSCTL_DECL(_vfs_zfs);
37SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
38
39/*
40 * Allow allocations to switch to gang blocks quickly. We do this to
41 * avoid having to load lots of space_maps in a given txg. There are,
42 * however, some cases where we want to avoid "fast" ganging and instead

--- 41 unchanged lines hidden (view full) ---

84 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
85 * blocks.
86 */
87int zfs_metaslab_condense_block_threshold = 4;
88
89/*
90 * The zfs_mg_noalloc_threshold defines which metaslab groups should
91 * be eligible for allocation. The value is defined as a percentage of
36
37SYSCTL_DECL(_vfs_zfs);
38SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
39
40/*
41 * Allow allocations to switch to gang blocks quickly. We do this to
42 * avoid having to load lots of space_maps in a given txg. There are,
43 * however, some cases where we want to avoid "fast" ganging and instead

--- 41 unchanged lines hidden (view full) ---

85 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
86 * blocks.
87 */
88int zfs_metaslab_condense_block_threshold = 4;
89
90/*
91 * The zfs_mg_noalloc_threshold defines which metaslab groups should
92 * be eligible for allocation. The value is defined as a percentage of
92 * a free space. Metaslab groups that have more free space than
93 * free space. Metaslab groups that have more free space than
93 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
94 * a metaslab group's free space is less than or equal to the
95 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
96 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
97 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
98 * groups are allowed to accept allocations. Gang blocks are always
99 * eligible to allocate on any metaslab group. The default value of 0 means
100 * no metaslab group will be excluded based on this criterion.
101 */
102int zfs_mg_noalloc_threshold = 0;
103SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
104 &zfs_mg_noalloc_threshold, 0,
105 "Percentage of metaslab group size that should be free"
106 " to make it eligible for allocation");
107
108/*
94 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
95 * a metaslab group's free space is less than or equal to the
96 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
97 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
98 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
99 * groups are allowed to accept allocations. Gang blocks are always
100 * eligible to allocate on any metaslab group. The default value of 0 means
101 * no metaslab group will be excluded based on this criterion.
102 */
103int zfs_mg_noalloc_threshold = 0;
104SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
105 &zfs_mg_noalloc_threshold, 0,
106 "Percentage of metaslab group size that should be free"
107 " to make it eligible for allocation");
108
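To make the rule above concrete, here is a minimal sketch using a hypothetical helper (not part of metaslab.c); it reuses the free-capacity calculation that metaslab_group_alloc_update() performs below: a group at or below zfs_mg_noalloc_threshold is skipped only while at least one group in its class is still above the threshold.

static boolean_t
mg_noalloc_rule(uint64_t vs_space, uint64_t vs_alloc,
    int groups_above_threshold)
{
	/* Free capacity as a percentage of the group's total space. */
	uint64_t free_capacity = ((vs_space - vs_alloc) * 100) /
	    (vs_space + 1);

	if (free_capacity > zfs_mg_noalloc_threshold)
		return (B_TRUE);

	/*
	 * Once every group in the class has dropped to or below the
	 * threshold, all groups become eligible again.
	 */
	return (groups_above_threshold == 0);
}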
109/*
110 * Metaslab groups are considered eligible for allocations if their
111 * fragmentation metric (measured as a percentage) is less than or equal to
112 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
113 * then it will be skipped unless all metaslab groups within the metaslab
114 * class have also crossed this threshold.
115 */
116int zfs_mg_fragmentation_threshold = 85;
117
118/*
119 * Allow metaslabs to keep their active state as long as their fragmentation
120 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
121 * active metaslab that exceeds this threshold will no longer keep its active
122 * status allowing better metaslabs to be selected.
123 */
124int zfs_metaslab_fragmentation_threshold = 70;
125
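The two fragmentation thresholds above are consulted in different places. The following hedged sketch (hypothetical helpers that mirror the checks appearing later in metaslab_group_alloc_update()/metaslab_group_allocatable() and metaslab_weight()) shows what each one gates:

/*
 * Group level: skip a heavily fragmented group, but never penalize
 * one that has no valid metric (ZFS_FRAG_INVALID).
 */
static boolean_t
mg_frag_ok(uint64_t mg_fragmentation)
{
	return (mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg_fragmentation <= zfs_mg_fragmentation_threshold);
}

/*
 * Metaslab level: keep the active weight bit only when the metric is
 * known and at or below the threshold.
 */
static boolean_t
ms_may_stay_active(uint64_t ms_fragmentation)
{
	return (ms_fragmentation != ZFS_FRAG_INVALID &&
	    ms_fragmentation <= zfs_metaslab_fragmentation_threshold);
}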
126/*
109 * When set will load all metaslabs when pool is first opened.
110 */
111int metaslab_debug_load = 0;
112SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
113 &metaslab_debug_load, 0,
114 "Load all metaslabs when pool is first opened");
115
116/*

--- 51 unchanged lines hidden (view full) ---

168 * keep it loaded.
169 */
170int metaslab_unload_delay = TXG_SIZE * 2;
171SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
172 &metaslab_unload_delay, 0,
173 "Number of TXGs that an unused metaslab can be kept in memory");
174
175/*
127 * When set will load all metaslabs when pool is first opened.
128 */
129int metaslab_debug_load = 0;
130SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
131 &metaslab_debug_load, 0,
132 "Load all metaslabs when pool is first opened");
133
134/*

--- 51 unchanged lines hidden (view full) ---

186 * keep it loaded.
187 */
188int metaslab_unload_delay = TXG_SIZE * 2;
189SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
190 &metaslab_unload_delay, 0,
191 "Number of TXGs that an unused metaslab can be kept in memory");
192
193/*
176 * Should we be willing to write data to degraded vdevs?
177 */
178boolean_t zfs_write_to_degraded = B_FALSE;
179SYSCTL_INT(_vfs_zfs, OID_AUTO, write_to_degraded, CTLFLAG_RWTUN,
180 &zfs_write_to_degraded, 0, "Allow writing data to degraded vdevs");
181
182/*
183 * Max number of metaslabs per group to preload.
184 */
185int metaslab_preload_limit = SPA_DVAS_PER_BP;
186SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
187 &metaslab_preload_limit, 0,
188 "Max number of metaslabs per group to preload");
189
190/*
191 * Enable/disable preloading of metaslab.
192 */
193boolean_t metaslab_preload_enabled = B_TRUE;
194SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
195 &metaslab_preload_enabled, 0,
196 "Max number of metaslabs per group to preload");
197
198/*
194 * Max number of metaslabs per group to preload.
195 */
196int metaslab_preload_limit = SPA_DVAS_PER_BP;
197SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
198 &metaslab_preload_limit, 0,
199 "Max number of metaslabs per group to preload");
200
201/*
202 * Enable/disable preloading of metaslab.
203 */
204boolean_t metaslab_preload_enabled = B_TRUE;
205SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
206 &metaslab_preload_enabled, 0,
207 "Max number of metaslabs per group to preload");
208
209/*
199 * Enable/disable additional weight factor for each metaslab.
210 * Enable/disable fragmentation weighting on metaslabs.
200 */
211 */
201boolean_t metaslab_weight_factor_enable = B_FALSE;
202SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, weight_factor_enable, CTLFLAG_RWTUN,
203 &metaslab_weight_factor_enable, 0,
204 "Enable additional weight factor for each metaslab");
212boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
213SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
214 &metaslab_fragmentation_factor_enabled, 0,
215 "Enable fragmentation weighting on metaslabs");
205
216
217/*
218 * Enable/disable lba weighting (i.e. outer tracks are given preference).
219 */
220boolean_t metaslab_lba_weighting_enabled = B_TRUE;
221SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
222 &metaslab_lba_weighting_enabled, 0,
223 "Enable LBA weighting (i.e. outer tracks are given preference)");
206
207/*
224
225/*
226 * Enable/disable metaslab group biasing.
227 */
228boolean_t metaslab_bias_enabled = B_TRUE;
229SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
230 &metaslab_bias_enabled, 0,
231 "Enable metaslab group biasing");
232
233static uint64_t metaslab_fragmentation(metaslab_t *);
234
235/*
208 * ==========================================================================
209 * Metaslab classes
210 * ==========================================================================
211 */
212metaslab_class_t *
213metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
214{
215 metaslab_class_t *mc;

--- 101 unchanged lines hidden (view full) ---

317}
318
319uint64_t
320metaslab_class_get_minblocksize(metaslab_class_t *mc)
321{
322 return (mc->mc_minblocksize);
323}
324
236 * ==========================================================================
237 * Metaslab classes
238 * ==========================================================================
239 */
240metaslab_class_t *
241metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
242{
243 metaslab_class_t *mc;

--- 101 unchanged lines hidden (view full) ---

345}
346
347uint64_t
348metaslab_class_get_minblocksize(metaslab_class_t *mc)
349{
350 return (mc->mc_minblocksize);
351}
352
353void
354metaslab_class_histogram_verify(metaslab_class_t *mc)
355{
356 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
357 uint64_t *mc_hist;
358 int i;
359
360 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
361 return;
362
363 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
364 KM_SLEEP);
365
366 for (int c = 0; c < rvd->vdev_children; c++) {
367 vdev_t *tvd = rvd->vdev_child[c];
368 metaslab_group_t *mg = tvd->vdev_mg;
369
370 /*
371 * Skip any holes, uninitialized top-levels, or
372 * vdevs that are not in this metaslab class.
373 */
374 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
375 mg->mg_class != mc) {
376 continue;
377 }
378
379 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
380 mc_hist[i] += mg->mg_histogram[i];
381 }
382
383 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
384 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
385
386 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
387}
388
325/*
389/*
390 * Calculate the metaslab class's fragmentation metric. The metric
391 * is weighted based on the space contribution of each metaslab group.
392 * The return value will be a number between 0 and 100 (inclusive), or
393 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
394 * zfs_frag_table for more information about the metric.
395 */
396uint64_t
397metaslab_class_fragmentation(metaslab_class_t *mc)
398{
399 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
400 uint64_t fragmentation = 0;
401
402 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
403
404 for (int c = 0; c < rvd->vdev_children; c++) {
405 vdev_t *tvd = rvd->vdev_child[c];
406 metaslab_group_t *mg = tvd->vdev_mg;
407
408 /*
409 * Skip any holes, uninitialized top-levels, or
410 * vdevs that are not in this metaslab class.
411 */
412 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
413 mg->mg_class != mc) {
414 continue;
415 }
416
417 /*
418 * If a metaslab group does not contain a fragmentation
419 * metric then just bail out.
420 */
421 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
422 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
423 return (ZFS_FRAG_INVALID);
424 }
425
426 /*
427 * Determine how much this metaslab_group is contributing
428 * to the overall pool fragmentation metric.
429 */
430 fragmentation += mg->mg_fragmentation *
431 metaslab_group_get_space(mg);
432 }
433 fragmentation /= metaslab_class_get_space(mc);
434
435 ASSERT3U(fragmentation, <=, 100);
436 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
437 return (fragmentation);
438}
439
440/*
441 * Calculate the amount of expandable space that is available in
442 * this metaslab class. If a device is expanded then its expandable
443 * space will be the amount of allocatable space that is currently not
444 * part of this metaslab class.
445 */
446uint64_t
447metaslab_class_expandable_space(metaslab_class_t *mc)
448{
449 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
450 uint64_t space = 0;
451
452 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
453 for (int c = 0; c < rvd->vdev_children; c++) {
454 vdev_t *tvd = rvd->vdev_child[c];
455 metaslab_group_t *mg = tvd->vdev_mg;
456
457 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
458 mg->mg_class != mc) {
459 continue;
460 }
461
462 space += tvd->vdev_max_asize - tvd->vdev_asize;
463 }
464 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
465 return (space);
466}
467
468/*
326 * ==========================================================================
327 * Metaslab groups
328 * ==========================================================================
329 */
330static int
331metaslab_compare(const void *x1, const void *x2)
332{
333 const metaslab_t *m1 = x1;

--- 35 unchanged lines hidden (view full) ---

369 ASSERT(vd == vd->vdev_top);
370
371 mutex_enter(&mg->mg_lock);
372 was_allocatable = mg->mg_allocatable;
373
374 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
375 (vs->vs_space + 1);
376
469 * ==========================================================================
470 * Metaslab groups
471 * ==========================================================================
472 */
473static int
474metaslab_compare(const void *x1, const void *x2)
475{
476 const metaslab_t *m1 = x1;

--- 35 unchanged lines hidden (view full) ---

512 ASSERT(vd == vd->vdev_top);
513
514 mutex_enter(&mg->mg_lock);
515 was_allocatable = mg->mg_allocatable;
516
517 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
518 (vs->vs_space + 1);
519
377 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
520 /*
521 * A metaslab group is considered allocatable if it has plenty
522 * of free space or is not heavily fragmented. We only take
523 * fragmentation into account if the metaslab group has a valid
524 * fragmentation metric (i.e. a value between 0 and 100).
525 */
526 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
527 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
528 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
378
379 /*
380 * The mc_alloc_groups maintains a count of the number of
381 * groups in this metaslab class that are still above the
382 * zfs_mg_noalloc_threshold. This is used by the allocating
383 * threads to determine if they should avoid allocations to
384 * a given group. The allocator will avoid allocations to a group
385 * if that group has reached or is below the zfs_mg_noalloc_threshold

--- 4 unchanged lines hidden (view full) ---

390 * groups have reached the zfs_mg_noalloc_threshold making all groups
391 * eligible for allocations. This effectively means that all devices
392 * are balanced again.
393 */
394 if (was_allocatable && !mg->mg_allocatable)
395 mc->mc_alloc_groups--;
396 else if (!was_allocatable && mg->mg_allocatable)
397 mc->mc_alloc_groups++;
529
530 /*
531 * The mc_alloc_groups maintains a count of the number of
532 * groups in this metaslab class that are still above the
533 * zfs_mg_noalloc_threshold. This is used by the allocating
534 * threads to determine if they should avoid allocations to
535 * a given group. The allocator will avoid allocations to a group
536 * if that group has reached or is below the zfs_mg_noalloc_threshold

--- 4 unchanged lines hidden (view full) ---

541 * groups have reached the zfs_mg_noalloc_threshold making all groups
542 * eligible for allocations. This effectively means that all devices
543 * are balanced again.
544 */
545 if (was_allocatable && !mg->mg_allocatable)
546 mc->mc_alloc_groups--;
547 else if (!was_allocatable && mg->mg_allocatable)
548 mc->mc_alloc_groups++;
549
398 mutex_exit(&mg->mg_lock);
399}
400
401metaslab_group_t *
402metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
403{
404 metaslab_group_t *mg;
405

--- 74 unchanged lines hidden (view full) ---

480 ASSERT(mc->mc_rotor != mg);
481 ASSERT(mg->mg_prev == NULL);
482 ASSERT(mg->mg_next == NULL);
483 ASSERT(mg->mg_activation_count < 0);
484 return;
485 }
486
487 taskq_wait(mg->mg_taskq);
550 mutex_exit(&mg->mg_lock);
551}
552
553metaslab_group_t *
554metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
555{
556 metaslab_group_t *mg;
557

--- 74 unchanged lines hidden (view full) ---

632 ASSERT(mc->mc_rotor != mg);
633 ASSERT(mg->mg_prev == NULL);
634 ASSERT(mg->mg_next == NULL);
635 ASSERT(mg->mg_activation_count < 0);
636 return;
637 }
638
639 taskq_wait(mg->mg_taskq);
640 metaslab_group_alloc_update(mg);
488
489 mgprev = mg->mg_prev;
490 mgnext = mg->mg_next;
491
492 if (mg == mgnext) {
493 mc->mc_rotor = NULL;
494 } else {
495 mc->mc_rotor = mgnext;
496 mgprev->mg_next = mgnext;
497 mgnext->mg_prev = mgprev;
498 }
499
500 mg->mg_prev = NULL;
501 mg->mg_next = NULL;
502 metaslab_class_minblocksize_update(mc);
503}
504
641
642 mgprev = mg->mg_prev;
643 mgnext = mg->mg_next;
644
645 if (mg == mgnext) {
646 mc->mc_rotor = NULL;
647 } else {
648 mc->mc_rotor = mgnext;
649 mgprev->mg_next = mgnext;
650 mgnext->mg_prev = mgprev;
651 }
652
653 mg->mg_prev = NULL;
654 mg->mg_next = NULL;
655 metaslab_class_minblocksize_update(mc);
656}
657
658uint64_t
659metaslab_group_get_space(metaslab_group_t *mg)
660{
661 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
662}
663
664void
665metaslab_group_histogram_verify(metaslab_group_t *mg)
666{
667 uint64_t *mg_hist;
668 vdev_t *vd = mg->mg_vd;
669 uint64_t ashift = vd->vdev_ashift;
670 int i;
671
672 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
673 return;
674
675 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
676 KM_SLEEP);
677
678 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
679 SPACE_MAP_HISTOGRAM_SIZE + ashift);
680
681 for (int m = 0; m < vd->vdev_ms_count; m++) {
682 metaslab_t *msp = vd->vdev_ms[m];
683
684 if (msp->ms_sm == NULL)
685 continue;
686
687 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
688 mg_hist[i + ashift] +=
689 msp->ms_sm->sm_phys->smp_histogram[i];
690 }
691
692 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
693 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
694
695 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
696}
697
505static void
698static void
506metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
699metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
507{
700{
701 metaslab_class_t *mc = mg->mg_class;
702 uint64_t ashift = mg->mg_vd->vdev_ashift;
703
704 ASSERT(MUTEX_HELD(&msp->ms_lock));
705 if (msp->ms_sm == NULL)
706 return;
707
508 mutex_enter(&mg->mg_lock);
708 mutex_enter(&mg->mg_lock);
709 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
710 mg->mg_histogram[i + ashift] +=
711 msp->ms_sm->sm_phys->smp_histogram[i];
712 mc->mc_histogram[i + ashift] +=
713 msp->ms_sm->sm_phys->smp_histogram[i];
714 }
715 mutex_exit(&mg->mg_lock);
716}
717
718void
719metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
720{
721 metaslab_class_t *mc = mg->mg_class;
722 uint64_t ashift = mg->mg_vd->vdev_ashift;
723
724 ASSERT(MUTEX_HELD(&msp->ms_lock));
725 if (msp->ms_sm == NULL)
726 return;
727
728 mutex_enter(&mg->mg_lock);
729 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
730 ASSERT3U(mg->mg_histogram[i + ashift], >=,
731 msp->ms_sm->sm_phys->smp_histogram[i]);
732 ASSERT3U(mc->mc_histogram[i + ashift], >=,
733 msp->ms_sm->sm_phys->smp_histogram[i]);
734
735 mg->mg_histogram[i + ashift] -=
736 msp->ms_sm->sm_phys->smp_histogram[i];
737 mc->mc_histogram[i + ashift] -=
738 msp->ms_sm->sm_phys->smp_histogram[i];
739 }
740 mutex_exit(&mg->mg_lock);
741}
742
743static void
744metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
745{
509 ASSERT(msp->ms_group == NULL);
746 ASSERT(msp->ms_group == NULL);
747 mutex_enter(&mg->mg_lock);
510 msp->ms_group = mg;
511 msp->ms_weight = 0;
512 avl_add(&mg->mg_metaslab_tree, msp);
513 mutex_exit(&mg->mg_lock);
748 msp->ms_group = mg;
749 msp->ms_weight = 0;
750 avl_add(&mg->mg_metaslab_tree, msp);
751 mutex_exit(&mg->mg_lock);
752
753 mutex_enter(&msp->ms_lock);
754 metaslab_group_histogram_add(mg, msp);
755 mutex_exit(&msp->ms_lock);
514}
515
516static void
517metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
518{
756}
757
758static void
759metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
760{
761 mutex_enter(&msp->ms_lock);
762 metaslab_group_histogram_remove(mg, msp);
763 mutex_exit(&msp->ms_lock);
764
519 mutex_enter(&mg->mg_lock);
520 ASSERT(msp->ms_group == mg);
521 avl_remove(&mg->mg_metaslab_tree, msp);
522 msp->ms_group = NULL;
523 mutex_exit(&mg->mg_lock);
524}
525
526static void
527metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
528{
529 /*
530 * Although in principle the weight can be any value, in
765 mutex_enter(&mg->mg_lock);
766 ASSERT(msp->ms_group == mg);
767 avl_remove(&mg->mg_metaslab_tree, msp);
768 msp->ms_group = NULL;
769 mutex_exit(&mg->mg_lock);
770}
771
772static void
773metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
774{
775 /*
776 * Although in principle the weight can be any value, in
531 * practice we do not use values in the range [1, 510].
777 * practice we do not use values in the range [1, 511].
532 */
778 */
533 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
779 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
534 ASSERT(MUTEX_HELD(&msp->ms_lock));
535
536 mutex_enter(&mg->mg_lock);
537 ASSERT(msp->ms_group == mg);
538 avl_remove(&mg->mg_metaslab_tree, msp);
539 msp->ms_weight = weight;
540 avl_add(&mg->mg_metaslab_tree, msp);
541 mutex_exit(&mg->mg_lock);
542}
543
544/*
780 ASSERT(MUTEX_HELD(&msp->ms_lock));
781
782 mutex_enter(&mg->mg_lock);
783 ASSERT(msp->ms_group == mg);
784 avl_remove(&mg->mg_metaslab_tree, msp);
785 msp->ms_weight = weight;
786 avl_add(&mg->mg_metaslab_tree, msp);
787 mutex_exit(&mg->mg_lock);
788}
789
790/*
791 * Calculate the fragmentation for a given metaslab group. We can use
792 * a simple average here since all metaslabs within the group must have
793 * the same size. The return value will be a value between 0 and 100
794 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
795 * group have a fragmentation metric.
796 */
797uint64_t
798metaslab_group_fragmentation(metaslab_group_t *mg)
799{
800 vdev_t *vd = mg->mg_vd;
801 uint64_t fragmentation = 0;
802 uint64_t valid_ms = 0;
803
804 for (int m = 0; m < vd->vdev_ms_count; m++) {
805 metaslab_t *msp = vd->vdev_ms[m];
806
807 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
808 continue;
809
810 valid_ms++;
811 fragmentation += msp->ms_fragmentation;
812 }
813
814 if (valid_ms <= vd->vdev_ms_count / 2)
815 return (ZFS_FRAG_INVALID);
816
817 fragmentation /= valid_ms;
818 ASSERT3U(fragmentation, <=, 100);
819 return (fragmentation);
820}
821
822/*
545 * Determine if a given metaslab group should skip allocations. A metaslab
823 * Determine if a given metaslab group should skip allocations. A metaslab
546 * group should avoid allocations if its used capacity has crossed the
547 * zfs_mg_noalloc_threshold and there is at least one metaslab group
824 * group should avoid allocations if its free capacity is less than the
825 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
826 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
548 * that can still handle allocations.
549 */
550static boolean_t
551metaslab_group_allocatable(metaslab_group_t *mg)
552{
553 vdev_t *vd = mg->mg_vd;
554 spa_t *spa = vd->vdev_spa;
555 metaslab_class_t *mc = mg->mg_class;
556
557 /*
827 * that can still handle allocations.
828 */
829static boolean_t
830metaslab_group_allocatable(metaslab_group_t *mg)
831{
832 vdev_t *vd = mg->mg_vd;
833 spa_t *spa = vd->vdev_spa;
834 metaslab_class_t *mc = mg->mg_class;
835
836 /*
558 * A metaslab group is considered allocatable if its free capacity
559 * is greater than the set value of zfs_mg_noalloc_threshold, it's
560 * associated with a slog, or there are no other metaslab groups
561 * with free capacity greater than zfs_mg_noalloc_threshold.
837 * We use two key metrics to determine if a metaslab group is
838 * considered allocatable -- free space and fragmentation. If
839 * the free space is greater than the free space threshold and
840 * the fragmentation is less than the fragmentation threshold then
841 * consider the group allocatable. There are two cases when we will
842 * not consider these key metrics. The first is if the group is
843 * associated with a slog device and the second is if all groups
844 * in this metaslab class have already been considered ineligible
845 * for allocations.
562 */
846 */
563 return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
847 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
848 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
849 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) ||
564 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
565}
566
567/*
568 * ==========================================================================
569 * Range tree callbacks
570 * ==========================================================================
571 */

--- 207 unchanged lines hidden (view full) ---

779 */
780 uint64_t align = size & -size;
781 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
782 avl_tree_t *t = &msp->ms_tree->rt_root;
783
784 return (metaslab_block_picker(t, cursor, size, align));
785}
786
850 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
851}
852
853/*
854 * ==========================================================================
855 * Range tree callbacks
856 * ==========================================================================
857 */

--- 207 unchanged lines hidden (view full) ---

1065 */
1066 uint64_t align = size & -size;
1067 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1068 avl_tree_t *t = &msp->ms_tree->rt_root;
1069
1070 return (metaslab_block_picker(t, cursor, size, align));
1071}
1072
787/* ARGSUSED */
788static boolean_t
789metaslab_ff_fragmented(metaslab_t *msp)
790{
791 return (B_TRUE);
792}
793
794static metaslab_ops_t metaslab_ff_ops = {
1073static metaslab_ops_t metaslab_ff_ops = {
795 metaslab_ff_alloc,
796 metaslab_ff_fragmented
1074 metaslab_ff_alloc
797};
798
799/*
800 * ==========================================================================
801 * Dynamic block allocator -
802 * Uses the first fit allocation scheme until space gets low and then
803 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
804 * and metaslab_df_free_pct to determine when to switch the allocation scheme.

--- 30 unchanged lines hidden (view full) ---

835 free_pct < metaslab_df_free_pct) {
836 t = &msp->ms_size_tree;
837 *cursor = 0;
838 }
839
840 return (metaslab_block_picker(t, cursor, size, 1ULL));
841}
842
1075};
1076
1077/*
1078 * ==========================================================================
1079 * Dynamic block allocator -
1080 * Uses the first fit allocation scheme until space gets low and then
1081 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1082 * and metaslab_df_free_pct to determine when to switch the allocation scheme.

--- 30 unchanged lines hidden (view full) ---

1113 free_pct < metaslab_df_free_pct) {
1114 t = &msp->ms_size_tree;
1115 *cursor = 0;
1116 }
1117
1118 return (metaslab_block_picker(t, cursor, size, 1ULL));
1119}
1120
843static boolean_t
844metaslab_df_fragmented(metaslab_t *msp)
845{
846 range_tree_t *rt = msp->ms_tree;
847 uint64_t max_size = metaslab_block_maxsize(msp);
848 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
849
850 if (max_size >= metaslab_df_alloc_threshold &&
851 free_pct >= metaslab_df_free_pct)
852 return (B_FALSE);
853
854 return (B_TRUE);
855}
856
857static metaslab_ops_t metaslab_df_ops = {
1121static metaslab_ops_t metaslab_df_ops = {
858 metaslab_df_alloc,
859 metaslab_df_fragmented
1122 metaslab_df_alloc
860};
861
862/*
863 * ==========================================================================
864 * Cursor fit block allocator -
865 * Select the largest region in the metaslab, set the cursor to the beginning
866 * of the range and the cursor_end to the end of the range. As allocations
867 * are made advance the cursor. Continue allocating from the cursor until

--- 26 unchanged lines hidden (view full) ---

894 }
895
896 offset = *cursor;
897 *cursor += size;
898
899 return (offset);
900}
901
1123};
1124
1125/*
1126 * ==========================================================================
1127 * Cursor fit block allocator -
1128 * Select the largest region in the metaslab, set the cursor to the beginning
1129 * of the range and the cursor_end to the end of the range. As allocations
1130 * are made advance the cursor. Continue allocating from the cursor until

--- 26 unchanged lines hidden (view full) ---

1157 }
1158
1159 offset = *cursor;
1160 *cursor += size;
1161
1162 return (offset);
1163}
1164
902static boolean_t
903metaslab_cf_fragmented(metaslab_t *msp)
904{
905 return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
906}
907
908static metaslab_ops_t metaslab_cf_ops = {
1165static metaslab_ops_t metaslab_cf_ops = {
909 metaslab_cf_alloc,
910 metaslab_cf_fragmented
1166 metaslab_cf_alloc
911};
912
913/*
914 * ==========================================================================
915 * New dynamic fit allocator -
916 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
917 * contiguous blocks. If no region is found then just use the largest segment
918 * that remains.

--- 40 unchanged lines hidden (view full) ---

959
960 if ((rs->rs_end - rs->rs_start) >= size) {
961 *cursor = rs->rs_start + size;
962 return (rs->rs_start);
963 }
964 return (-1ULL);
965}
966
1167};
1168
1169/*
1170 * ==========================================================================
1171 * New dynamic fit allocator -
1172 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1173 * contiguous blocks. If no region is found then just use the largest segment
1174 * that remains.

--- 40 unchanged lines hidden (view full) ---

1215
1216 if ((rs->rs_end - rs->rs_start) >= size) {
1217 *cursor = rs->rs_start + size;
1218 return (rs->rs_start);
1219 }
1220 return (-1ULL);
1221}
1222
967static boolean_t
968metaslab_ndf_fragmented(metaslab_t *msp)
969{
970 return (metaslab_block_maxsize(msp) <=
971 (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
972}
973
974static metaslab_ops_t metaslab_ndf_ops = {
1223static metaslab_ops_t metaslab_ndf_ops = {
975 metaslab_ndf_alloc,
976 metaslab_ndf_fragmented
1224 metaslab_ndf_alloc
977};
978
979metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
980
981/*
982 * ==========================================================================
983 * Metaslabs
984 * ==========================================================================

--- 85 unchanged lines hidden (view full) ---

1070 * alloctree and freetree until metaslab_sync_done(). This serves
1071 * two purposes: it allows metaslab_sync_done() to detect the
1072 * addition of new space; and for debugging, it ensures that we'd
1073 * data fault on any attempt to use this metaslab before it's ready.
1074 */
1075 msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
1076 metaslab_group_add(mg, msp);
1077
1225};
1226
1227metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1228
1229/*
1230 * ==========================================================================
1231 * Metaslabs
1232 * ==========================================================================

--- 85 unchanged lines hidden (view full) ---

1318 * alloctree and freetree until metaslab_sync_done(). This serves
1319 * two purposes: it allows metaslab_sync_done() to detect the
1320 * addition of new space; and for debugging, it ensures that we'd
1321 * data fault on any attempt to use this metaslab before it's ready.
1322 */
1323 msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
1324 metaslab_group_add(mg, msp);
1325
1326 msp->ms_fragmentation = metaslab_fragmentation(msp);
1078 msp->ms_ops = mg->mg_class->mc_ops;
1079
1080 /*
1081 * If we're opening an existing pool (txg == 0) or creating
1082 * a new one (txg == TXG_INITIAL), all space is available now.
1083 * If we're adding space to an existing pool, the new space
1084 * does not become available until after this txg has synced.
1085 */

--- 49 unchanged lines hidden (view full) ---

1135
1136 mutex_exit(&msp->ms_lock);
1137 cv_destroy(&msp->ms_load_cv);
1138 mutex_destroy(&msp->ms_lock);
1139
1140 kmem_free(msp, sizeof (metaslab_t));
1141}
1142
1327 msp->ms_ops = mg->mg_class->mc_ops;
1328
1329 /*
1330 * If we're opening an existing pool (txg == 0) or creating
1331 * a new one (txg == TXG_INITIAL), all space is available now.
1332 * If we're adding space to an existing pool, the new space
1333 * does not become available until after this txg has synced.
1334 */

--- 49 unchanged lines hidden (view full) ---

1384
1385 mutex_exit(&msp->ms_lock);
1386 cv_destroy(&msp->ms_load_cv);
1387 mutex_destroy(&msp->ms_lock);
1388
1389 kmem_free(msp, sizeof (metaslab_t));
1390}
1391
1392#define FRAGMENTATION_TABLE_SIZE 17
1393
1143/*
1394/*
1144 * Apply a weighting factor based on the histogram information for this
1145 * metaslab. The current weighting factor is somewhat arbitrary and requires
1146 * additional investigation. The implementation provides a measure of
1147 * "weighted" free space and gives a higher weighting for larger contiguous
1148 * regions. The weighting factor is determined by counting the number of
1149 * sm_shift sectors that exist in each region represented by the histogram.
1150 * That value is then multiplied by the power of 2 exponent and the sm_shift
1151 * value.
1395 * This table defines a segment size based fragmentation metric that will
1396 * allow each metaslab to derive its own fragmentation value. This is done
1397 * by calculating the space in each bucket of the spacemap histogram and
1398 * multiplying that by the fragmentation metric in this table. Doing
1399 * this for all buckets and dividing it by the total amount of free
1400 * space in this metaslab (i.e. the total free space in all buckets) gives
1401 * us the fragmentation metric. This means that a high fragmentation metric
1402 * equates to most of the free space being comprised of small segments.
1403 * Conversely, if the metric is low, then most of the free space is in
1404 * large segments. A 10% change in fragmentation equates to approximately
1405 * double the number of segments.
1152 *
1406 *
1153 * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1154 * metaslab has an sm_shift value of 9 (512B):
1155 *
1156 * 1) calculate the number of sm_shift sectors in the region:
1157 * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1158 * 2) multiply by the power of 2 exponent and the sm_shift value:
1159 * 16384 * 21 * 9 = 3096576
1160 * This value will be added to the weighting of the metaslab.
1407 * This table defines 0% fragmented space using 16MB segments. Testing has
1408 * shown that segments that are greater than or equal to 16MB do not suffer
1409 * from drastic performance problems. Using this value, we derive the rest
1410 * of the table. Since the fragmentation value is never stored on disk, it
1411 * is possible to change these calculations in the future.
1161 */
1412 */
1413int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1414 100, /* 512B */
1415 100, /* 1K */
1416 98, /* 2K */
1417 95, /* 4K */
1418 90, /* 8K */
1419 80, /* 16K */
1420 70, /* 32K */
1421 60, /* 64K */
1422 50, /* 128K */
1423 40, /* 256K */
1424 30, /* 512K */
1425 20, /* 1M */
1426 15, /* 2M */
1427 10, /* 4M */
1428 5, /* 8M */
1429 0 /* 16M */
1430};
1431
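As a worked illustration of how the table drives the calculation, here is a self-contained sketch with hypothetical inputs (the real routine, metaslab_fragmentation() below, reads the metaslab's space map histogram; this follows the same arithmetic):

static uint64_t
frag_example(const uint64_t *histogram, int nbuckets, uint8_t shift)
{
	uint64_t fragmentation = 0, total = 0;

	for (int i = 0; i < nbuckets; i++) {
		/* Bytes represented by bucket i: segment count << size. */
		uint64_t space = histogram[i] << (i + shift);
		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
		    FRAGMENTATION_TABLE_SIZE - 1);

		if (space == 0)
			continue;
		total += space;
		fragmentation += space * zfs_frag_table[idx];
	}

	/*
	 * Example with shift = 9: 100 4K segments (bucket 3, weight 95)
	 * plus one 1M segment (bucket 11, weight 20) gives
	 * (409600 * 95 + 1048576 * 20) / 1458176 ~= 41, i.e. a single
	 * large segment pulls the metric well below the 95 that a
	 * purely 4K-fragmented metaslab would report.
	 */
	return (total > 0 ? fragmentation / total : 0);
}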
1432/*
1433 * Calculate the metaslab's fragmentation metric. A return value
1434 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1435 * not support this metric. Otherwise, the return value should be in the
1436 * range [0, 100].
1437 */
1162static uint64_t
1438static uint64_t
1163metaslab_weight_factor(metaslab_t *msp)
1439metaslab_fragmentation(metaslab_t *msp)
1164{
1440{
1165 uint64_t factor = 0;
1166 uint64_t sectors;
1167 int i;
1441 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1442 uint64_t fragmentation = 0;
1443 uint64_t total = 0;
1444 boolean_t feature_enabled = spa_feature_is_enabled(spa,
1445 SPA_FEATURE_SPACEMAP_HISTOGRAM);
1168
1446
1447 if (!feature_enabled)
1448 return (ZFS_FRAG_INVALID);
1449
1169 /*
1450 /*
1170 * A null space map means that the entire metaslab is free,
1171 * calculate a weight factor that spans the entire size of the
1172 * metaslab.
1451 * A null space map means that the entire metaslab is free
1452 * and thus is not fragmented.
1173 */
1453 */
1174 if (msp->ms_sm == NULL) {
1454 if (msp->ms_sm == NULL)
1455 return (0);
1456
1457 /*
1458 * If this metaslab's space_map has not been upgraded, flag it
1459 * so that we upgrade next time we encounter it.
1460 */
1461 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1462 uint64_t txg = spa_syncing_txg(spa);
1175 vdev_t *vd = msp->ms_group->mg_vd;
1176
1463 vdev_t *vd = msp->ms_group->mg_vd;
1464
1177 i = highbit64(msp->ms_size) - 1;
1178 sectors = msp->ms_size >> vd->vdev_ashift;
1179 return (sectors * i * vd->vdev_ashift);
1465 msp->ms_condense_wanted = B_TRUE;
1466 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1467 spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1468 "msp %p, vd %p", txg, msp, vd);
1469 return (ZFS_FRAG_INVALID);
1180 }
1181
1470 }
1471
1182 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1183 return (0);
1472 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1473 uint64_t space = 0;
1474 uint8_t shift = msp->ms_sm->sm_shift;
1475 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1476 FRAGMENTATION_TABLE_SIZE - 1);
1184
1477
1185 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1186 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1187 continue;
1188
1478 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1479 continue;
1480
1189 /*
1190 * Determine the number of sm_shift sectors in the region
1191 * indicated by the histogram. For example, given an
1192 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1193 * that we're looking at an 8K region in the histogram
1194 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1195 * number of sm_shift sectors (512 bytes in this example),
1196 * we would take 8192 / 512 = 16. Since the histogram
1197 * is offset by sm_shift we can simply use the value of
1198 * of i to calculate this (i.e. 2^i = 16 where i = 4).
1199 */
1200 sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1201 factor += (i + msp->ms_sm->sm_shift) * sectors;
1481 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1482 total += space;
1483
1484 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1485 fragmentation += space * zfs_frag_table[idx];
1202 }
1486 }
1203 return (factor * msp->ms_sm->sm_shift);
1487
1488 if (total > 0)
1489 fragmentation /= total;
1490 ASSERT3U(fragmentation, <=, 100);
1491 return (fragmentation);
1204}
1205
1492}
1493
1494/*
1495 * Compute a weight -- a selection preference value -- for the given metaslab.
1496 * This is based on the amount of free space, the level of fragmentation,
1497 * the LBA range, and whether the metaslab is loaded.
1498 */
1206static uint64_t
1207metaslab_weight(metaslab_t *msp)
1208{
1209 metaslab_group_t *mg = msp->ms_group;
1210 vdev_t *vd = mg->mg_vd;
1211 uint64_t weight, space;
1212
1213 ASSERT(MUTEX_HELD(&msp->ms_lock));

--- 7 unchanged lines hidden (view full) ---

1221 ASSERT0(vd->vdev_ms_shift);
1222 return (0);
1223 }
1224
1225 /*
1226 * The baseline weight is the metaslab's free space.
1227 */
1228 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1499static uint64_t
1500metaslab_weight(metaslab_t *msp)
1501{
1502 metaslab_group_t *mg = msp->ms_group;
1503 vdev_t *vd = mg->mg_vd;
1504 uint64_t weight, space;
1505
1506 ASSERT(MUTEX_HELD(&msp->ms_lock));

--- 7 unchanged lines hidden (view full) ---

1514 ASSERT0(vd->vdev_ms_shift);
1515 return (0);
1516 }
1517
1518 /*
1519 * The baseline weight is the metaslab's free space.
1520 */
1521 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1522
1523 msp->ms_fragmentation = metaslab_fragmentation(msp);
1524 if (metaslab_fragmentation_factor_enabled &&
1525 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1526 /*
1527 * Use the fragmentation information to inversely scale
1528 * down the baseline weight. We need to ensure that we
1529 * don't exclude this metaslab completely when it's 100%
1530 * fragmented. To avoid this we reduce the fragmented value
1531 * by 1.
1532 */
1533 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1534
1535 /*
1536 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1537 * this metaslab again. The fragmentation metric may have
1538 * decreased the space to something smaller than
1539 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1540 * so that we can consume any remaining space.
1541 */
1542 if (space > 0 && space < SPA_MINBLOCKSIZE)
1543 space = SPA_MINBLOCKSIZE;
1544 }
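A quick numeric illustration of the scaling above (hypothetical figures): with 10 GB of free space and a fragmentation metric of 70, the baseline becomes 10 GB * (100 - 69) / 100, roughly 3.1 GB; at a metric of 100 the "minus 1" adjustment still leaves 1% of the free space, so fragmentation alone does not normally zero out the weight.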
1229 weight = space;
1230
1231 /*
1232 * Modern disks have uniform bit density and constant angular velocity.
1233 * Therefore, the outer recording zones are faster (higher bandwidth)
1234 * than the inner zones by the ratio of outer to inner track diameter,
1235 * which is typically around 2:1. We account for this by assigning
1236 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1237 * In effect, this means that we'll select the metaslab with the most
1238 * free bandwidth rather than simply the one with the most free space.
1239 */
1545 weight = space;
1546
1547 /*
1548 * Modern disks have uniform bit density and constant angular velocity.
1549 * Therefore, the outer recording zones are faster (higher bandwidth)
1550 * than the inner zones by the ratio of outer to inner track diameter,
1551 * which is typically around 2:1. We account for this by assigning
1552 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1553 * In effect, this means that we'll select the metaslab with the most
1554 * free bandwidth rather than simply the one with the most free space.
1555 */
1240 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1241 ASSERT(weight >= space && weight <= 2 * space);
1556 if (metaslab_lba_weighting_enabled) {
1557 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1558 ASSERT(weight >= space && weight <= 2 * space);
1559 }
1242
1560
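As a quick check of the formula above with hypothetical numbers: on a vdev with 200 metaslabs, metaslab 0 is weighted at 2x its free space, metaslab 100 at about 1.5x, and metaslab 199 at just over 1x, matching the stated 2:1 outer-to-inner gradient.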
1243 msp->ms_factor = metaslab_weight_factor(msp);
1244 if (metaslab_weight_factor_enable)
1245 weight += msp->ms_factor;
1246
1247 if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
1248 /*
1249 * If this metaslab is one we're actively using, adjust its
1250 * weight to make it preferable to any inactive metaslab so
1251 * we'll polish it off.
1252 */
1561 /*
1562 * If this metaslab is one we're actively using, adjust its
1563 * weight to make it preferable to any inactive metaslab so
1564 * we'll polish it off. If the fragmentation on this metaslab
1565 * has exceeded our threshold, then don't mark it active.
1566 */
1567 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1568 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1253 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1254 }
1255
1256 return (weight);
1257}
1258
1259static int
1260metaslab_activate(metaslab_t *msp, uint64_t activation_weight)

--- 68 unchanged lines hidden (view full) ---

1329 mutex_enter(&mg->mg_lock);
1330 /*
1331 * Load the next potential metaslabs
1332 */
1333 msp = avl_first(t);
1334 while (msp != NULL) {
1335 metaslab_t *msp_next = AVL_NEXT(t, msp);
1336
1569 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1570 }
1571
1572 return (weight);
1573}
1574
1575static int
1576metaslab_activate(metaslab_t *msp, uint64_t activation_weight)

--- 68 unchanged lines hidden (view full) ---

1645 mutex_enter(&mg->mg_lock);
1646 /*
1647 * Load the next potential metaslabs
1648 */
1649 msp = avl_first(t);
1650 while (msp != NULL) {
1651 metaslab_t *msp_next = AVL_NEXT(t, msp);
1652
1337 /* If we have reached our preload limit then we're done */
1338 if (++m > metaslab_preload_limit)
1339 break;
1653 /*
1654 * We preload only the maximum number of metaslabs specified
1655 * by metaslab_preload_limit. If a metaslab is being forced
1656 * to condense then we preload it too. This will ensure
1657 * that force condensing happens in the next txg.
1658 */
1659 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1660 msp = msp_next;
1661 continue;
1662 }
1340
1341 /*
1342 * We must drop the metaslab group lock here to preserve
1343 * lock ordering with the ms_lock (when grabbing both
1344 * the mg_lock and the ms_lock, the ms_lock must be taken
1345 * first). As a result, it is possible that the ordering
1346 * of the metaslabs within the avl tree may change before
1347 * we reacquire the lock. The metaslab cannot be removed from

--- 51 unchanged lines hidden (view full) ---

1399 dmu_object_info_t doi;
1400 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1401
1402 ASSERT(MUTEX_HELD(&msp->ms_lock));
1403 ASSERT(msp->ms_loaded);
1404
1405 /*
1406 * Use the ms_size_tree range tree, which is ordered by size, to
1663
1664 /*
1665 * We must drop the metaslab group lock here to preserve
1666 * lock ordering with the ms_lock (when grabbing both
1667 * the mg_lock and the ms_lock, the ms_lock must be taken
1668 * first). As a result, it is possible that the ordering
1669 * of the metaslabs within the avl tree may change before
1670 * we reacquire the lock. The metaslab cannot be removed from

--- 51 unchanged lines hidden (view full) ---

1722 dmu_object_info_t doi;
1723 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1724
1725 ASSERT(MUTEX_HELD(&msp->ms_lock));
1726 ASSERT(msp->ms_loaded);
1727
1728 /*
1729 * Use the ms_size_tree range tree, which is ordered by size, to
1407 * obtain the largest segment in the free tree. If the tree is empty
1408 * then we should condense the map.
1730 * obtain the largest segment in the free tree. We always condense
1731 * metaslabs that are empty and metaslabs for which a condense
1732 * request has been made.
1409 */
1410 rs = avl_last(&msp->ms_size_tree);
1733 */
1734 rs = avl_last(&msp->ms_size_tree);
1411 if (rs == NULL)
1735 if (rs == NULL || msp->ms_condense_wanted)
1412 return (B_TRUE);
1413
1414 /*
1415 * Calculate the number of 64-bit entries this segment would
1416 * require when written to disk. If this single segment would be
1417 * larger on-disk than the entire current on-disk structure, then
1418 * clearly condensing will increase the on-disk structure size.
1419 */

--- 24 unchanged lines hidden (view full) ---

1444 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1445 range_tree_t *condense_tree;
1446 space_map_t *sm = msp->ms_sm;
1447
1448 ASSERT(MUTEX_HELD(&msp->ms_lock));
1449 ASSERT3U(spa_sync_pass(spa), ==, 1);
1450 ASSERT(msp->ms_loaded);
1451
1736 return (B_TRUE);
1737
1738 /*
1739 * Calculate the number of 64-bit entries this segment would
1740 * require when written to disk. If this single segment would be
1741 * larger on-disk than the entire current on-disk structure, then
1742 * clearly condensing will increase the on-disk structure size.
1743 */

--- 24 unchanged lines hidden (view full) ---

1768 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1769 range_tree_t *condense_tree;
1770 space_map_t *sm = msp->ms_sm;
1771
1772 ASSERT(MUTEX_HELD(&msp->ms_lock));
1773 ASSERT3U(spa_sync_pass(spa), ==, 1);
1774 ASSERT(msp->ms_loaded);
1775
1776
1452 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1777 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1453 "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1454 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1778 "smp size %llu, segments %lu, forcing condense=%s", txg,
1779 msp->ms_id, msp, space_map_length(msp->ms_sm),
1780 avl_numnodes(&msp->ms_tree->rt_root),
1781 msp->ms_condense_wanted ? "TRUE" : "FALSE");
1455
1782
1783 msp->ms_condense_wanted = B_FALSE;
1784
1456 /*
1457 * Create a range tree that is 100% allocated. We remove segments
1458 * that have been freed in this txg, any deferred frees that exist,
1459 * and any allocation in the future. Removing segments should be
1460 * a relatively inexpensive operation since we expect these trees to
1461 * have a small number of nodes.
1462 */
1463 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);

--- 75 unchanged lines hidden (view full) ---

1539 ASSERT3P(alloctree, ==, NULL);
1540 return;
1541 }
1542
1543 ASSERT3P(alloctree, !=, NULL);
1544 ASSERT3P(*freetree, !=, NULL);
1545 ASSERT3P(*freed_tree, !=, NULL);
1546
1785 /*
1786 * Create a range tree that is 100% allocated. We remove segments
1787 * that have been freed in this txg, any deferred frees that exist,
1788 * and any allocation in the future. Removing segments should be
1789 * a relatively inexpensive operation since we expect these trees to
1790 * have a small number of nodes.
1791 */
1792 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);

--- 75 unchanged lines hidden (view full) ---

1868 ASSERT3P(alloctree, ==, NULL);
1869 return;
1870 }
1871
1872 ASSERT3P(alloctree, !=, NULL);
1873 ASSERT3P(*freetree, !=, NULL);
1874 ASSERT3P(*freed_tree, !=, NULL);
1875
1876 /*
1877 * Normally, we don't want to process a metaslab if there
1878 * are no allocations or frees to perform. However, if the metaslab
1879 * is being forced to condense we need to let it through.
1880 */
1547 if (range_tree_space(alloctree) == 0 &&
1881 if (range_tree_space(alloctree) == 0 &&
1548 range_tree_space(*freetree) == 0)
1882 range_tree_space(*freetree) == 0 &&
1883 !msp->ms_condense_wanted)
1549 return;
1550
1551 /*
1552 * The only state that can actually be changing concurrently with
1553 * metaslab_sync() is the metaslab's ms_tree. No other thread can
1554 * be modifying this txg's alloctree, freetree, freed_tree, or
1555 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1556 * space_map ASSERTs. We drop it whenever we call into the DMU,

--- 20 unchanged lines hidden (view full) ---

1577 if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1578 metaslab_should_condense(msp)) {
1579 metaslab_condense(msp, txg, tx);
1580 } else {
1581 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1582 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1583 }
1584
1884 return;
1885
1886 /*
1887 * The only state that can actually be changing concurrently with
1888 * metaslab_sync() is the metaslab's ms_tree. No other thread can
1889 * be modifying this txg's alloctree, freetree, freed_tree, or
1890 * space_map_phys_t. Therefore, we only hold ms_lock to satify
1890 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy

--- 20 unchanged lines hidden (view full) ---

1912 if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1913 metaslab_should_condense(msp)) {
1914 metaslab_condense(msp, txg, tx);
1915 } else {
1916 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1917 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1918 }
1919
1585 range_tree_vacate(alloctree, NULL, NULL);
1586
1920 metaslab_group_histogram_verify(mg);
1921 metaslab_class_histogram_verify(mg->mg_class);
1922 metaslab_group_histogram_remove(mg, msp);
1587 if (msp->ms_loaded) {
1588 /*
1589 * When the space map is loaded, we have an accurate
1590 * histogram in the range tree. This gives us an opportunity
1591 * to bring the space map's histogram up-to-date so we clear
1592 * it first before updating it.
1593 */
1594 space_map_histogram_clear(msp->ms_sm);
1595 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1596 } else {
1597 /*
1598 * Since the space map is not loaded we simply update the
1599 * existing histogram with what was freed in this txg. This
1600 * means that the on-disk histogram may not have an accurate
1601 * view of the free space but it's close enough to allow
1602 * us to make allocation decisions.
1603 */
1604 space_map_histogram_add(msp->ms_sm, *freetree, tx);
1605 }
1923 if (msp->ms_loaded) {
1924 /*
1925 * When the space map is loaded, we have an accurate
1926 * histogram in the range tree. This gives us an opportunity
1927 * to bring the space map's histogram up-to-date so we clear
1928 * it first before updating it.
1929 */
1930 space_map_histogram_clear(msp->ms_sm);
1931 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1932 } else {
1933 /*
1934 * Since the space map is not loaded we simply update the
1935 * existing histogram with what was freed in this txg. This
1936 * means that the on-disk histogram may not have an accurate
1937 * view of the free space but it's close enough to allow
1938 * us to make allocation decisions.
1939 */
1940 space_map_histogram_add(msp->ms_sm, *freetree, tx);
1941 }
1942 metaslab_group_histogram_add(mg, msp);
1943 metaslab_group_histogram_verify(mg);
1944 metaslab_class_histogram_verify(mg->mg_class);
1606
1607 /*
1608 * For sync pass 1, we avoid traversing this txg's free range tree
1609 * and instead will just swap the pointers for freetree and
1610 * freed_tree. We can safely do this since the freed_tree is
1611 * guaranteed to be empty on the initial pass.
1612 */
1613 if (spa_sync_pass(spa) == 1) {
1614 range_tree_swap(freetree, freed_tree);
1615 } else {
1616 range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1617 }
1945
1946 /*
1947 * For sync pass 1, we avoid traversing this txg's free range tree
1948 * and instead will just swap the pointers for freetree and
1949 * freed_tree. We can safely do this since the freed_tree is
1950 * guaranteed to be empty on the initial pass.
1951 */
1952 if (spa_sync_pass(spa) == 1) {
1953 range_tree_swap(freetree, freed_tree);
1954 } else {
1955 range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1956 }
1957 range_tree_vacate(alloctree, NULL, NULL);
1618
1619 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1620 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1621
1622 mutex_exit(&msp->ms_lock);
1623
1624 if (object != space_map_object(msp->ms_sm)) {
1625 object = space_map_object(msp->ms_sm);

--- 94 unchanged lines hidden (view full) ---

1720 }
1721
1722 if (!metaslab_debug_unload)
1723 metaslab_unload(msp);
1724 }
1725
1726 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1727 mutex_exit(&msp->ms_lock);
1958
1959 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1960 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1961
1962 mutex_exit(&msp->ms_lock);
1963
1964 if (object != space_map_object(msp->ms_sm)) {
1965 object = space_map_object(msp->ms_sm);

--- 94 unchanged lines hidden (view full) ---

2060 }
2061
2062 if (!metaslab_debug_unload)
2063 metaslab_unload(msp);
2064 }
2065
2066 metaslab_group_sort(mg, msp, metaslab_weight(msp));
2067 mutex_exit(&msp->ms_lock);
1728
1729}
1730
1731void
1732metaslab_sync_reassess(metaslab_group_t *mg)
1733{
1734 metaslab_group_alloc_update(mg);
2068}
2069
2070void
2071metaslab_sync_reassess(metaslab_group_t *mg)
2072{
2073 metaslab_group_alloc_update(mg);
2074 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
1735
1736 /*
1737 * Preload the next potential metaslabs
1738 */
1739 metaslab_group_preload(mg);
1740}
1741
1742static uint64_t

--- 245 unchanged lines hidden (view full) ---

1988 goto next;
1989
1990 /*
1991 * Avoid writing single-copy data to a failing vdev
1992 * unless the user instructs us that it is okay.
1993 */
1994 if ((vd->vdev_stat.vs_write_errors > 0 ||
1995 vd->vdev_state < VDEV_STATE_HEALTHY) &&
2075
2076 /*
2077 * Preload the next potential metaslabs
2078 */
2079 metaslab_group_preload(mg);
2080}
2081
2082static uint64_t

--- 245 unchanged lines hidden (view full) ---

2328 goto next;
2329
2330 /*
2331 * Avoid writing single-copy data to a failing vdev
2332 * unless the user instructs us that it is okay.
2333 */
2334 if ((vd->vdev_stat.vs_write_errors > 0 ||
2335 vd->vdev_state < VDEV_STATE_HEALTHY) &&
1996 d == 0 && dshift == 3 &&
1997 !(zfs_write_to_degraded && vd->vdev_state ==
1998 VDEV_STATE_DEGRADED)) {
2336 d == 0 && dshift == 3 && vd->vdev_children == 0) {
1999 all_zero = B_FALSE;
2000 goto next;
2001 }
2002
2003 ASSERT(mg->mg_class == mc);
2004
2005 distance = vd->vdev_asize >> dshift;
2006 if (distance <= (1ULL << vd->vdev_ms_shift))

--- 8 unchanged lines hidden (view full) ---

2015 dva, d);
2016 if (offset != -1ULL) {
2017 /*
2018 * If we've just selected this metaslab group,
2019 * figure out whether the corresponding vdev is
2020 * over- or under-used relative to the pool,
2021 * and set an allocation bias to even it out.
2022 */
2337 all_zero = B_FALSE;
2338 goto next;
2339 }
2340
2341 ASSERT(mg->mg_class == mc);
2342
2343 distance = vd->vdev_asize >> dshift;
2344 if (distance <= (1ULL << vd->vdev_ms_shift))

--- 8 unchanged lines hidden (view full) ---

2353 dva, d);
2354 if (offset != -1ULL) {
2355 /*
2356 * If we've just selected this metaslab group,
2357 * figure out whether the corresponding vdev is
2358 * over- or under-used relative to the pool,
2359 * and set an allocation bias to even it out.
2360 */
2023 if (mc->mc_aliquot == 0) {
2361 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
2024 vdev_stat_t *vs = &vd->vdev_stat;
2025 int64_t vu, cu;
2026
2027 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
2028 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
2029
2030 /*
2031 * Calculate how much more or less we should

--- 5 unchanged lines hidden (view full) ---

2037 *
2038 * mg_bias = (20 - 80) * 512K / 100 = -307K
2039 *
2040 * This reduces allocations by 307K for this
2041 * iteration.
2042 */
2043 mg->mg_bias = ((cu - vu) *
2044 (int64_t)mg->mg_aliquot) / 100;
2362 vdev_stat_t *vs = &vd->vdev_stat;
2363 int64_t vu, cu;
2364
2365 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
2366 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
2367
2368 /*
2369 * Calculate how much more or less we should

--- 5 unchanged lines hidden (view full) ---

2375 *
2376 * mg_bias = (20 - 80) * 512K / 100 = -307K
2377 *
2378 * This reduces allocations by 307K for this
2379 * iteration.
2380 */
2381 mg->mg_bias = ((cu - vu) *
2382 (int64_t)mg->mg_aliquot) / 100;
2383 } else if (!metaslab_bias_enabled) {
2384 mg->mg_bias = 0;
2045 }
2046
2047 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
2048 mg->mg_aliquot + mg->mg_bias) {
2049 mc->mc_rotor = mg->mg_next;
2050 mc->mc_aliquot = 0;
2051 }
2052

--- 257 unchanged lines hidden ---
2385 }
2386
2387 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
2388 mg->mg_aliquot + mg->mg_bias) {
2389 mc->mc_rotor = mg->mg_next;
2390 mc->mc_aliquot = 0;
2391 }
2392

--- 257 unchanged lines hidden ---