metaslab.c (307267) vs. metaslab.c (307277)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 24 unchanged lines hidden ---

33#include <sys/vdev_impl.h>
34#include <sys/zio.h>
35#include <sys/spa_impl.h>
36#include <sys/zfeature.h>
37
38SYSCTL_DECL(_vfs_zfs);
39SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
40
41/*
42 * Allow allocations to switch to gang blocks quickly. We do this to
43 * avoid having to load lots of space_maps in a given txg. There are,
44 * however, some cases where we want to avoid "fast" ganging and instead
45 * we want to do an exhaustive search of all metaslabs on this device.
46 * Currently we don't allow any gang, slog, or dump device related allocations
47 * to "fast" gang.
48 */
49#define CAN_FASTGANG(flags) \
50 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
51 METASLAB_GANG_AVOID)))
41#define GANG_ALLOCATION(flags) \
42 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
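A brief illustrative note (not part of this change): the removed CAN_FASTGANG() asked whether an allocation was allowed to give up quickly and fall back to gang blocks, while GANG_ALLOCATION() only asks whether the allocation at hand is itself for a gang header or gang child. A minimal sketch of how the callers further below use it:

	/* Sketch only: both macros reduce to a simple flag test. */
	if (!GANG_ALLOCATION(flags)) {
		/* normal allocation: group-skipping heuristics may apply */
	}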
52
53#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
54#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
55#define METASLAB_ACTIVE_MASK \
56 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
57
58uint64_t metaslab_aliquot = 512ULL << 10;
59uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */

--- 191 unchanged lines hidden ---

251{
252 metaslab_class_t *mc;
253
254 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
255
256 mc->mc_spa = spa;
257 mc->mc_rotor = NULL;
258 mc->mc_ops = ops;
43
44#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
45#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
46#define METASLAB_ACTIVE_MASK \
47 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
48
49uint64_t metaslab_aliquot = 512ULL << 10;
50uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */

--- 191 unchanged lines hidden ---

242{
243 metaslab_class_t *mc;
244
245 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
246
247 mc->mc_spa = spa;
248 mc->mc_rotor = NULL;
249 mc->mc_ops = ops;
250 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
251 refcount_create_tracked(&mc->mc_alloc_slots);
259
260 return (mc);
261}
262
263void
264metaslab_class_destroy(metaslab_class_t *mc)
265{
266 ASSERT(mc->mc_rotor == NULL);
267 ASSERT(mc->mc_alloc == 0);
268 ASSERT(mc->mc_deferred == 0);
269 ASSERT(mc->mc_space == 0);
270 ASSERT(mc->mc_dspace == 0);
271
252
253 return (mc);
254}
255
256void
257metaslab_class_destroy(metaslab_class_t *mc)
258{
259 ASSERT(mc->mc_rotor == NULL);
260 ASSERT(mc->mc_alloc == 0);
261 ASSERT(mc->mc_deferred == 0);
262 ASSERT(mc->mc_space == 0);
263 ASSERT(mc->mc_dspace == 0);
264
265 refcount_destroy(&mc->mc_alloc_slots);
266 mutex_destroy(&mc->mc_lock);
272 kmem_free(mc, sizeof (metaslab_class_t));
273}
274
275int
276metaslab_class_validate(metaslab_class_t *mc)
277{
278 metaslab_group_t *mg;
279 vdev_t *vd;

--- 227 unchanged lines hidden ---

507 ASSERT3P(m1, ==, m2);
508
509 return (0);
510}
511
512/*
513 * Update the allocatable flag and the metaslab group's capacity.
514 * The allocatable flag is set to true if the capacity is below
267 kmem_free(mc, sizeof (metaslab_class_t));
268}
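The class now carries its own lock and a tracked reference count for allocation-throttle slots. A minimal sketch of the tagged-holder pattern the refcount_*() routines provide (illustrative standalone use; the choice of curthread as the tag is an assumption of the sketch):

	refcount_t rc;
	void *tag = curthread;			/* any stable pointer serves as a tag */

	refcount_create_tracked(&rc);		/* remembers every holder for debugging */
	(void) refcount_add(&rc, tag);		/* take one reference on behalf of tag */
	ASSERT3U(refcount_count(&rc), ==, 1);
	(void) refcount_remove(&rc, tag);	/* must be released with the same tag */
	refcount_destroy(&rc);			/* asserts the count is back to zero */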
269
270int
271metaslab_class_validate(metaslab_class_t *mc)
272{
273 metaslab_group_t *mg;
274 vdev_t *vd;

--- 227 unchanged lines hidden ---

502 ASSERT3P(m1, ==, m2);
503
504 return (0);
505}
506
507/*
508 * Update the allocatable flag and the metaslab group's capacity.
509 * The allocatable flag is set to true if the capacity is below
515 * the zfs_mg_noalloc_threshold. If a metaslab group transitions
516 * from allocatable to non-allocatable or vice versa then the metaslab
517 * group's class is updated to reflect the transition.
510 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
511 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
512 * transitions from allocatable to non-allocatable or vice versa then the
513 * metaslab group's class is updated to reflect the transition.
518 */
519static void
520metaslab_group_alloc_update(metaslab_group_t *mg)
521{
522 vdev_t *vd = mg->mg_vd;
523 metaslab_class_t *mc = mg->mg_class;
524 vdev_stat_t *vs = &vd->vdev_stat;
525 boolean_t was_allocatable;
514 */
515static void
516metaslab_group_alloc_update(metaslab_group_t *mg)
517{
518 vdev_t *vd = mg->mg_vd;
519 metaslab_class_t *mc = mg->mg_class;
520 vdev_stat_t *vs = &vd->vdev_stat;
521 boolean_t was_allocatable;
522 boolean_t was_initialized;
526
527 ASSERT(vd == vd->vdev_top);
528
529 mutex_enter(&mg->mg_lock);
530 was_allocatable = mg->mg_allocatable;
523
524 ASSERT(vd == vd->vdev_top);
525
526 mutex_enter(&mg->mg_lock);
527 was_allocatable = mg->mg_allocatable;
528 was_initialized = mg->mg_initialized;
531
532 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
533 (vs->vs_space + 1);
534
529
530 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
531 (vs->vs_space + 1);
532
533 mutex_enter(&mc->mc_lock);
534
535 /*
535 /*
536 * If the metaslab group was just added then it won't
537 * have any space until we finish syncing out this txg.
538 * At that point we will consider it initialized and available
539 * for allocations. We also don't consider non-activated
540 * metaslab groups (e.g. vdevs that are in the middle of being removed)
541 * to be initialized, because they can't be used for allocation.
542 */
543 mg->mg_initialized = metaslab_group_initialized(mg);
544 if (!was_initialized && mg->mg_initialized) {
545 mc->mc_groups++;
546 } else if (was_initialized && !mg->mg_initialized) {
547 ASSERT3U(mc->mc_groups, >, 0);
548 mc->mc_groups--;
549 }
550 if (mg->mg_initialized)
551 mg->mg_no_free_space = B_FALSE;
552
553 /*
536 * A metaslab group is considered allocatable if it has plenty
537 * of free space or is not heavily fragmented. We only take
538 * fragmentation into account if the metaslab group has a valid
539 * fragmentation metric (i.e. a value between 0 and 100).
540 */
554 * A metaslab group is considered allocatable if it has plenty
555 * of free space or is not heavily fragmented. We only take
556 * fragmentation into account if the metaslab group has a valid
557 * fragmentation metric (i.e. a value between 0 and 100).
558 */
541 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
559 mg->mg_allocatable = (mg->mg_activation_count > 0 &&
560 mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
542 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
543 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
544
545 /*
546 * The mc_alloc_groups maintains a count of the number of
547 * groups in this metaslab class that are still above the
548 * zfs_mg_noalloc_threshold. This is used by the allocating
549 * threads to determine if they should avoid allocations to

--- 6 unchanged lines hidden ---

556 * groups have reached the zfs_mg_noalloc_threshold making all groups
557 * eligible for allocations. This effectively means that all devices
558 * are balanced again.
559 */
560 if (was_allocatable && !mg->mg_allocatable)
561 mc->mc_alloc_groups--;
562 else if (!was_allocatable && mg->mg_allocatable)
563 mc->mc_alloc_groups++;
561 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
562 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
563
564 /*
565 * The mc_alloc_groups maintains a count of the number of
566 * groups in this metaslab class that are still above the
567 * zfs_mg_noalloc_threshold. This is used by the allocating
568 * threads to determine if they should avoid allocations to

--- 6 unchanged lines hidden ---

575 * groups have reached the zfs_mg_noalloc_threshold making all groups
576 * eligible for allocations. This effectively means that all devices
577 * are balanced again.
578 */
579 if (was_allocatable && !mg->mg_allocatable)
580 mc->mc_alloc_groups--;
581 else if (!was_allocatable && mg->mg_allocatable)
582 mc->mc_alloc_groups++;
583 mutex_exit(&mc->mc_lock);
564
565 mutex_exit(&mg->mg_lock);
566}
567
568metaslab_group_t *
569metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
570{
571 metaslab_group_t *mg;
572
573 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
574 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
575 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
576 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
577 mg->mg_vd = vd;
578 mg->mg_class = mc;
579 mg->mg_activation_count = 0;
584
585 mutex_exit(&mg->mg_lock);
586}
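A small worked example (hypothetical numbers, and assuming zfs_mg_noalloc_threshold were tuned to 10) of the capacity and bookkeeping logic above:

	/*
	 * vs_space = 1000, vs_alloc = 940:
	 *	mg_free_capacity = ((1000 - 940) * 100) / (1000 + 1) = 5
	 * With zfs_mg_noalloc_threshold = 10 the capacity test fails, so
	 * mg_allocatable becomes B_FALSE (even for an activated group) and
	 * mc_alloc_groups is decremented.  mc_groups, by contrast, only counts
	 * groups that are initialized at all (synced space and activated),
	 * regardless of whether they are currently good allocation targets.
	 */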
587
588metaslab_group_t *
589metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
590{
591 metaslab_group_t *mg;
592
593 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
594 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
595 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
596 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
597 mg->mg_vd = vd;
598 mg->mg_class = mc;
599 mg->mg_activation_count = 0;
600 mg->mg_initialized = B_FALSE;
601 mg->mg_no_free_space = B_TRUE;
602 refcount_create_tracked(&mg->mg_alloc_queue_depth);
580
581 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
582 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
583
584 return (mg);
585}
586
587void

--- 6 unchanged lines hidden ---

594 * either because we never activated in the first place or
595 * because we're done, and possibly removing the vdev.
596 */
597 ASSERT(mg->mg_activation_count <= 0);
598
599 taskq_destroy(mg->mg_taskq);
600 avl_destroy(&mg->mg_metaslab_tree);
601 mutex_destroy(&mg->mg_lock);
603
604 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
605 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
606
607 return (mg);
608}
609
610void

--- 6 unchanged lines hidden ---

617 * either because we never activated in the first place or
618 * because we're done, and possibly removing the vdev.
619 */
620 ASSERT(mg->mg_activation_count <= 0);
621
622 taskq_destroy(mg->mg_taskq);
623 avl_destroy(&mg->mg_metaslab_tree);
624 mutex_destroy(&mg->mg_lock);
625 refcount_destroy(&mg->mg_alloc_queue_depth);
602 kmem_free(mg, sizeof (metaslab_group_t));
603}
604
605void
606metaslab_group_activate(metaslab_group_t *mg)
607{
608 metaslab_class_t *mc = mg->mg_class;
609 metaslab_group_t *mgprev, *mgnext;

--- 55 unchanged lines hidden ---

665 mgnext->mg_prev = mgprev;
666 }
667
668 mg->mg_prev = NULL;
669 mg->mg_next = NULL;
670 metaslab_class_minblocksize_update(mc);
671}
672
626 kmem_free(mg, sizeof (metaslab_group_t));
627}
628
629void
630metaslab_group_activate(metaslab_group_t *mg)
631{
632 metaslab_class_t *mc = mg->mg_class;
633 metaslab_group_t *mgprev, *mgnext;

--- 55 unchanged lines hidden ---

689 mgnext->mg_prev = mgprev;
690 }
691
692 mg->mg_prev = NULL;
693 mg->mg_next = NULL;
694 metaslab_class_minblocksize_update(mc);
695}
696
697boolean_t
698metaslab_group_initialized(metaslab_group_t *mg)
699{
700 vdev_t *vd = mg->mg_vd;
701 vdev_stat_t *vs = &vd->vdev_stat;
702
703 return (vs->vs_space != 0 && mg->mg_activation_count > 0);
704}
705
673uint64_t
674metaslab_group_get_space(metaslab_group_t *mg)
675{
676 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
677}
678
679void
680metaslab_group_histogram_verify(metaslab_group_t *mg)

--- 153 unchanged lines hidden ---

834 return (fragmentation);
835}
836
837/*
838 * Determine if a given metaslab group should skip allocations. A metaslab
839 * group should avoid allocations if its free capacity is less than the
840 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
841 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
706uint64_t
707metaslab_group_get_space(metaslab_group_t *mg)
708{
709 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
710}
711
712void
713metaslab_group_histogram_verify(metaslab_group_t *mg)

--- 153 unchanged lines hidden ---

867 return (fragmentation);
868}
869
870/*
871 * Determine if a given metaslab group should skip allocations. A metaslab
872 * group should avoid allocations if its free capacity is less than the
873 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
874 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
842 * that can still handle allocations.
875 * that can still handle allocations. If the allocation throttle is enabled
876 * then we skip allocations to devices that have reached their maximum
877 * allocation queue depth unless the selected metaslab group is the only
878 * eligible group remaining.
843 */
844static boolean_t
879 */
880static boolean_t
845metaslab_group_allocatable(metaslab_group_t *mg)
881metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
882 uint64_t psize)
846{
883{
847 vdev_t *vd = mg->mg_vd;
848 spa_t *spa = vd->vdev_spa;
884 spa_t *spa = mg->mg_vd->vdev_spa;
849 metaslab_class_t *mc = mg->mg_class;
850
851 /*
885 metaslab_class_t *mc = mg->mg_class;
886
887 /*
852 * We use two key metrics to determine if a metaslab group is
853 * considered allocatable -- free space and fragmentation. If
854 * the free space is greater than the free space threshold and
855 * the fragmentation is less than the fragmentation threshold then
856 * consider the group allocatable. There are two cases when we will
857 * not consider these key metrics. The first is if the group is
858 * associated with a slog device and the second is if all groups
859 * in this metaslab class have already been considered ineligible
888 * We can only consider skipping this metaslab group if it's
889 * in the normal metaslab class and there are other metaslab
890 * groups to select from. Otherwise, we always consider it eligible
860 * for allocations.
861 */
891 * for allocations.
892 */
862 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
863 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
864 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) ||
865 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
893 if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
894 return (B_TRUE);
895
896 /*
897 * If the metaslab group's mg_allocatable flag is set (see comments
898 * in metaslab_group_alloc_update() for more information) and
899 * the allocation throttle is disabled then allow allocations to this
900 * device. However, if the allocation throttle is enabled then
901 * check if we have reached our allocation limit (mg_alloc_queue_depth)
902 * to determine if we should allow allocations to this metaslab group.
903 * If all metaslab groups are no longer considered allocatable
904 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
905 * gang block size then we allow allocations on this metaslab group
906 * regardless of the mg_allocatable or throttle settings.
907 */
908 if (mg->mg_allocatable) {
909 metaslab_group_t *mgp;
910 int64_t qdepth;
911 uint64_t qmax = mg->mg_max_alloc_queue_depth;
912
913 if (!mc->mc_alloc_throttle_enabled)
914 return (B_TRUE);
915
916 /*
917 * If this metaslab group does not have any free space, then
918 * there is no point in looking further.
919 */
920 if (mg->mg_no_free_space)
921 return (B_FALSE);
922
923 qdepth = refcount_count(&mg->mg_alloc_queue_depth);
924
925 /*
926 * If this metaslab group is below its qmax or it's
927 * the only allocatable metaslab group, then attempt
928 * to allocate from it.
929 */
930 if (qdepth < qmax || mc->mc_alloc_groups == 1)
931 return (B_TRUE);
932 ASSERT3U(mc->mc_alloc_groups, >, 1);
933
934 /*
935 * Since this metaslab group is at or over its qmax, we
936 * need to determine if there are metaslab groups after this
937 * one that might be able to handle this allocation. This is
938 * racy since we can't hold the locks for all metaslab
939 * groups at the same time when we make this check.
940 */
941 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
942 qmax = mgp->mg_max_alloc_queue_depth;
943
944 qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
945
946 /*
947 * If there is another metaslab group that
948 * might be able to handle the allocation, then
949 * we return false so that we skip this group.
950 */
951 if (qdepth < qmax && !mgp->mg_no_free_space)
952 return (B_FALSE);
953 }
954
955 /*
956 * We didn't find another group to handle the allocation
957 * so we can't skip this metaslab group even though
958 * we are at or over our qmax.
959 */
960 return (B_TRUE);
961
962 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
963 return (B_TRUE);
964 }
965 return (B_FALSE);
866}
867
868/*
869 * ==========================================================================
870 * Range tree callbacks
871 * ==========================================================================
872 */
873

--- 1251 unchanged lines hidden ---

2125
2126 if (offset < start)
2127 return ((start - offset) << ms_shift);
2128 if (offset > start)
2129 return ((offset - start) << ms_shift);
2130 return (0);
2131}
2132
966}
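A worked example (hypothetical queue depths, three initialized groups in the normal class, rotor at group A) of the skip decision implemented above:

	/*
	 *	A (current): qdepth = 12, qmax = 10, mg_no_free_space = B_FALSE
	 *	B:           qdepth =  3, qmax = 10, mg_no_free_space = B_FALSE
	 *	C:           qdepth = 10, qmax = 10, mg_no_free_space = B_FALSE
	 *
	 * A is at or over its qmax, but the walk along mg_next finds B with
	 * headroom, so metaslab_group_allocatable(A, rotor, psize) returns
	 * B_FALSE and the caller rotates on to B.  If B and C were also at
	 * their qmax (or had no free space), A would return B_TRUE instead,
	 * since skipping it could only produce a spurious ENOSPC.
	 */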
967
968/*
969 * ==========================================================================
970 * Range tree callbacks
971 * ==========================================================================
972 */
973

--- 1251 unchanged lines hidden ---

2225
2226 if (offset < start)
2227 return ((start - offset) << ms_shift);
2228 if (offset > start)
2229 return ((offset - start) << ms_shift);
2230 return (0);
2231}
2232
2233/*
2234 * ==========================================================================
2235 * Metaslab block operations
2236 * ==========================================================================
2237 */
2238
2239static void
2240metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2241{
2242 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2243 flags & METASLAB_DONT_THROTTLE)
2244 return;
2245
2246 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2247 if (!mg->mg_class->mc_alloc_throttle_enabled)
2248 return;
2249
2250 (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2251}
2252
2253void
2254metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2255{
2256 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2257 flags & METASLAB_DONT_THROTTLE)
2258 return;
2259
2260 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2261 if (!mg->mg_class->mc_alloc_throttle_enabled)
2262 return;
2263
2264 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2265}
2266
2267void
2268metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2269{
2270#ifdef ZFS_DEBUG
2271 const dva_t *dva = bp->blk_dva;
2272 int ndvas = BP_GET_NDVAS(bp);
2273
2274 for (int d = 0; d < ndvas; d++) {
2275 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2276 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2277 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2278 }
2279#endif
2280}
2281
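A minimal sketch (hypothetical completion path, not part of this diff) of how the queue-depth accounting pairs up: the static increment is issued from metaslab_alloc() further below once a DVA lands on a vdev, and whatever completes or abandons that I/O is expected to drop the reference using the same zio tag:

	/* For each DVA the zio allocated, release its queue-depth reference. */
	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
		metaslab_group_alloc_decrement(spa,
		    DVA_GET_VDEV(&bp->blk_dva[d]), zio, METASLAB_ASYNC_ALLOC);
	}

	/* With ZFS_DEBUG, confirm nothing is still charged against this zio. */
	metaslab_group_alloc_verify(spa, bp, zio);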
2133static uint64_t
2282static uint64_t
2134metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
2283metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
2135 uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2136{
2137 spa_t *spa = mg->mg_vd->vdev_spa;
2138 metaslab_t *msp = NULL;
2139 uint64_t offset = -1ULL;
2140 avl_tree_t *t = &mg->mg_metaslab_tree;
2141 uint64_t activation_weight;
2142 uint64_t target_distance;

--- 10 unchanged lines hidden ---

2153 for (;;) {
2154 boolean_t was_active;
2155
2156 mutex_enter(&mg->mg_lock);
2157 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
2158 if (msp->ms_weight < asize) {
2159 spa_dbgmsg(spa, "%s: failed to meet weight "
2160 "requirement: vdev %llu, txg %llu, mg %p, "
2284 uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2285{
2286 spa_t *spa = mg->mg_vd->vdev_spa;
2287 metaslab_t *msp = NULL;
2288 uint64_t offset = -1ULL;
2289 avl_tree_t *t = &mg->mg_metaslab_tree;
2290 uint64_t activation_weight;
2291 uint64_t target_distance;

--- 10 unchanged lines hidden ---

2302 for (;;) {
2303 boolean_t was_active;
2304
2305 mutex_enter(&mg->mg_lock);
2306 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
2307 if (msp->ms_weight < asize) {
2308 spa_dbgmsg(spa, "%s: failed to meet weight "
2309 "requirement: vdev %llu, txg %llu, mg %p, "
2161 "msp %p, psize %llu, asize %llu, "
2310 "msp %p, asize %llu, "
2162 "weight %llu", spa_name(spa),
2163 mg->mg_vd->vdev_id, txg,
2311 "weight %llu", spa_name(spa),
2312 mg->mg_vd->vdev_id, txg,
2164 mg, msp, psize, asize, msp->ms_weight);
2313 mg, msp, asize, msp->ms_weight);
2165 mutex_exit(&mg->mg_lock);
2166 return (-1ULL);
2167 }
2168
2169 /*
2170 * If the selected metaslab is condensing, skip it.
2171 */
2172 if (msp->ms_condensing)

--- 65 unchanged lines hidden ---

2238
2239 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2240 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2241
2242 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
2243 msp->ms_access_txg = txg + metaslab_unload_delay;
2244
2245 mutex_exit(&msp->ms_lock);
2314 mutex_exit(&mg->mg_lock);
2315 return (-1ULL);
2316 }
2317
2318 /*
2319 * If the selected metaslab is condensing, skip it.
2320 */
2321 if (msp->ms_condensing)

--- 65 unchanged lines hidden ---

2387
2388 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2389 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2390
2391 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
2392 msp->ms_access_txg = txg + metaslab_unload_delay;
2393
2394 mutex_exit(&msp->ms_lock);
2246
2247 return (offset);
2248}
2249
2250/*
2251 * Allocate a block for the specified i/o.
2252 */
2253static int
2254metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2255 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
2256{
2257 metaslab_group_t *mg, *rotor;
2258 vdev_t *vd;
2259 int dshift = 3;
2260 int all_zero;
2261 int zio_lock = B_FALSE;
2262 boolean_t allocatable;
2395 return (offset);
2396}
2397
2398/*
2399 * Allocate a block for the specified i/o.
2400 */
2401static int
2402metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2403 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
2404{
2405 metaslab_group_t *mg, *rotor;
2406 vdev_t *vd;
2407 int dshift = 3;
2408 int all_zero;
2409 int zio_lock = B_FALSE;
2410 boolean_t allocatable;
2263 uint64_t offset = -1ULL;
2264 uint64_t asize;
2265 uint64_t distance;
2266
2267 ASSERT(!DVA_IS_VALID(&dva[d]));
2268
2269 /*
2270 * For testing, make some blocks above a certain size be gang blocks.
2271 */

--- 53 unchanged lines hidden ---

2325 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2326 mg = mc->mc_rotor;
2327
2328 rotor = mg;
2329top:
2330 all_zero = B_TRUE;
2331 do {
2332 ASSERT(mg->mg_activation_count == 1);
2411 uint64_t asize;
2412 uint64_t distance;
2413
2414 ASSERT(!DVA_IS_VALID(&dva[d]));
2415
2416 /*
2417 * For testing, make some blocks above a certain size be gang blocks.
2418 */

--- 53 unchanged lines hidden ---

2472 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2473 mg = mc->mc_rotor;
2474
2475 rotor = mg;
2476top:
2477 all_zero = B_TRUE;
2478 do {
2479 ASSERT(mg->mg_activation_count == 1);
2333
2334 vd = mg->mg_vd;
2335
2336 /*
2337 * Don't allocate from faulted devices.
2338 */
2339 if (zio_lock) {
2340 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2341 allocatable = vdev_allocatable(vd);
2342 spa_config_exit(spa, SCL_ZIO, FTAG);
2343 } else {
2344 allocatable = vdev_allocatable(vd);
2345 }
2346
2347 /*
2348 * Determine if the selected metaslab group is eligible
2480 vd = mg->mg_vd;
2481
2482 /*
2483 * Don't allocate from faulted devices.
2484 */
2485 if (zio_lock) {
2486 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2487 allocatable = vdev_allocatable(vd);
2488 spa_config_exit(spa, SCL_ZIO, FTAG);
2489 } else {
2490 allocatable = vdev_allocatable(vd);
2491 }
2492
2493 /*
2494 * Determine if the selected metaslab group is eligible
2349 * for allocations. If we're ganging or have requested
2350 * an allocation for the smallest gang block size
2351 * then we don't want to avoid allocating to this
2352 * metaslab group. If we're in this condition we should
2353 * try to allocate from any device possible so that we
2354 * don't inadvertently return ENOSPC and suspend the pool
2495 * for allocations. If we're ganging then don't allow
2496 * this metaslab group to skip allocations since that would
2497 * inadvertently return ENOSPC and suspend the pool
2355 * even though space is still available.
2356 */
2498 * even though space is still available.
2499 */
2357 if (allocatable && CAN_FASTGANG(flags) &&
2358 psize > SPA_GANGBLOCKSIZE)
2359 allocatable = metaslab_group_allocatable(mg);
2500 if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) {
2501 allocatable = metaslab_group_allocatable(mg, rotor,
2502 psize);
2503 }
2360
2361 if (!allocatable)
2362 goto next;
2363
2504
2505 if (!allocatable)
2506 goto next;
2507
2508 ASSERT(mg->mg_initialized);
2509
2364 /*
2510 /*
2365 * Avoid writing single-copy data to a failing vdev
2366 * unless the user instructs us that it is okay.
2511 * Avoid writing single-copy data to a failing vdev.
2367 */
2368 if ((vd->vdev_stat.vs_write_errors > 0 ||
2369 vd->vdev_state < VDEV_STATE_HEALTHY) &&
2370 d == 0 && dshift == 3 && vd->vdev_children == 0) {
2371 all_zero = B_FALSE;
2372 goto next;
2373 }
2374
2375 ASSERT(mg->mg_class == mc);
2376
2377 distance = vd->vdev_asize >> dshift;
2378 if (distance <= (1ULL << vd->vdev_ms_shift))
2379 distance = 0;
2380 else
2381 all_zero = B_FALSE;
2382
2383 asize = vdev_psize_to_asize(vd, psize);
2384 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
2385
2512 */
2513 if ((vd->vdev_stat.vs_write_errors > 0 ||
2514 vd->vdev_state < VDEV_STATE_HEALTHY) &&
2515 d == 0 && dshift == 3 && vd->vdev_children == 0) {
2516 all_zero = B_FALSE;
2517 goto next;
2518 }
2519
2520 ASSERT(mg->mg_class == mc);
2521
2522 distance = vd->vdev_asize >> dshift;
2523 if (distance <= (1ULL << vd->vdev_ms_shift))
2524 distance = 0;
2525 else
2526 all_zero = B_FALSE;
2527
2528 asize = vdev_psize_to_asize(vd, psize);
2529 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
2530
2386 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
2387 dva, d);
2531 uint64_t offset = metaslab_group_alloc(mg, asize, txg,
2532 distance, dva, d);
2533
2534 mutex_enter(&mg->mg_lock);
2535 if (offset == -1ULL) {
2536 mg->mg_failed_allocations++;
2537 if (asize == SPA_GANGBLOCKSIZE) {
2538 /*
2539 * This metaslab group was unable to allocate
2540 * the minimum gang block size so it must be
2541 * out of space. We must notify the allocation
2542 * throttle to start skipping allocation
2543 * attempts to this metaslab group until more
2544 * space becomes available.
2545 *
2546 * Note: this failure cannot be caused by the
2547 * allocation throttle since the allocation
2548 * throttle is only responsible for skipping
2549 * devices and not failing block allocations.
2550 */
2551 mg->mg_no_free_space = B_TRUE;
2552 }
2553 }
2554 mg->mg_allocations++;
2555 mutex_exit(&mg->mg_lock);
2556
2388 if (offset != -1ULL) {
2389 /*
2390 * If we've just selected this metaslab group,
2391 * figure out whether the corresponding vdev is
2392 * over- or under-used relative to the pool,
2393 * and set an allocation bias to even it out.
2394 */
2395 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {

--- 164 unchanged lines hidden ---

2560 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2561 }
2562
2563 mutex_exit(&msp->ms_lock);
2564
2565 return (0);
2566}
2567
2557 if (offset != -1ULL) {
2558 /*
2559 * If we've just selected this metaslab group,
2560 * figure out whether the corresponding vdev is
2561 * over- or under-used relative to the pool,
2562 * and set an allocation bias to even it out.
2563 */
2564 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {

--- 164 unchanged lines hidden ---

2729 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2730 }
2731
2732 mutex_exit(&msp->ms_lock);
2733
2734 return (0);
2735}
2736
2737/*
2738 * Reserve some allocation slots. The reservation system must be called
2739 * before we call into the allocator. If there aren't any available slots
2740 * then the I/O will be throttled until an I/O completes and its slots are
2741 * freed up. The function returns true if it was successful in placing
2742 * the reservation.
2743 */
2744boolean_t
2745metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
2746 int flags)
2747{
2748 uint64_t available_slots = 0;
2749 boolean_t slot_reserved = B_FALSE;
2750
2751 ASSERT(mc->mc_alloc_throttle_enabled);
2752 mutex_enter(&mc->mc_lock);
2753
2754 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
2755 if (reserved_slots < mc->mc_alloc_max_slots)
2756 available_slots = mc->mc_alloc_max_slots - reserved_slots;
2757
2758 if (slots <= available_slots || GANG_ALLOCATION(flags)) {
2759 /*
2760 * We reserve the slots individually so that we can unreserve
2761 * them individually when an I/O completes.
2762 */
2763 for (int d = 0; d < slots; d++) {
2764 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
2765 }
2766 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
2767 slot_reserved = B_TRUE;
2768 }
2769
2770 mutex_exit(&mc->mc_lock);
2771 return (slot_reserved);
2772}
2773
2774void
2775metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
2776{
2777 ASSERT(mc->mc_alloc_throttle_enabled);
2778 mutex_enter(&mc->mc_lock);
2779 for (int d = 0; d < slots; d++) {
2780 (void) refcount_remove(&mc->mc_alloc_slots, zio);
2781 }
2782 mutex_exit(&mc->mc_lock);
2783}
2784
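A minimal sketch (simplified, hypothetical caller modeled loosely on an allocating write) of the intended reserve/allocate/unreserve sequence; using zp_copies as the slot count and the zio as the reservation tag are assumptions of the sketch:

	int slots = zio->io_prop.zp_copies;	/* one slot per DVA to allocate */

	if (metaslab_class_throttle_reserve(mc, slots, zio, 0)) {
		error = metaslab_alloc(spa, mc, zio->io_size, bp, slots,
		    txg, NULL, METASLAB_ASYNC_ALLOC, zio);
		/* ... issue the write; once it completes (or on failure): */
		metaslab_class_throttle_unreserve(mc, slots, zio);
	} else {
		/*
		 * No slots available: the zio must wait until another
		 * allocating I/O completes and releases its slots.
		 */
	}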
2568int
2569metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2785int
2786metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2570 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2787 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio)
2571{
2572 dva_t *dva = bp->blk_dva;
2573 dva_t *hintdva = hintbp->blk_dva;
2574 int error = 0;
2575
2576 ASSERT(bp->blk_birth == 0);
2577 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2578

--- 9 unchanged lines hidden ---

2588 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2589
2590 for (int d = 0; d < ndvas; d++) {
2591 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2592 txg, flags);
2593 if (error != 0) {
2594 for (d--; d >= 0; d--) {
2595 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2788{
2789 dva_t *dva = bp->blk_dva;
2790 dva_t *hintdva = hintbp->blk_dva;
2791 int error = 0;
2792
2793 ASSERT(bp->blk_birth == 0);
2794 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2795

--- 9 unchanged lines hidden ---

2805 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2806
2807 for (int d = 0; d < ndvas; d++) {
2808 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2809 txg, flags);
2810 if (error != 0) {
2811 for (d--; d >= 0; d--) {
2812 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2813 metaslab_group_alloc_decrement(spa,
2814 DVA_GET_VDEV(&dva[d]), zio, flags);
2596 bzero(&dva[d], sizeof (dva_t));
2597 }
2598 spa_config_exit(spa, SCL_ALLOC, FTAG);
2599 return (error);
2815 bzero(&dva[d], sizeof (dva_t));
2816 }
2817 spa_config_exit(spa, SCL_ALLOC, FTAG);
2818 return (error);
2819 } else {
2820 /*
2821 * Update the metaslab group's queue depth
2822 * based on the newly allocated dva.
2823 */
2824 metaslab_group_alloc_increment(spa,
2825 DVA_GET_VDEV(&dva[d]), zio, flags);
2600 }
2826 }
2827
2601 }
2602 ASSERT(error == 0);
2603 ASSERT(BP_GET_NDVAS(bp) == ndvas);
2604
2605 spa_config_exit(spa, SCL_ALLOC, FTAG);
2606
2607 BP_SET_BIRTH(bp, txg, txg);
2608

--- 75 unchanged lines hidden ---
2828 }
2829 ASSERT(error == 0);
2830 ASSERT(BP_GET_NDVAS(bp) == ndvas);
2831
2832 spa_config_exit(spa, SCL_ALLOC, FTAG);
2833
2834 BP_SET_BIRTH(bp, txg, txg);
2835

--- 75 unchanged lines hidden ---