metaslab.c (339104) vs. metaslab.c (339105)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 6 unchanged lines hidden (view full) ---

15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 6 unchanged lines hidden (view full) ---

15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/dmu.h>
30#include <sys/dmu_tx.h>
31#include <sys/space_map.h>

--- 238 unchanged lines hidden (view full) ---

270 * limit is ever reached allowing for further investigation.
271 */
272uint64_t metaslab_trace_max_entries = 5000;
273
274static uint64_t metaslab_weight(metaslab_t *);
275static void metaslab_set_fragmentation(metaslab_t *);
276static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
277static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/dmu.h>
30#include <sys/dmu_tx.h>
31#include <sys/space_map.h>

--- 238 unchanged lines hidden (view full) ---

270 * limit is ever reached allowing for further investigation.
271 */
272uint64_t metaslab_trace_max_entries = 5000;
273
274static uint64_t metaslab_weight(metaslab_t *);
275static void metaslab_set_fragmentation(metaslab_t *);
276static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
277static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
278static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
279static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
278
279kmem_cache_t *metaslab_alloc_trace_cache;
280
281/*
282 * ==========================================================================
283 * Metaslab classes
284 * ==========================================================================
285 */
286metaslab_class_t *
287metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
288{
289 metaslab_class_t *mc;
290
291 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
292
293 mc->mc_spa = spa;
294 mc->mc_rotor = NULL;
295 mc->mc_ops = ops;
296 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
280
281kmem_cache_t *metaslab_alloc_trace_cache;
282
283/*
284 * ==========================================================================
285 * Metaslab classes
286 * ==========================================================================
287 */
288metaslab_class_t *
289metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
290{
291 metaslab_class_t *mc;
292
293 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
294
295 mc->mc_spa = spa;
296 mc->mc_rotor = NULL;
297 mc->mc_ops = ops;
298 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
297 refcount_create_tracked(&mc->mc_alloc_slots);
299 mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
300 sizeof (refcount_t), KM_SLEEP);
301 mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
302 sizeof (uint64_t), KM_SLEEP);
303 for (int i = 0; i < spa->spa_alloc_count; i++)
304 refcount_create_tracked(&mc->mc_alloc_slots[i]);
298
299 return (mc);
300}
301
302void
303metaslab_class_destroy(metaslab_class_t *mc)
304{
305 ASSERT(mc->mc_rotor == NULL);
306 ASSERT(mc->mc_alloc == 0);
307 ASSERT(mc->mc_deferred == 0);
308 ASSERT(mc->mc_space == 0);
309 ASSERT(mc->mc_dspace == 0);
310
305
306 return (mc);
307}
308
309void
310metaslab_class_destroy(metaslab_class_t *mc)
311{
312 ASSERT(mc->mc_rotor == NULL);
313 ASSERT(mc->mc_alloc == 0);
314 ASSERT(mc->mc_deferred == 0);
315 ASSERT(mc->mc_space == 0);
316 ASSERT(mc->mc_dspace == 0);
317
311 refcount_destroy(&mc->mc_alloc_slots);
318 for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
319 refcount_destroy(&mc->mc_alloc_slots[i]);
320 kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
321 sizeof (refcount_t));
322 kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
323 sizeof (uint64_t));
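The hunks above replace the single class-wide mc_alloc_slots refcount with one reservation counter per allocator (spa_alloc_count of them), so concurrently allocating threads no longer serialize on one cache line when the write throttle is enabled. A minimal standalone sketch of that reservation pattern, using hypothetical names and plain C11 atomics in place of the kernel's refcount_t, might look like this:

#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct alloc_throttle {
	int			at_count;	/* number of allocators */
	atomic_uint_fast64_t	*at_used;	/* reserved slots, one counter per allocator */
	uint64_t		*at_max;	/* slot ceiling, one per allocator */
} alloc_throttle_t;

static alloc_throttle_t *
alloc_throttle_create(int count, uint64_t max_per_allocator)
{
	alloc_throttle_t *at = calloc(1, sizeof (*at));

	at->at_count = count;
	at->at_used = calloc(count, sizeof (*at->at_used));
	at->at_max = calloc(count, sizeof (*at->at_max));
	for (int i = 0; i < count; i++)
		at->at_max[i] = max_per_allocator;
	return (at);
}

/*
 * Try to reserve 'slots' I/O slots against one allocator; on failure the
 * caller throttles the I/O instead of blocking here.
 */
static int
alloc_throttle_reserve(alloc_throttle_t *at, int allocator, uint64_t slots)
{
	uint64_t used = atomic_fetch_add(&at->at_used[allocator], slots);

	if (used + slots > at->at_max[allocator]) {
		atomic_fetch_sub(&at->at_used[allocator], slots);
		return (0);
	}
	return (1);
}

An I/O would typically be mapped onto one of these counters by hashing its bookmark modulo the allocator count, so a given stream keeps charging the same counter; the exact mapping lives in the zio layer and is not shown in this diff.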
312 mutex_destroy(&mc->mc_lock);
313 kmem_free(mc, sizeof (metaslab_class_t));
314}
315
316int
317metaslab_class_validate(metaslab_class_t *mc)
318{
319 metaslab_group_t *mg;

--- 207 unchanged lines hidden (view full) ---

527}
528
529static int
530metaslab_compare(const void *x1, const void *x2)
531{
532 const metaslab_t *m1 = x1;
533 const metaslab_t *m2 = x2;
534
324 mutex_destroy(&mc->mc_lock);
325 kmem_free(mc, sizeof (metaslab_class_t));
326}
327
328int
329metaslab_class_validate(metaslab_class_t *mc)
330{
331 metaslab_group_t *mg;

--- 207 unchanged lines hidden (view full) ---

539}
540
541static int
542metaslab_compare(const void *x1, const void *x2)
543{
544 const metaslab_t *m1 = x1;
545 const metaslab_t *m2 = x2;
546
547 int sort1 = 0;
548 int sort2 = 0;
549 if (m1->ms_allocator != -1 && m1->ms_primary)
550 sort1 = 1;
551 else if (m1->ms_allocator != -1 && !m1->ms_primary)
552 sort1 = 2;
553 if (m2->ms_allocator != -1 && m2->ms_primary)
554 sort2 = 1;
555 else if (m2->ms_allocator != -1 && !m2->ms_primary)
556 sort2 = 2;
557
558 /*
559 * Sort inactive metaslabs first, then primaries, then secondaries. When
560 * selecting a metaslab to allocate from, an allocator first tries its
561 * primary, then secondary active metaslab. If it doesn't have active
562 * metaslabs, or can't allocate from them, it searches for an inactive
563 * metaslab to activate. If it can't find a suitable one, it will steal
564 * a primary or secondary metaslab from another allocator.
565 */
566 if (sort1 < sort2)
567 return (-1);
568 if (sort1 > sort2)
569 return (1);
570
535 if (m1->ms_weight < m2->ms_weight)
536 return (1);
537 if (m1->ms_weight > m2->ms_weight)
538 return (-1);
539
540 /*
541 * If the weights are identical, use the offset to force uniqueness.
542 */

--- 135 unchanged lines hidden (view full) ---

678 else if (!was_allocatable && mg->mg_allocatable)
679 mc->mc_alloc_groups++;
680 mutex_exit(&mc->mc_lock);
681
682 mutex_exit(&mg->mg_lock);
683}
684
685metaslab_group_t *
571 if (m1->ms_weight < m2->ms_weight)
572 return (1);
573 if (m1->ms_weight > m2->ms_weight)
574 return (-1);
575
576 /*
577 * If the weights are identical, use the offset to force uniqueness.
578 */

--- 135 unchanged lines hidden (view full) ---

714 else if (!was_allocatable && mg->mg_allocatable)
715 mc->mc_alloc_groups++;
716 mutex_exit(&mc->mc_lock);
717
718 mutex_exit(&mg->mg_lock);
719}
720
721metaslab_group_t *
686metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
722metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
687{
688 metaslab_group_t *mg;
689
690 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
691 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
723{
724 metaslab_group_t *mg;
725
726 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
727 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
728 mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
729 KM_SLEEP);
730 mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
731 KM_SLEEP);
692 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
693 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
694 mg->mg_vd = vd;
695 mg->mg_class = mc;
696 mg->mg_activation_count = 0;
697 mg->mg_initialized = B_FALSE;
698 mg->mg_no_free_space = B_TRUE;
732 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
733 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
734 mg->mg_vd = vd;
735 mg->mg_class = mc;
736 mg->mg_activation_count = 0;
737 mg->mg_initialized = B_FALSE;
738 mg->mg_no_free_space = B_TRUE;
699 refcount_create_tracked(&mg->mg_alloc_queue_depth);
739 mg->mg_allocators = allocators;
700
740
741 mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t),
742 KM_SLEEP);
743 mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
744 sizeof (uint64_t), KM_SLEEP);
745 for (int i = 0; i < allocators; i++) {
746 refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
747 mg->mg_cur_max_alloc_queue_depth[i] = 0;
748 }
749
701 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
702 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
703
704 return (mg);
705}
706
707void
708metaslab_group_destroy(metaslab_group_t *mg)

--- 4 unchanged lines hidden (view full) ---

713 * We may have gone below zero with the activation count
714 * either because we never activated in the first place or
715 * because we're done, and possibly removing the vdev.
716 */
717 ASSERT(mg->mg_activation_count <= 0);
718
719 taskq_destroy(mg->mg_taskq);
720 avl_destroy(&mg->mg_metaslab_tree);
750 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
751 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
752
753 return (mg);
754}
755
756void
757metaslab_group_destroy(metaslab_group_t *mg)

--- 4 unchanged lines hidden (view full) ---

762 * We may have gone below zero with the activation count
763 * either because we never activated in the first place or
764 * because we're done, and possibly removing the vdev.
765 */
766 ASSERT(mg->mg_activation_count <= 0);
767
768 taskq_destroy(mg->mg_taskq);
769 avl_destroy(&mg->mg_metaslab_tree);
770 kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
771 kmem_free(mg->mg_secondaries, mg->mg_allocators *
772 sizeof (metaslab_t *));
721 mutex_destroy(&mg->mg_lock);
773 mutex_destroy(&mg->mg_lock);
722 refcount_destroy(&mg->mg_alloc_queue_depth);
774
775 for (int i = 0; i < mg->mg_allocators; i++) {
776 refcount_destroy(&mg->mg_alloc_queue_depth[i]);
777 mg->mg_cur_max_alloc_queue_depth[i] = 0;
778 }
779 kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
780 sizeof (refcount_t));
781 kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
782 sizeof (uint64_t));
783
723 kmem_free(mg, sizeof (metaslab_group_t));
724}
725
726void
727metaslab_group_activate(metaslab_group_t *mg)
728{
729 metaslab_class_t *mc = mg->mg_class;
730 metaslab_group_t *mgprev, *mgnext;

--- 63 unchanged lines hidden (view full) ---

794 * lower locks to allow the I/O to complete. At a minimum,
795 * we continue to hold the SCL_ALLOC lock, which prevents any future
796 * allocations from taking place and any changes to the vdev tree.
797 */
798 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
799 taskq_wait(mg->mg_taskq);
800 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
801 metaslab_group_alloc_update(mg);
784 kmem_free(mg, sizeof (metaslab_group_t));
785}
786
787void
788metaslab_group_activate(metaslab_group_t *mg)
789{
790 metaslab_class_t *mc = mg->mg_class;
791 metaslab_group_t *mgprev, *mgnext;

--- 63 unchanged lines hidden (view full) ---

855 * lower locks to allow the I/O to complete. At a minimum,
856 * we continue to hold the SCL_ALLOC lock, which prevents any future
857 * allocations from taking place and any changes to the vdev tree.
858 */
859 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
860 taskq_wait(mg->mg_taskq);
861 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
862 metaslab_group_alloc_update(mg);
863 for (int i = 0; i < mg->mg_allocators; i++) {
864 metaslab_t *msp = mg->mg_primaries[i];
865 if (msp != NULL) {
866 mutex_enter(&msp->ms_lock);
867 metaslab_passivate(msp,
868 metaslab_weight_from_range_tree(msp));
869 mutex_exit(&msp->ms_lock);
870 }
871 msp = mg->mg_secondaries[i];
872 if (msp != NULL) {
873 mutex_enter(&msp->ms_lock);
874 metaslab_passivate(msp,
875 metaslab_weight_from_range_tree(msp));
876 mutex_exit(&msp->ms_lock);
877 }
878 }
802
803 mgprev = mg->mg_prev;
804 mgnext = mg->mg_next;
805
806 if (mg == mgnext) {
807 mc->mc_rotor = NULL;
808 } else {
809 mc->mc_rotor = mgnext;

--- 125 unchanged lines hidden (view full) ---

935 mutex_enter(&mg->mg_lock);
936 ASSERT(msp->ms_group == mg);
937 avl_remove(&mg->mg_metaslab_tree, msp);
938 msp->ms_group = NULL;
939 mutex_exit(&mg->mg_lock);
940}
941
942static void
879
880 mgprev = mg->mg_prev;
881 mgnext = mg->mg_next;
882
883 if (mg == mgnext) {
884 mc->mc_rotor = NULL;
885 } else {
886 mc->mc_rotor = mgnext;

--- 125 unchanged lines hidden (view full) ---

1012 mutex_enter(&mg->mg_lock);
1013 ASSERT(msp->ms_group == mg);
1014 avl_remove(&mg->mg_metaslab_tree, msp);
1015 msp->ms_group = NULL;
1016 mutex_exit(&mg->mg_lock);
1017}
1018
1019static void
1020metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1021{
1022 ASSERT(MUTEX_HELD(&mg->mg_lock));
1023 ASSERT(msp->ms_group == mg);
1024 avl_remove(&mg->mg_metaslab_tree, msp);
1025 msp->ms_weight = weight;
1026 avl_add(&mg->mg_metaslab_tree, msp);
1027
1028}
1029
1030static void
943metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
944{
945 /*
946 * Although in principle the weight can be any value, in
947 * practice we do not use values in the range [1, 511].
948 */
949 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
950 ASSERT(MUTEX_HELD(&msp->ms_lock));
951
952 mutex_enter(&mg->mg_lock);
1031metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1032{
1033 /*
1034 * Although in principle the weight can be any value, in
1035 * practice we do not use values in the range [1, 511].
1036 */
1037 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
1038 ASSERT(MUTEX_HELD(&msp->ms_lock));
1039
1040 mutex_enter(&mg->mg_lock);
953 ASSERT(msp->ms_group == mg);
954 avl_remove(&mg->mg_metaslab_tree, msp);
955 msp->ms_weight = weight;
956 avl_add(&mg->mg_metaslab_tree, msp);
1041 metaslab_group_sort_impl(mg, msp, weight);
957 mutex_exit(&mg->mg_lock);
958}
959
960/*
961 * Calculate the fragmentation for a given metaslab group. We can use
962 * a simple average here since all metaslabs within the group must have
963 * the same size. The return value will be a value between 0 and 100
964 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this

--- 31 unchanged lines hidden (view full) ---

996 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
997 * that can still handle allocations. If the allocation throttle is enabled
998 * then we skip allocations to devices that have reached their maximum
999 * allocation queue depth unless the selected metaslab group is the only
1000 * eligible group remaining.
1001 */
1002static boolean_t
1003metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1042 mutex_exit(&mg->mg_lock);
1043}
1044
1045/*
1046 * Calculate the fragmentation for a given metaslab group. We can use
1047 * a simple average here since all metaslabs within the group must have
1048 * the same size. The return value will be a value between 0 and 100
1049 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this

--- 31 unchanged lines hidden (view full) ---

1081 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
1082 * that can still handle allocations. If the allocation throttle is enabled
1083 * then we skip allocations to devices that have reached their maximum
1084 * allocation queue depth unless the selected metaslab group is the only
1085 * eligible group remaining.
1086 */
1087static boolean_t
1088metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1004 uint64_t psize)
1089 uint64_t psize, int allocator)
1005{
1006 spa_t *spa = mg->mg_vd->vdev_spa;
1007 metaslab_class_t *mc = mg->mg_class;
1008
1009 /*
1010 * We can only consider skipping this metaslab group if it's
1011 * in the normal metaslab class and there are other metaslab
1012 * groups to select from. Otherwise, we always consider it eligible

--- 12 unchanged lines hidden (view full) ---

1025 * If all metaslab groups are no longer considered allocatable
1026 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1027 * gang block size then we allow allocations on this metaslab group
1028 * regardless of the mg_allocatable or throttle settings.
1029 */
1030 if (mg->mg_allocatable) {
1031 metaslab_group_t *mgp;
1032 int64_t qdepth;
1090{
1091 spa_t *spa = mg->mg_vd->vdev_spa;
1092 metaslab_class_t *mc = mg->mg_class;
1093
1094 /*
1095 * We can only consider skipping this metaslab group if it's
1096 * in the normal metaslab class and there are other metaslab
1097 * groups to select from. Otherwise, we always consider it eligible

--- 12 unchanged lines hidden (view full) ---

1110 * If all metaslab groups are no longer considered allocatable
1111 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1112 * gang block size then we allow allocations on this metaslab group
1113 * regardless of the mg_allocatable or throttle settings.
1114 */
1115 if (mg->mg_allocatable) {
1116 metaslab_group_t *mgp;
1117 int64_t qdepth;
1033 uint64_t qmax = mg->mg_max_alloc_queue_depth;
1118 uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
1034
1035 if (!mc->mc_alloc_throttle_enabled)
1036 return (B_TRUE);
1037
1038 /*
1039 * If this metaslab group does not have any free space, then
1040 * there is no point in looking further.
1041 */
1042 if (mg->mg_no_free_space)
1043 return (B_FALSE);
1044
1119
1120 if (!mc->mc_alloc_throttle_enabled)
1121 return (B_TRUE);
1122
1123 /*
1124 * If this metaslab group does not have any free space, then
1125 * there is no point in looking further.
1126 */
1127 if (mg->mg_no_free_space)
1128 return (B_FALSE);
1129
1045 qdepth = refcount_count(&mg->mg_alloc_queue_depth);
1130 qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]);
1046
1047 /*
1048 * If this metaslab group is below its qmax or it's
1049 * the only allocatable metaslab group, then attempt
1050 * to allocate from it.
1051 */
1052 if (qdepth < qmax || mc->mc_alloc_groups == 1)
1053 return (B_TRUE);
1054 ASSERT3U(mc->mc_alloc_groups, >, 1);
1055
1056 /*
1057 * Since this metaslab group is at or over its qmax, we
1058 * need to determine if there are metaslab groups after this
1059 * one that might be able to handle this allocation. This is
1060 * racy since we can't hold the locks for all metaslab
1061 * groups at the same time when we make this check.
1062 */
1063 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
1131
1132 /*
1133 * If this metaslab group is below its qmax or it's
1134 * the only allocatable metaslab group, then attempt
1135 * to allocate from it.
1136 */
1137 if (qdepth < qmax || mc->mc_alloc_groups == 1)
1138 return (B_TRUE);
1139 ASSERT3U(mc->mc_alloc_groups, >, 1);
1140
1141 /*
1142 * Since this metaslab group is at or over its qmax, we
1143 * need to determine if there are metaslab groups after this
1144 * one that might be able to handle this allocation. This is
1145 * racy since we can't hold the locks for all metaslab
1146 * groups at the same time when we make this check.
1147 */
1148 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
1064 qmax = mgp->mg_max_alloc_queue_depth;
1149 qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
1065
1150
1066 qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
1151 qdepth = refcount_count(
1152 &mgp->mg_alloc_queue_depth[allocator]);
1067
1068 /*
1069 * If there is another metaslab group that
1070 * might be able to handle the allocation, then
1071 * we return false so that we skip this group.
1072 */
1073 if (qdepth < qmax && !mgp->mg_no_free_space)
1074 return (B_FALSE);

--- 391 unchanged lines hidden (view full) ---

1466
1467 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1468 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1469 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1470 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1471 ms->ms_id = id;
1472 ms->ms_start = id << vd->vdev_ms_shift;
1473 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1153
1154 /*
1155 * If there is another metaslab group that
1156 * might be able to handle the allocation, then
1157 * we return false so that we skip this group.
1158 */
1159 if (qdepth < qmax && !mgp->mg_no_free_space)
1160 return (B_FALSE);

--- 391 unchanged lines hidden (view full) ---

1552
1553 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1554 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1555 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1556 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1557 ms->ms_id = id;
1558 ms->ms_start = id << vd->vdev_ms_shift;
1559 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1560 ms->ms_allocator = -1;
1561 ms->ms_new = B_TRUE;
1474
1475 /*
1476 * We only open space map objects that already exist. All others
1477 * will be opened when we finally allocate an object for it.
1478 */
1479 if (object != 0) {
1480 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1481 ms->ms_size, vd->vdev_ashift);

--- 80 unchanged lines hidden (view full) ---

1562 ASSERT0(msp->ms_deferspace);
1563
1564 range_tree_destroy(msp->ms_checkpointing);
1565
1566 mutex_exit(&msp->ms_lock);
1567 cv_destroy(&msp->ms_load_cv);
1568 mutex_destroy(&msp->ms_lock);
1569 mutex_destroy(&msp->ms_sync_lock);
1562
1563 /*
1564 * We only open space map objects that already exist. All others
1565 * will be opened when we finally allocate an object for it.
1566 */
1567 if (object != 0) {
1568 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1569 ms->ms_size, vd->vdev_ashift);

--- 80 unchanged lines hidden (view full) ---

1650 ASSERT0(msp->ms_deferspace);
1651
1652 range_tree_destroy(msp->ms_checkpointing);
1653
1654 mutex_exit(&msp->ms_lock);
1655 cv_destroy(&msp->ms_load_cv);
1656 mutex_destroy(&msp->ms_lock);
1657 mutex_destroy(&msp->ms_sync_lock);
1658 ASSERT3U(msp->ms_allocator, ==, -1);
1570
1571 kmem_free(msp, sizeof (metaslab_t));
1572}
1573
1574#define FRAGMENTATION_TABLE_SIZE 17
1575
1576/*
1577 * This table defines a segment size based fragmentation metric that will

--- 380 unchanged lines hidden (view full) ---

1958 weight = metaslab_segment_weight(msp);
1959 } else {
1960 weight = metaslab_space_weight(msp);
1961 }
1962 return (weight);
1963}
1964
1965static int
1659
1660 kmem_free(msp, sizeof (metaslab_t));
1661}
1662
1663#define FRAGMENTATION_TABLE_SIZE 17
1664
1665/*
1666 * This table defines a segment size based fragmentation metric that will

--- 380 unchanged lines hidden (view full) ---

2047 weight = metaslab_segment_weight(msp);
2048 } else {
2049 weight = metaslab_space_weight(msp);
2050 }
2051 return (weight);
2052}
2053
2054static int
1966metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
2055metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2056 int allocator, uint64_t activation_weight)
1967{
2057{
2058 /*
2059 * If we're activating for the claim code, we don't want to actually
2060 * set the metaslab up for a specific allocator.
2061 */
2062 if (activation_weight == METASLAB_WEIGHT_CLAIM)
2063 return (0);
2064 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2065 mg->mg_primaries : mg->mg_secondaries);
2066
1968 ASSERT(MUTEX_HELD(&msp->ms_lock));
2067 ASSERT(MUTEX_HELD(&msp->ms_lock));
2068 mutex_enter(&mg->mg_lock);
2069 if (arr[allocator] != NULL) {
2070 mutex_exit(&mg->mg_lock);
2071 return (EEXIST);
2072 }
1969
2073
2074 arr[allocator] = msp;
2075 ASSERT3S(msp->ms_allocator, ==, -1);
2076 msp->ms_allocator = allocator;
2077 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
2078 mutex_exit(&mg->mg_lock);
2079
2080 return (0);
2081}
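metaslab_activate_allocator() is the one place a loaded metaslab gets bound to a specific allocator: the mg_primaries[]/mg_secondaries[] slot is claimed under mg_lock, and an EEXIST return tells the caller that another thread won the race and it should reselect. Stripped of the metaslab details, the claim half of that pattern reduces to roughly the following (hypothetical names, pthreads standing in for the kernel mutex):

#include <errno.h>
#include <pthread.h>
#include <stddef.h>

typedef struct slot_table {
	pthread_mutex_t	st_lock;
	void		**st_slot;	/* one owner pointer per allocator */
} slot_table_t;

/*
 * Bind 'owner' to per-allocator slot 'idx'.  EEXIST means some other
 * thread claimed the slot first and the caller should pick again.
 */
static int
slot_claim(slot_table_t *st, int idx, void *owner)
{
	int error = 0;

	pthread_mutex_lock(&st->st_lock);
	if (st->st_slot[idx] != NULL)
		error = EEXIST;
	else
		st->st_slot[idx] = owner;
	pthread_mutex_unlock(&st->st_lock);
	return (error);
}

The release side is the mirror image, which is what metaslab_passivate_allocator() further down does when the metaslab is handed back.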
2082
2083static int
2084metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
2085{
2086 ASSERT(MUTEX_HELD(&msp->ms_lock));
2087
1970 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
2088 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
2089 int error = 0;
1971 metaslab_load_wait(msp);
1972 if (!msp->ms_loaded) {
2090 metaslab_load_wait(msp);
2091 if (!msp->ms_loaded) {
1973 int error = metaslab_load(msp);
1974 if (error) {
2092 if ((error = metaslab_load(msp)) != 0) {
1975 metaslab_group_sort(msp->ms_group, msp, 0);
1976 return (error);
1977 }
1978 }
2093 metaslab_group_sort(msp->ms_group, msp, 0);
2094 return (error);
2095 }
2096 }
2097 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
2098 /*
2099 * The metaslab was activated for another allocator
2100 * while we were waiting, we should reselect.
2101 */
2102 return (EBUSY);
2103 }
2104 if ((error = metaslab_activate_allocator(msp->ms_group, msp,
2105 allocator, activation_weight)) != 0) {
2106 return (error);
2107 }
1979
1980 msp->ms_activation_weight = msp->ms_weight;
1981 metaslab_group_sort(msp->ms_group, msp,
1982 msp->ms_weight | activation_weight);
1983 }
1984 ASSERT(msp->ms_loaded);
1985 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1986
1987 return (0);
1988}
1989
1990static void
2108
2109 msp->ms_activation_weight = msp->ms_weight;
2110 metaslab_group_sort(msp->ms_group, msp,
2111 msp->ms_weight | activation_weight);
2112 }
2113 ASSERT(msp->ms_loaded);
2114 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
2115
2116 return (0);
2117}
2118
2119static void
2120metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2121 uint64_t weight)
2122{
2123 ASSERT(MUTEX_HELD(&msp->ms_lock));
2124 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
2125 metaslab_group_sort(mg, msp, weight);
2126 return;
2127 }
2128
2129 mutex_enter(&mg->mg_lock);
2130 ASSERT3P(msp->ms_group, ==, mg);
2131 if (msp->ms_primary) {
2132 ASSERT3U(0, <=, msp->ms_allocator);
2133 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
2134 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
2135 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
2136 mg->mg_primaries[msp->ms_allocator] = NULL;
2137 } else {
2138 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
2139 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
2140 mg->mg_secondaries[msp->ms_allocator] = NULL;
2141 }
2142 msp->ms_allocator = -1;
2143 metaslab_group_sort_impl(mg, msp, weight);
2144 mutex_exit(&mg->mg_lock);
2145}
2146
2147static void
1991metaslab_passivate(metaslab_t *msp, uint64_t weight)
1992{
1993 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
1994
1995 /*
1996 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1997 * this metaslab again. In that case, it had better be empty,
1998 * or we would be leaving space on the table.
1999 */
2000 ASSERT(size >= SPA_MINBLOCKSIZE ||
2001 range_tree_is_empty(msp->ms_allocatable));
2002 ASSERT0(weight & METASLAB_ACTIVE_MASK);
2003
2004 msp->ms_activation_weight = 0;
2148metaslab_passivate(metaslab_t *msp, uint64_t weight)
2149{
2150 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
2151
2152 /*
2153 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
2154 * this metaslab again. In that case, it had better be empty,
2155 * or we would be leaving space on the table.
2156 */
2157 ASSERT(size >= SPA_MINBLOCKSIZE ||
2158 range_tree_is_empty(msp->ms_allocatable));
2159 ASSERT0(weight & METASLAB_ACTIVE_MASK);
2160
2161 msp->ms_activation_weight = 0;
2005 metaslab_group_sort(msp->ms_group, msp, weight);
2162 metaslab_passivate_allocator(msp->ms_group, msp, weight);
2006 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
2007}
2008
2009/*
2010 * Segment-based metaslabs are activated once and remain active until
2011 * we either fail an allocation attempt (similar to space-based metaslabs)
2012 * or have exhausted the free space in zfs_metaslab_switch_threshold
2013 * buckets since the metaslab was activated. This function checks to see

--- 537 unchanged lines hidden (view full) ---

2551 if (msp->ms_deferspace != 0) {
2552 /*
2553 * Keep syncing this metaslab until all deferred frees
2554 * are back in circulation.
2555 */
2556 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2557 }
2558
2163 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
2164}
2165
2166/*
2167 * Segment-based metaslabs are activated once and remain active until
2168 * we either fail an allocation attempt (similar to space-based metaslabs)
2169 * or have exhausted the free space in zfs_metaslab_switch_threshold
2170 * buckets since the metaslab was activated. This function checks to see

--- 537 unchanged lines hidden (view full) ---

2708 if (msp->ms_deferspace != 0) {
2709 /*
2710 * Keep syncing this metaslab until all deferred frees
2711 * are back in circulation.
2712 */
2713 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2714 }
2715
2716 if (msp->ms_new) {
2717 msp->ms_new = B_FALSE;
2718 mutex_enter(&mg->mg_lock);
2719 mg->mg_ms_ready++;
2720 mutex_exit(&mg->mg_lock);
2721 }
2559 /*
2560 * Calculate the new weights before unloading any metaslabs.
2561 * This will give us the most accurate weighting.
2562 */
2722 /*
2723 * Calculate the new weights before unloading any metaslabs.
2724 * This will give us the most accurate weighting.
2725 */
2563 metaslab_group_sort(mg, msp, metaslab_weight(msp));
2726 metaslab_group_sort(mg, msp, metaslab_weight(msp) |
2727 (msp->ms_weight & METASLAB_ACTIVE_MASK));
2564
2565 /*
2566 * If the metaslab is loaded and we've not tried to load or allocate
2567 * from it in 'metaslab_unload_delay' txgs, then unload it.
2568 */
2569 if (msp->ms_loaded &&
2570 msp->ms_selected_txg + metaslab_unload_delay < txg) {
2571 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2572 VERIFY0(range_tree_space(
2573 msp->ms_allocating[(txg + t) & TXG_MASK]));
2574 }
2728
2729 /*
2730 * If the metaslab is loaded and we've not tried to load or allocate
2731 * from it in 'metaslab_unload_delay' txgs, then unload it.
2732 */
2733 if (msp->ms_loaded &&
2734 msp->ms_selected_txg + metaslab_unload_delay < txg) {
2735 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2736 VERIFY0(range_tree_space(
2737 msp->ms_allocating[(txg + t) & TXG_MASK]));
2738 }
2739 if (msp->ms_allocator != -1) {
2740 metaslab_passivate(msp, msp->ms_weight &
2741 ~METASLAB_ACTIVE_MASK);
2742 }
2575
2576 if (!metaslab_debug_unload)
2577 metaslab_unload(msp);
2578 }
2579
2580 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2581 ASSERT0(range_tree_space(msp->ms_freeing));
2582 ASSERT0(range_tree_space(msp->ms_freed));

--- 77 unchanged lines hidden (view full) ---

2660 metaslab_alloc_trace_cache = NULL;
2661}
2662
2663/*
2664 * Add an allocation trace element to the allocation tracing list.
2665 */
2666static void
2667metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2743
2744 if (!metaslab_debug_unload)
2745 metaslab_unload(msp);
2746 }
2747
2748 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2749 ASSERT0(range_tree_space(msp->ms_freeing));
2750 ASSERT0(range_tree_space(msp->ms_freed));

--- 77 unchanged lines hidden (view full) ---

2828 metaslab_alloc_trace_cache = NULL;
2829}
2830
2831/*
2832 * Add an allocation trace element to the allocation tracing list.
2833 */
2834static void
2835metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2668 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2836 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
2837 int allocator)
2669{
2670 if (!metaslab_trace_enabled)
2671 return;
2672
2673 /*
2674 * When the tracing list reaches its maximum we remove
2675 * the second element in the list before adding a new one.
2676 * By removing the second element we preserve the original

--- 16 unchanged lines hidden (view full) ---

2693 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2694 list_link_init(&mat->mat_list_node);
2695 mat->mat_mg = mg;
2696 mat->mat_msp = msp;
2697 mat->mat_size = psize;
2698 mat->mat_dva_id = dva_id;
2699 mat->mat_offset = offset;
2700 mat->mat_weight = 0;
2838{
2839 if (!metaslab_trace_enabled)
2840 return;
2841
2842 /*
2843 * When the tracing list reaches its maximum we remove
2844 * the second element in the list before adding a new one.
2845 * By removing the second element we preserve the original

--- 16 unchanged lines hidden (view full) ---

2862 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2863 list_link_init(&mat->mat_list_node);
2864 mat->mat_mg = mg;
2865 mat->mat_msp = msp;
2866 mat->mat_size = psize;
2867 mat->mat_dva_id = dva_id;
2868 mat->mat_offset = offset;
2869 mat->mat_weight = 0;
2870 mat->mat_allocator = allocator;
2701
2702 if (msp != NULL)
2703 mat->mat_weight = msp->ms_weight;
2704
2705 /*
2706 * The list is part of the zio so locking is not required. Only
2707 * a single thread will perform allocations for a given zio.
2708 */

--- 24 unchanged lines hidden (view full) ---

2733
2734/*
2735 * ==========================================================================
2736 * Metaslab block operations
2737 * ==========================================================================
2738 */
2739
2740static void
2871
2872 if (msp != NULL)
2873 mat->mat_weight = msp->ms_weight;
2874
2875 /*
2876 * The list is part of the zio so locking is not required. Only
2877 * a single thread will perform allocations for a given zio.
2878 */

--- 24 unchanged lines hidden (view full) ---

2903
2904/*
2905 * ==========================================================================
2906 * Metaslab block operations
2907 * ==========================================================================
2908 */
2909
2910static void
2741metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2911metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
2912 int allocator)
2742{
2743 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2913{
2914 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2744 flags & METASLAB_DONT_THROTTLE)
2915 (flags & METASLAB_DONT_THROTTLE))
2745 return;
2746
2747 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2748 if (!mg->mg_class->mc_alloc_throttle_enabled)
2749 return;
2750
2916 return;
2917
2918 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2919 if (!mg->mg_class->mc_alloc_throttle_enabled)
2920 return;
2921
2751 (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2922 (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
2752}
2753
2923}
2924
2925static void
2926metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
2927{
2928 uint64_t max = mg->mg_max_alloc_queue_depth;
2929 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
2930 while (cur < max) {
2931 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
2932 cur, cur + 1) == cur) {
2933 atomic_inc_64(
2934 &mg->mg_class->mc_alloc_max_slots[allocator]);
2935 return;
2936 }
2937 cur = mg->mg_cur_max_alloc_queue_depth[allocator];
2938 }
2939}
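metaslab_group_increment_qdepth() ratchets the per-allocator queue-depth ceiling up by one, but never past mg_max_alloc_queue_depth, using a compare-and-swap loop so that completing I/Os can grow the limit without taking a lock. The same bounded lock-free increment, written against C11 atomics rather than the illumos atomic_cas_64() (a sketch, not the kernel code):

#include <stdatomic.h>
#include <stdint.h>

/*
 * Raise *cur by one, but never past 'max'.  If another thread changes the
 * value between our read and our CAS, the failed CAS reloads 'seen' and we
 * retry; once the ceiling is reached we return without looping.
 */
static void
bounded_increment(_Atomic uint64_t *cur, uint64_t max)
{
	uint64_t seen = atomic_load(cur);

	while (seen < max) {
		if (atomic_compare_exchange_weak(cur, &seen, seen + 1))
			return;	/* we won the race */
	}
}

In the kernel version a successful bump also raises the class-wide mc_alloc_max_slots[allocator] ceiling, which is what metaslab_class_throttle_reserve() later checks reservations against.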
2940
2754void
2941void
2755metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2942metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
2943 int allocator, boolean_t io_complete)
2756{
2757 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2944{
2945 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2758 flags & METASLAB_DONT_THROTTLE)
2946 (flags & METASLAB_DONT_THROTTLE))
2759 return;
2760
2761 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2762 if (!mg->mg_class->mc_alloc_throttle_enabled)
2763 return;
2764
2947 return;
2948
2949 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2950 if (!mg->mg_class->mc_alloc_throttle_enabled)
2951 return;
2952
2765 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2953 (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
2954 if (io_complete)
2955 metaslab_group_increment_qdepth(mg, allocator);
2766}
2767
2768void
2956}
2957
2958void
2769metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2959metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
2960 int allocator)
2770{
2771#ifdef ZFS_DEBUG
2772 const dva_t *dva = bp->blk_dva;
2773 int ndvas = BP_GET_NDVAS(bp);
2774
2775 for (int d = 0; d < ndvas; d++) {
2776 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2777 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2961{
2962#ifdef ZFS_DEBUG
2963 const dva_t *dva = bp->blk_dva;
2964 int ndvas = BP_GET_NDVAS(bp);
2965
2966 for (int d = 0; d < ndvas; d++) {
2967 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2968 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2778 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2969 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator],
2970 tag));
2779 }
2780#endif
2781}
2782
2783static uint64_t
2784metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2785{
2786 uint64_t start;

--- 25 unchanged lines hidden (view full) ---

2812 /*
2813 * Now that we've attempted the allocation we need to update the
2814 * metaslab's maximum block size since it may have changed.
2815 */
2816 msp->ms_max_size = metaslab_block_maxsize(msp);
2817 return (start);
2818}
2819
2971 }
2972#endif
2973}
2974
2975static uint64_t
2976metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2977{
2978 uint64_t start;

--- 25 unchanged lines hidden (view full) ---

3004 /*
3005 * Now that we've attempted the allocation we need to update the
3006 * metaslab's maximum block size since it may have changed.
3007 */
3008 msp->ms_max_size = metaslab_block_maxsize(msp);
3009 return (start);
3010}
3011
3012/*
3013 * Find the metaslab with the highest weight that is less than what we've
3014 * already tried. In the common case, this means that we will examine each
3015 * metaslab at most once. Note that concurrent callers could reorder metaslabs
3016 * by activation/passivation once we have dropped the mg_lock. If a metaslab is
3017 * activated by another thread, and we fail to allocate from the metaslab we
3018 * have selected, we may not try the newly-activated metaslab, and instead
3019 * activate another metaslab. This is not optimal, but generally does not cause
3020 * any problems (a possible exception being if every metaslab is completely full
3021 * except for the newly-activated metaslab which we fail to examine).
3022 */
3023static metaslab_t *
3024find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
3025 dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
3026 zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
3027{
3028 avl_index_t idx;
3029 avl_tree_t *t = &mg->mg_metaslab_tree;
3030 metaslab_t *msp = avl_find(t, search, &idx);
3031 if (msp == NULL)
3032 msp = avl_nearest(t, idx, AVL_AFTER);
3033
3034 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
3035 int i;
3036 if (!metaslab_should_allocate(msp, asize)) {
3037 metaslab_trace_add(zal, mg, msp, asize, d,
3038 TRACE_TOO_SMALL, allocator);
3039 continue;
3040 }
3041
3042 /*
3043 * If the selected metaslab is condensing, skip it.
3044 */
3045 if (msp->ms_condensing)
3046 continue;
3047
3048 *was_active = msp->ms_allocator != -1;
3049 /*
3050 * If we're activating as primary, this is our first allocation
3051 * from this disk, so we don't need to check how close we are.
3052 * If the metaslab under consideration was already active,
3053 * we're getting desperate enough to steal another allocator's
3054 * metaslab, so we still don't care about distances.
3055 */
3056 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
3057 break;
3058
3059 uint64_t target_distance = min_distance
3060 + (space_map_allocated(msp->ms_sm) != 0 ? 0 :
3061 min_distance >> 1);
3062
3063 for (i = 0; i < d; i++) {
3064 if (metaslab_distance(msp, &dva[i]) < target_distance)
3065 break;
3066 }
3067 if (i == d)
3068 break;
3069 }
3070
3071 if (msp != NULL) {
3072 search->ms_weight = msp->ms_weight;
3073 search->ms_start = msp->ms_start + 1;
3074 search->ms_allocator = msp->ms_allocator;
3075 search->ms_primary = msp->ms_primary;
3076 }
3077 return (msp);
3078}
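find_valid_metaslab() factors the tree walk out of metaslab_group_alloc_normal(): the caller passes a throwaway 'search' metaslab whose sort keys (ms_weight, ms_start, and now ms_allocator/ms_primary) are bumped just past the last candidate tried, so each pass resumes where the previous one stopped even though mg_lock is dropped in between attempts. The resume-from-a-sentinel idiom on an illumos AVL tree looks roughly like this (assuming the avl(9F) interfaces; simplified keys):

#include <sys/avl.h>
#include <stdint.h>

typedef struct item {
	uint64_t	i_weight;	/* primary key, sorted descending */
	uint64_t	i_id;		/* tie-breaker, keeps keys unique */
	avl_node_t	i_node;
} item_t;

/*
 * Return the first item whose keys sort at or after 'search'.  The caller
 * copies the keys of whatever it just tried into 'search' (nudged past the
 * exact value) before the next call, so repeated passes never revisit an
 * entry even though the tree lock is dropped between attempts.
 */
static item_t *
next_candidate(avl_tree_t *t, item_t *search)
{
	avl_index_t where;
	item_t *it = avl_find(t, search, &where);

	if (it == NULL)
		it = avl_nearest(t, where, AVL_AFTER);
	return (it);
}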
3079
3080/* ARGSUSED */
2820static uint64_t
2821metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
3081static uint64_t
3082metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2822 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
3083 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
3084 int allocator)
2823{
2824 metaslab_t *msp = NULL;
2825 uint64_t offset = -1ULL;
2826 uint64_t activation_weight;
3085{
3086 metaslab_t *msp = NULL;
3087 uint64_t offset = -1ULL;
3088 uint64_t activation_weight;
2827 uint64_t target_distance;
2828 int i;
3089 boolean_t tertiary = B_FALSE;
2829
2830 activation_weight = METASLAB_WEIGHT_PRIMARY;
3090
3091 activation_weight = METASLAB_WEIGHT_PRIMARY;
2831 for (i = 0; i < d; i++) {
2832 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
3092 for (int i = 0; i < d; i++) {
3093 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
3094 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2833 activation_weight = METASLAB_WEIGHT_SECONDARY;
3095 activation_weight = METASLAB_WEIGHT_SECONDARY;
3096 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
3097 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
3098 tertiary = B_TRUE;
2834 break;
2835 }
2836 }
2837
3099 break;
3100 }
3101 }
3102
3103 /*
3104 * If we don't have enough metaslabs active to fill the entire array, we
3105 * just use the 0th slot.
3106 */
3107 if (mg->mg_ms_ready < mg->mg_allocators * 2) {
3108 tertiary = B_FALSE;
3109 allocator = 0;
3110 }
3111
3112 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
3113
2838 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2839 search->ms_weight = UINT64_MAX;
2840 search->ms_start = 0;
3114 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
3115 search->ms_weight = UINT64_MAX;
3116 search->ms_start = 0;
3117 /*
3118 * At the end of the metaslab tree are the already-active metaslabs,
3119 * first the primaries, then the secondaries. When we resume searching
3120 * through the tree, we need to consider ms_allocator and ms_primary so
3121 * we start in the location right after where we left off, and don't
3122 * accidentally loop forever considering the same metaslabs.
3123 */
3124 search->ms_allocator = -1;
3125 search->ms_primary = B_TRUE;
2841 for (;;) {
3126 for (;;) {
2842 boolean_t was_active;
2843 avl_tree_t *t = &mg->mg_metaslab_tree;
2844 avl_index_t idx;
3127 boolean_t was_active = B_FALSE;
2845
2846 mutex_enter(&mg->mg_lock);
2847
3128
3129 mutex_enter(&mg->mg_lock);
3130
2848 /*
2849 * Find the metaslab with the highest weight that is less
2850 * than what we've already tried. In the common case, this
2851 * means that we will examine each metaslab at most once.
2852 * Note that concurrent callers could reorder metaslabs
2853 * by activation/passivation once we have dropped the mg_lock.
2854 * If a metaslab is activated by another thread, and we fail
2855 * to allocate from the metaslab we have selected, we may
2856 * not try the newly-activated metaslab, and instead activate
2857 * another metaslab. This is not optimal, but generally
2858 * does not cause any problems (a possible exception being
2859 * if every metaslab is completely full except for the
2861 * newly-activated metaslab which we fail to examine).
2861 */
2862 msp = avl_find(t, search, &idx);
2863 if (msp == NULL)
2864 msp = avl_nearest(t, idx, AVL_AFTER);
2865 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2866
2867 if (!metaslab_should_allocate(msp, asize)) {
2868 metaslab_trace_add(zal, mg, msp, asize, d,
2869 TRACE_TOO_SMALL);
2870 continue;
2871 }
2872
2873 /*
2874 * If the selected metaslab is condensing, skip it.
2875 */
2876 if (msp->ms_condensing)
2877 continue;
2878
2879 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2880 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2881 break;
2882
2883 target_distance = min_distance +
2884 (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2885 min_distance >> 1);
2886
2887 for (i = 0; i < d; i++) {
2888 if (metaslab_distance(msp, &dva[i]) <
2889 target_distance)
2890 break;
2891 }
2892 if (i == d)
2893 break;
3131 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
3132 mg->mg_primaries[allocator] != NULL) {
3133 msp = mg->mg_primaries[allocator];
3134 was_active = B_TRUE;
3135 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
3136 mg->mg_secondaries[allocator] != NULL && !tertiary) {
3137 msp = mg->mg_secondaries[allocator];
3138 was_active = B_TRUE;
3139 } else {
3140 msp = find_valid_metaslab(mg, activation_weight, dva, d,
3141 min_distance, asize, allocator, zal, search,
3142 &was_active);
2894 }
3143 }
3144
2895 mutex_exit(&mg->mg_lock);
2896 if (msp == NULL) {
2897 kmem_free(search, sizeof (*search));
2898 return (-1ULL);
2899 }
3145 mutex_exit(&mg->mg_lock);
3146 if (msp == NULL) {
3147 kmem_free(search, sizeof (*search));
3148 return (-1ULL);
3149 }
2900 search->ms_weight = msp->ms_weight;
2901 search->ms_start = msp->ms_start + 1;
2902
2903 mutex_enter(&msp->ms_lock);
3150
3151 mutex_enter(&msp->ms_lock);
2904
2905 /*
2906 * Ensure that the metaslab we have selected is still
2907 * capable of handling our request. It's possible that
2908 * another thread may have changed the weight while we
2909 * were blocked on the metaslab lock. We check the
2910 * active status first to see if we need to reselect
2911 * a new metaslab.
2912 */
2913 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2914 mutex_exit(&msp->ms_lock);
2915 continue;
2916 }
2917
3152 /*
3153 * Ensure that the metaslab we have selected is still
3154 * capable of handling our request. It's possible that
3155 * another thread may have changed the weight while we
3156 * were blocked on the metaslab lock. We check the
3157 * active status first to see if we need to reselect
3158 * a new metaslab.
3159 */
3160 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
3161 mutex_exit(&msp->ms_lock);
3162 continue;
3163 }
3164
2918 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2919 activation_weight == METASLAB_WEIGHT_PRIMARY) {
2920 metaslab_passivate(msp,
2921 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
3165 /*
3166 * If the metaslab is freshly activated for an allocator that
3167 * isn't the one we're allocating from, or if it's a primary and
3168 * we're seeking a secondary (or vice versa), we go back and
3169 * select a new metaslab.
3170 */
3171 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
3172 (msp->ms_allocator != -1) &&
3173 (msp->ms_allocator != allocator || ((activation_weight ==
3174 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
2922 mutex_exit(&msp->ms_lock);
2923 continue;
2924 }
2925
3175 mutex_exit(&msp->ms_lock);
3176 continue;
3177 }
3178
2926 if (metaslab_activate(msp, activation_weight) != 0) {
3179 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
3180 metaslab_passivate(msp, msp->ms_weight &
3181 ~METASLAB_WEIGHT_CLAIM);
2927 mutex_exit(&msp->ms_lock);
2928 continue;
2929 }
3182 mutex_exit(&msp->ms_lock);
3183 continue;
3184 }
3185
3186 if (metaslab_activate(msp, allocator, activation_weight) != 0) {
3187 mutex_exit(&msp->ms_lock);
3188 continue;
3189 }
3190
2930 msp->ms_selected_txg = txg;
2931
2932 /*
2933 * Now that we have the lock, recheck to see if we should
2934 * continue to use this metaslab for this allocation. The
2935 * the metaslab is now loaded so metaslab_should_allocate() can
2936 * accurately determine if the allocation attempt should
2937 * proceed.
2938 */
2939 if (!metaslab_should_allocate(msp, asize)) {
2940 /* Passivate this metaslab and select a new one. */
2941 metaslab_trace_add(zal, mg, msp, asize, d,
3191 msp->ms_selected_txg = txg;
3192
3193 /*
3194 * Now that we have the lock, recheck to see if we should
3195 * continue to use this metaslab for this allocation. The
3196 * metaslab is now loaded so metaslab_should_allocate() can
3197 * accurately determine if the allocation attempt should
3198 * proceed.
3199 */
3200 if (!metaslab_should_allocate(msp, asize)) {
3201 /* Passivate this metaslab and select a new one. */
3202 metaslab_trace_add(zal, mg, msp, asize, d,
2942 TRACE_TOO_SMALL);
3203 TRACE_TOO_SMALL, allocator);
2943 goto next;
2944 }
2945
2946 /*
2947 * If this metaslab is currently condensing then pick again as
2948 * we can't manipulate this metaslab until it's committed
2949 * to disk.
2950 */
2951 if (msp->ms_condensing) {
2952 metaslab_trace_add(zal, mg, msp, asize, d,
3204 goto next;
3205 }
3206
3207 /*
3208 * If this metaslab is currently condensing then pick again as
3209 * we can't manipulate this metaslab until it's committed
3210 * to disk.
3211 */
3212 if (msp->ms_condensing) {
3213 metaslab_trace_add(zal, mg, msp, asize, d,
2953 TRACE_CONDENSING);
3214 TRACE_CONDENSING, allocator);
3215 metaslab_passivate(msp, msp->ms_weight &
3216 ~METASLAB_ACTIVE_MASK);
2954 mutex_exit(&msp->ms_lock);
2955 continue;
2956 }
2957
2958 offset = metaslab_block_alloc(msp, asize, txg);
3217 mutex_exit(&msp->ms_lock);
3218 continue;
3219 }
3220
3221 offset = metaslab_block_alloc(msp, asize, txg);
2959 metaslab_trace_add(zal, mg, msp, asize, d, offset);
3222 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
2960
2961 if (offset != -1ULL) {
2962 /* Proactively passivate the metaslab, if needed */
2963 metaslab_segment_may_passivate(msp);
2964 break;
2965 }
2966next:
2967 ASSERT(msp->ms_loaded);

--- 39 unchanged lines hidden (view full) ---

3007 }
3008 mutex_exit(&msp->ms_lock);
3009 kmem_free(search, sizeof (*search));
3010 return (offset);
3011}
3012
3013static uint64_t
3014metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
3223
3224 if (offset != -1ULL) {
3225 /* Proactively passivate the metaslab, if needed */
3226 metaslab_segment_may_passivate(msp);
3227 break;
3228 }
3229next:
3230 ASSERT(msp->ms_loaded);

--- 39 unchanged lines hidden (view full) ---

3270 }
3271 mutex_exit(&msp->ms_lock);
3272 kmem_free(search, sizeof (*search));
3273 return (offset);
3274}
3275
3276static uint64_t
3277metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
3015 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
3278 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
3279 int allocator)
3016{
3017 uint64_t offset;
3018 ASSERT(mg->mg_initialized);
3019
3020 offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
3280{
3281 uint64_t offset;
3282 ASSERT(mg->mg_initialized);
3283
3284 offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
3021 min_distance, dva, d);
3285 min_distance, dva, d, allocator);
3022
3023 mutex_enter(&mg->mg_lock);
3024 if (offset == -1ULL) {
3025 mg->mg_failed_allocations++;
3026 metaslab_trace_add(zal, mg, NULL, asize, d,
3286
3287 mutex_enter(&mg->mg_lock);
3288 if (offset == -1ULL) {
3289 mg->mg_failed_allocations++;
3290 metaslab_trace_add(zal, mg, NULL, asize, d,
3027 TRACE_GROUP_FAILURE);
3291 TRACE_GROUP_FAILURE, allocator);
3028 if (asize == SPA_GANGBLOCKSIZE) {
3029 /*
3030 * This metaslab group was unable to allocate
3031 * the minimum gang block size so it must be out of
3032 * space. We must notify the allocation throttle
3033 * to start skipping allocation attempts to this
3034 * metaslab group until more space becomes available.
3035 * Note: this failure cannot be caused by the

--- 18 unchanged lines hidden (view full) ---

3054int ditto_same_vdev_distance_shift = 3;
3055
3056/*
3057 * Allocate a block for the specified i/o.
3058 */
3059int
3060metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
3061 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
3292 if (asize == SPA_GANGBLOCKSIZE) {
3293 /*
3294 * This metaslab group was unable to allocate
3295 * the minimum gang block size so it must be out of
3296 * space. We must notify the allocation throttle
3297 * to start skipping allocation attempts to this
3298 * metaslab group until more space becomes available.
3299 * Note: this failure cannot be caused by the

--- 18 unchanged lines hidden (view full) ---

3318int ditto_same_vdev_distance_shift = 3;
3319
3320/*
3321 * Allocate a block for the specified i/o.
3322 */
3323int
3324metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
3325 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
3062 zio_alloc_list_t *zal)
3326 zio_alloc_list_t *zal, int allocator)
3063{
3064 metaslab_group_t *mg, *rotor;
3065 vdev_t *vd;
3066 boolean_t try_hard = B_FALSE;
3067
3068 ASSERT(!DVA_IS_VALID(&dva[d]));
3069
3070 /*
3071 * For testing, make some blocks above a certain size be gang blocks.
3072 */
3073 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
3327{
3328 metaslab_group_t *mg, *rotor;
3329 vdev_t *vd;
3330 boolean_t try_hard = B_FALSE;
3331
3332 ASSERT(!DVA_IS_VALID(&dva[d]));
3333
3334 /*
3335 * For testing, make some blocks above a certain size be gang blocks.
3336 */
3337 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
3074 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
3338 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
3339 allocator);
3075 return (SET_ERROR(ENOSPC));
3076 }
3077
3078 /*
3079 * Start at the rotor and loop through all mgs until we find something.
3080 * Note that there's no locking on mc_rotor or mc_aliquot because
3081 * nothing actually breaks if we miss a few updates -- we just won't
3082 * allocate quite as evenly. It all balances out over time.

--- 69 unchanged lines hidden (view full) ---

3152 * Determine if the selected metaslab group is eligible
3153 * for allocations. If we're ganging then don't allow
3154 * this metaslab group to skip allocations since that would
3155 * inadvertently return ENOSPC and suspend the pool
3156 * even though space is still available.
3157 */
3158 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3159 allocatable = metaslab_group_allocatable(mg, rotor,
3340 return (SET_ERROR(ENOSPC));
3341 }
3342
3343 /*
3344 * Start at the rotor and loop through all mgs until we find something.
3345 * Note that there's no locking on mc_rotor or mc_aliquot because
3346 * nothing actually breaks if we miss a few updates -- we just won't
3347 * allocate quite as evenly. It all balances out over time.

--- 69 unchanged lines hidden (view full) ---

3417 * Determine if the selected metaslab group is eligible
3418 * for allocations. If we're ganging then don't allow
3419 * this metaslab group to skip allocations since that would
3420 * inadvertently return ENOSPC and suspend the pool
3421 * even though space is still available.
3422 */
3423 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3424 allocatable = metaslab_group_allocatable(mg, rotor,
3160 psize);
3425 psize, allocator);
3161 }
3162
3163 if (!allocatable) {
3164 metaslab_trace_add(zal, mg, NULL, psize, d,
3426 }
3427
3428 if (!allocatable) {
3429 metaslab_trace_add(zal, mg, NULL, psize, d,
3165 TRACE_NOT_ALLOCATABLE);
3430 TRACE_NOT_ALLOCATABLE, allocator);
3166 goto next;
3167 }
3168
3169 ASSERT(mg->mg_initialized);
3170
3171 /*
3172 * Avoid writing single-copy data to a failing,
3173 * non-redundant vdev, unless we've already tried all
3174 * other vdevs.
3175 */
3176 if ((vd->vdev_stat.vs_write_errors > 0 ||
3177 vd->vdev_state < VDEV_STATE_HEALTHY) &&
3178 d == 0 && !try_hard && vd->vdev_children == 0) {
3179 metaslab_trace_add(zal, mg, NULL, psize, d,
3431 goto next;
3432 }
3433
3434 ASSERT(mg->mg_initialized);
3435
3436 /*
3437 * Avoid writing single-copy data to a failing,
3438 * non-redundant vdev, unless we've already tried all
3439 * other vdevs.
3440 */
3441 if ((vd->vdev_stat.vs_write_errors > 0 ||
3442 vd->vdev_state < VDEV_STATE_HEALTHY) &&
3443 d == 0 && !try_hard && vd->vdev_children == 0) {
3444 metaslab_trace_add(zal, mg, NULL, psize, d,
3180 TRACE_VDEV_ERROR);
3445 TRACE_VDEV_ERROR, allocator);
3181 goto next;
3182 }
3183
3184 ASSERT(mg->mg_class == mc);
3185
3186 /*
3187 * If we don't need to try hard, then require that the
3188 * block be 1/8th of the device away from any other DVAs

--- 7 unchanged lines hidden (view full) ---

3196 if (distance <= (1ULL << vd->vdev_ms_shift))
3197 distance = 0;
3198 }
3199
3200 uint64_t asize = vdev_psize_to_asize(vd, psize);
3201 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3202
3203 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3446 goto next;
3447 }
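/*
 * Editor's note -- illustrative sketch.  The test above declines to place
 * the first DVA of a block (d == 0, possibly its only copy) on a leaf vdev
 * that has logged write errors or is not fully healthy, unless the try_hard
 * pass has already forced every vdev to be considered.  Hypothetical
 * standalone restatement of the predicate:
 */
#include <stdint.h>
#include <stdbool.h>

static bool
avoid_failing_vdev_sketch(uint64_t write_errors, bool vdev_healthy,
    int dva_index, bool try_hard, int vdev_children)
{
	bool vdev_suspect = (write_errors > 0 || !vdev_healthy);

	/* Only the first copy on a non-redundant (leaf) vdev is redirected. */
	return (vdev_suspect && dva_index == 0 && !try_hard &&
	    vdev_children == 0);
}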
3448
3449 ASSERT(mg->mg_class == mc);
3450
3451 /*
3452 * If we don't need to try hard, then require that the
3453 * block be 1/8th of the device away from any other DVAs

--- 7 unchanged lines hidden (view full) ---

3461 if (distance <= (1ULL << vd->vdev_ms_shift))
3462 distance = 0;
3463 }
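/*
 * Editor's note -- illustrative sketch.  The hidden lines compute "distance"
 * so that, on the fast pass, additional DVAs of the same block land roughly
 * 1/8th of the device away from each other; the visible code then drops the
 * constraint when it is no larger than a single metaslab, where it would buy
 * nothing.  The hypothetical helper below captures that shape; the exact
 * computation in the hidden lines may differ.
 */
#include <stdint.h>

static uint64_t
dva_spread_distance_sketch(uint64_t vdev_asize, uint64_t ms_shift,
    int try_hard)
{
	uint64_t distance = 0;

	/* Only the fast pass constrains placement; try_hard takes anything. */
	if (!try_hard)
		distance = vdev_asize >> 3;	/* 1/8th of the device */

	/* A constraint no larger than one metaslab is not worth keeping. */
	if (distance <= (1ULL << ms_shift))
		distance = 0;

	return (distance);
}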
3464
3465 uint64_t asize = vdev_psize_to_asize(vd, psize);
3466 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3467
3468 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3204 distance, dva, d);
3469 distance, dva, d, allocator);
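/*
 * Editor's note.  vdev_psize_to_asize() converts the I/O size into the space
 * actually consumed on this vdev (on raidz, for example, that includes
 * parity and padding), and the ASSERT above checks that the result is a
 * multiple of the vdev's sector size (1 << ashift) -- with ashift = 12 every
 * allocated size must be 4 KB aligned.  P2PHASE(x, align) is simply
 * x % align for power-of-two alignments, as the hypothetical check below
 * restates:
 */
#include <stdint.h>
#include <assert.h>

static void
check_asize_alignment_sketch(uint64_t asize, uint64_t ashift)
{
	uint64_t align = 1ULL << ashift;

	/* Equivalent to P2PHASE(asize, 1ULL << ashift) == 0. */
	assert((asize & (align - 1)) == 0);
}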
3205
3206 if (offset != -1ULL) {
3207 /*
3208 * If we've just selected this metaslab group,
3209 * figure out whether the corresponding vdev is
3210 * over- or under-used relative to the pool,
3211 * and set an allocation bias to even it out.
3212 */
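/*
 * Editor's note -- illustrative sketch.  The hidden lines compare how full
 * this vdev is with how full the class is overall and derive a bias that
 * grows or shrinks the group's share of the rotor, steering new writes
 * toward emptier vdevs.  The hypothetical helper below expresses that idea
 * with utilization in percent; the exact formula in the hidden code may
 * differ.
 */
#include <stdint.h>

static int64_t
allocation_bias_sketch(uint64_t vdev_alloc, uint64_t vdev_space,
    uint64_t class_alloc, uint64_t class_space, uint64_t mg_aliquot)
{
	/* The +1 avoids division by zero on brand-new, empty devices. */
	int64_t vdev_used = (int64_t)((vdev_alloc * 100) / (vdev_space + 1));
	int64_t class_used = (int64_t)((class_alloc * 100) / (class_space + 1));

	/* Under-used vdevs (vdev_used < class_used) get a positive bias. */
	return (((class_used - vdev_used) * (int64_t)mg_aliquot) / 100);
}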

--- 46 unchanged lines hidden (view full) ---

3259 */
3260 if (!try_hard) {
3261 try_hard = B_TRUE;
3262 goto top;
3263 }
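/*
 * Editor's note -- illustrative sketch.  DVA allocation is two-pass: the
 * first pass applies the optional restrictions seen above (skip throttled
 * groups, avoid suspect vdevs, keep copies 1/8th of a device apart), and
 * only if every group fails does the code set try_hard and retry from the
 * rotor with those restrictions relaxed before returning ENOSPC.  Skeleton
 * of that control flow, with the per-group walk abstracted into a
 * hypothetical callback:
 */
#include <stdbool.h>

static int
two_pass_alloc_sketch(bool (*try_all_groups)(bool try_hard, void *arg),
    void *arg)
{
	if (try_all_groups(false, arg))	/* fast pass, restrictions applied */
		return (0);
	if (try_all_groups(true, arg))	/* try_hard pass, relaxed */
		return (0);
	return (-1);			/* caller maps this to ENOSPC */
}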
3264
3265 bzero(&dva[d], sizeof (dva_t));
3266
3470
3471 if (offset != -1ULL) {
3472 /*
3473 * If we've just selected this metaslab group,
3474 * figure out whether the corresponding vdev is
3475 * over- or under-used relative to the pool,
3476 * and set an allocation bias to even it out.
3477 */

--- 46 unchanged lines hidden (view full) ---

3524 */
3525 if (!try_hard) {
3526 try_hard = B_TRUE;
3527 goto top;
3528 }
3529
3530 bzero(&dva[d], sizeof (dva_t));
3531
3267 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3532 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
3268 return (SET_ERROR(ENOSPC));
3269}
3270
3271void
3272metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
3273 boolean_t checkpoint)
3274{
3275 metaslab_t *msp;

--- 284 unchanged lines hidden (view full) ---

3560/*
3561 * Reserve some allocation slots. The reservation system must be called
3562 * before we call into the allocator. If there aren't any available slots
3563 * then the I/O will be throttled until an I/O completes and its slots are
3564 * freed up. The function returns true if it was successful in placing
3565 * the reservation.
3566 */
3567boolean_t
3533 return (SET_ERROR(ENOSPC));
3534}
3535
3536void
3537metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
3538 boolean_t checkpoint)
3539{
3540 metaslab_t *msp;

--- 284 unchanged lines hidden (view full) ---

3825/*
3826 * Reserve some allocation slots. The reservation system must be called
3827 * before we call into the allocator. If there aren't any available slots
3828 * then the I/O will be throttled until an I/O completes and its slots are
3829 * freed up. The function returns true if it was successful in placing
3830 * the reservation.
3831 */
3832boolean_t
3568metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3569 int flags)
3833metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
3834 zio_t *zio, int flags)
3570{
3571 uint64_t available_slots = 0;
3572 boolean_t slot_reserved = B_FALSE;
3835{
3836 uint64_t available_slots = 0;
3837 boolean_t slot_reserved = B_FALSE;
3838 uint64_t max = mc->mc_alloc_max_slots[allocator];
3573
3574 ASSERT(mc->mc_alloc_throttle_enabled);
3575 mutex_enter(&mc->mc_lock);
3576
3839
3840 ASSERT(mc->mc_alloc_throttle_enabled);
3841 mutex_enter(&mc->mc_lock);
3842
3577 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3578 if (reserved_slots < mc->mc_alloc_max_slots)
3579 available_slots = mc->mc_alloc_max_slots - reserved_slots;
3843 uint64_t reserved_slots =
3844 refcount_count(&mc->mc_alloc_slots[allocator]);
3845 if (reserved_slots < max)
3846 available_slots = max - reserved_slots;
3580
3581 if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3582 /*
3583 * We reserve the slots individually so that we can unreserve
3584 * them individually when an I/O completes.
3585 */
3586 for (int d = 0; d < slots; d++) {
3847
3848 if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3849 /*
3850 * We reserve the slots individually so that we can unreserve
3851 * them individually when an I/O completes.
3852 */
3853 for (int d = 0; d < slots; d++) {
3587 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3854 reserved_slots =
3855 refcount_add(&mc->mc_alloc_slots[allocator],
3856 zio);
3588 }
3589 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3590 slot_reserved = B_TRUE;
3591 }
3592
3593 mutex_exit(&mc->mc_lock);
3594 return (slot_reserved);
3595}
3596
3597void
3857 }
3858 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3859 slot_reserved = B_TRUE;
3860 }
3861
3862 mutex_exit(&mc->mc_lock);
3863 return (slot_reserved);
3864}
3865
3866void
3598metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3867metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
3868 int allocator, zio_t *zio)
3599{
3600 ASSERT(mc->mc_alloc_throttle_enabled);
3601 mutex_enter(&mc->mc_lock);
3602 for (int d = 0; d < slots; d++) {
3869{
3870 ASSERT(mc->mc_alloc_throttle_enabled);
3871 mutex_enter(&mc->mc_lock);
3872 for (int d = 0; d < slots; d++) {
3603 (void) refcount_remove(&mc->mc_alloc_slots, zio);
3873 (void) refcount_remove(&mc->mc_alloc_slots[allocator],
3874 zio);
3604 }
3605 mutex_exit(&mc->mc_lock);
3606}
3607
3608static int
3609metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
3610 uint64_t txg)
3611{

--- 5 unchanged lines hidden (view full) ---

3617 return (ENXIO);
3618
3619 ASSERT3P(vd->vdev_ms, !=, NULL);
3620 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3621
3622 mutex_enter(&msp->ms_lock);
3623
3624 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3875 }
3876 mutex_exit(&mc->mc_lock);
3877}
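/*
 * Editor's note -- illustrative sketch.  Both throttle functions now take an
 * allocator index, so each allocator has its own slot limit
 * (mc_alloc_max_slots[allocator]) and its own refcount of outstanding slots
 * (mc_alloc_slots[allocator]); the refcount is taken once per slot so slots
 * can be released one at a time as the I/O completes.  The hypothetical
 * model below uses a plain counter per allocator and omits the mc_lock that
 * the real code holds around both operations:
 */
#include <stdint.h>
#include <stdbool.h>

struct throttle_sketch {
	uint64_t max_slots[4];	/* per-allocator limits (array size arbitrary) */
	uint64_t used_slots[4];	/* per-allocator outstanding reservations */
};

static bool
throttle_reserve_sketch(struct throttle_sketch *t, int allocator, int slots,
    bool is_gang)
{
	uint64_t avail = 0;

	if (t->used_slots[allocator] < t->max_slots[allocator])
		avail = t->max_slots[allocator] - t->used_slots[allocator];

	/* Gang allocations are always allowed to reserve. */
	if ((uint64_t)slots <= avail || is_gang) {
		t->used_slots[allocator] += slots;
		return (true);
	}
	return (false);
}

static void
throttle_unreserve_sketch(struct throttle_sketch *t, int allocator, int slots)
{
	t->used_slots[allocator] -= slots;
}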
3878
3879static int
3880metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
3881 uint64_t txg)
3882{

--- 5 unchanged lines hidden (view full) ---

3888 return (ENXIO);
3889
3890 ASSERT3P(vd->vdev_ms, !=, NULL);
3891 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3892
3893 mutex_enter(&msp->ms_lock);
3894
3895 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3625 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3896 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
3897 /*
3898 * No need to fail in that case; someone else has activated the
3899 * metaslab, but that doesn't preclude us from using it.
3900 */
3901 if (error == EBUSY)
3902 error = 0;
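/*
 * Editor's note.  metaslab_activate() now takes an allocator index, and the
 * claim path activates with the new METASLAB_WEIGHT_CLAIM flag instead of
 * METASLAB_WEIGHT_SECONDARY.  As the comment above notes, EBUSY only means
 * someone else already activated the metaslab; a claim merely needs the
 * metaslab loaded so it can verify the range below, so the error is dropped.
 * Hypothetical restatement of that error collapse:
 */
#include <errno.h>

static int
claim_activate_result_sketch(int activate_error)
{
	/* "Already active" is benign for a claim; everything else is not. */
	return (activate_error == EBUSY ? 0 : activate_error);
}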
3626
3627 if (error == 0 &&
3628 !range_tree_contains(msp->ms_allocatable, offset, size))
3629 error = SET_ERROR(ENOENT);
3630
3631 if (error || txg == 0) { /* txg == 0 indicates dry run */
3632 mutex_exit(&msp->ms_lock);
3633 return (error);

--- 88 unchanged lines hidden (view full) ---

3722 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3723
3724 return (metaslab_claim_impl(vd, offset, size, txg));
3725}
3726
3727int
3728metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3729 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3903
3904 if (error == 0 &&
3905 !range_tree_contains(msp->ms_allocatable, offset, size))
3906 error = SET_ERROR(ENOENT);
3907
3908 if (error || txg == 0) { /* txg == 0 indicates dry run */
3909 mutex_exit(&msp->ms_lock);
3910 return (error);

--- 88 unchanged lines hidden (view full) ---

3999 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4000
4001 return (metaslab_claim_impl(vd, offset, size, txg));
4002}
4003
4004int
4005metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
4006 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3730 zio_alloc_list_t *zal, zio_t *zio)
4007 zio_alloc_list_t *zal, zio_t *zio, int allocator)
3731{
3732 dva_t *dva = bp->blk_dva;
3733 dva_t *hintdva = hintbp->blk_dva;
3734 int error = 0;
3735
3736 ASSERT(bp->blk_birth == 0);
3737 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3738

--- 6 unchanged lines hidden (view full) ---

3745
3746 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3747 ASSERT(BP_GET_NDVAS(bp) == 0);
3748 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3749 ASSERT3P(zal, !=, NULL);
3750
3751 for (int d = 0; d < ndvas; d++) {
3752 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
4008{
4009 dva_t *dva = bp->blk_dva;
4010 dva_t *hintdva = hintbp->blk_dva;
4011 int error = 0;
4012
4013 ASSERT(bp->blk_birth == 0);
4014 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
4015

--- 6 unchanged lines hidden (view full) ---

4022
4023 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
4024 ASSERT(BP_GET_NDVAS(bp) == 0);
4025 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
4026 ASSERT3P(zal, !=, NULL);
4027
4028 for (int d = 0; d < ndvas; d++) {
4029 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
3753 txg, flags, zal);
4030 txg, flags, zal, allocator);
3754 if (error != 0) {
3755 for (d--; d >= 0; d--) {
3756 metaslab_unalloc_dva(spa, &dva[d], txg);
3757 metaslab_group_alloc_decrement(spa,
4031 if (error != 0) {
4032 for (d--; d >= 0; d--) {
4033 metaslab_unalloc_dva(spa, &dva[d], txg);
4034 metaslab_group_alloc_decrement(spa,
3758 DVA_GET_VDEV(&dva[d]), zio, flags);
4035 DVA_GET_VDEV(&dva[d]), zio, flags,
4036 allocator, B_FALSE);
3759 bzero(&dva[d], sizeof (dva_t));
3760 }
3761 spa_config_exit(spa, SCL_ALLOC, FTAG);
3762 return (error);
3763 } else {
3764 /*
3765 * Update the metaslab group's queue depth
3766 * based on the newly allocated dva.
3767 */
3768 metaslab_group_alloc_increment(spa,
4037 bzero(&dva[d], sizeof (dva_t));
4038 }
4039 spa_config_exit(spa, SCL_ALLOC, FTAG);
4040 return (error);
4041 } else {
4042 /*
4043 * Update the metaslab group's queue depth
4044 * based on the newly allocated dva.
4045 */
4046 metaslab_group_alloc_increment(spa,
3769 DVA_GET_VDEV(&dva[d]), zio, flags);
4047 DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
3770 }
3771
3772 }
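/*
 * Editor's note -- illustrative sketch.  The loop above allocates one DVA
 * per requested copy, threading the zio's allocator index through each call.
 * On failure it unwinds every DVA already placed (freeing the space and
 * decrementing that vdev's per-allocator queue depth); on success it
 * increments the queue depth so the allocation throttle sees the in-flight
 * I/O.  Skeleton of the all-or-nothing pattern, with the per-DVA work
 * abstracted into hypothetical callbacks:
 */
static int
alloc_all_or_nothing_sketch(int ndvas, int (*alloc_one)(int d, void *arg),
    void (*undo_one)(int d, void *arg), void *arg)
{
	for (int d = 0; d < ndvas; d++) {
		int error = alloc_one(d, arg);

		if (error != 0) {
			/* Roll back the copies that did succeed. */
			for (d--; d >= 0; d--)
				undo_one(d, arg);
			return (error);
		}
	}
	return (0);
}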
3773 ASSERT(error == 0);
3774 ASSERT(BP_GET_NDVAS(bp) == ndvas);
3775
3776 spa_config_exit(spa, SCL_ALLOC, FTAG);
3777

--- 152 unchanged lines hidden ---
4048 }
4049
4050 }
4051 ASSERT(error == 0);
4052 ASSERT(BP_GET_NDVAS(bp) == ndvas);
4053
4054 spa_config_exit(spa, SCL_ALLOC, FTAG);
4055

--- 152 unchanged lines hidden ---