metaslab.c (339104) vs. metaslab.c (339105)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 6 unchanged lines hidden (view full) ---

15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 6 unchanged lines hidden (view full) ---

15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/dmu.h>
30#include <sys/dmu_tx.h>
31#include <sys/space_map.h>

--- 238 unchanged lines hidden (view full) ---

270 * limit is ever reached allowing for further investigation.
271 */
272uint64_t metaslab_trace_max_entries = 5000;
273
274static uint64_t metaslab_weight(metaslab_t *);
275static void metaslab_set_fragmentation(metaslab_t *);
276static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
277static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/dmu.h>
30#include <sys/dmu_tx.h>
31#include <sys/space_map.h>

--- 238 unchanged lines hidden (view full) ---

270 * limit is ever reached allowing for further investigation.
271 */
272uint64_t metaslab_trace_max_entries = 5000;
273
274static uint64_t metaslab_weight(metaslab_t *);
275static void metaslab_set_fragmentation(metaslab_t *);
276static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
277static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
278static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
279static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
278
279kmem_cache_t *metaslab_alloc_trace_cache;
280
281/*
282 * ==========================================================================
283 * Metaslab classes
284 * ==========================================================================
285 */
286metaslab_class_t *
287metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
288{
289 metaslab_class_t *mc;
290
291 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
292
293 mc->mc_spa = spa;
294 mc->mc_rotor = NULL;
295 mc->mc_ops = ops;
296 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
280
281kmem_cache_t *metaslab_alloc_trace_cache;
282
283/*
284 * ==========================================================================
285 * Metaslab classes
286 * ==========================================================================
287 */
288metaslab_class_t *
289metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
290{
291 metaslab_class_t *mc;
292
293 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
294
295 mc->mc_spa = spa;
296 mc->mc_rotor = NULL;
297 mc->mc_ops = ops;
298 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
297 refcount_create_tracked(&mc->mc_alloc_slots);
299 mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
300 sizeof (refcount_t), KM_SLEEP);
301 mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
302 sizeof (uint64_t), KM_SLEEP);
303 for (int i = 0; i < spa->spa_alloc_count; i++)
304 refcount_create_tracked(&mc->mc_alloc_slots[i]);
298
299 return (mc);
300}
301
302void
303metaslab_class_destroy(metaslab_class_t *mc)
304{
305 ASSERT(mc->mc_rotor == NULL);
306 ASSERT(mc->mc_alloc == 0);
307 ASSERT(mc->mc_deferred == 0);
308 ASSERT(mc->mc_space == 0);
309 ASSERT(mc->mc_dspace == 0);
310
305
306 return (mc);
307}
308
309void
310metaslab_class_destroy(metaslab_class_t *mc)
311{
312 ASSERT(mc->mc_rotor == NULL);
313 ASSERT(mc->mc_alloc == 0);
314 ASSERT(mc->mc_deferred == 0);
315 ASSERT(mc->mc_space == 0);
316 ASSERT(mc->mc_dspace == 0);
317
311 refcount_destroy(&mc->mc_alloc_slots);
318 for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
319 refcount_destroy(&mc->mc_alloc_slots[i]);
320 kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
321 sizeof (refcount_t));
322 kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
323 sizeof (uint64_t));
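The hunks above replace the single class-wide mc_alloc_slots refcount with one reservation counter per allocator (spa_alloc_count of them), so concurrently allocating threads no longer serialize on one cache line when the write throttle is enabled. A minimal standalone sketch of that reservation pattern, using hypothetical names and plain C11 atomics in place of the kernel's refcount_t, might look like this:

#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct alloc_throttle {
	int			at_count;	/* number of allocators */
	atomic_uint_fast64_t	*at_used;	/* reserved slots, one counter per allocator */
	uint64_t		*at_max;	/* slot ceiling, one per allocator */
} alloc_throttle_t;

static alloc_throttle_t *
alloc_throttle_create(int count, uint64_t max_per_allocator)
{
	alloc_throttle_t *at = calloc(1, sizeof (*at));

	at->at_count = count;
	at->at_used = calloc(count, sizeof (*at->at_used));
	at->at_max = calloc(count, sizeof (*at->at_max));
	for (int i = 0; i < count; i++)
		at->at_max[i] = max_per_allocator;
	return (at);
}

/*
 * Try to reserve 'slots' I/O slots against one allocator; on failure the
 * caller throttles the I/O instead of blocking here.
 */
static int
alloc_throttle_reserve(alloc_throttle_t *at, int allocator, uint64_t slots)
{
	uint64_t used = atomic_fetch_add(&at->at_used[allocator], slots);

	if (used + slots > at->at_max[allocator]) {
		atomic_fetch_sub(&at->at_used[allocator], slots);
		return (0);
	}
	return (1);
}

An I/O would typically be mapped onto one of these counters by hashing its bookmark modulo the allocator count, so a given stream keeps charging the same counter; the exact mapping lives in the zio layer and is not shown in this diff.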
312 mutex_destroy(&mc->mc_lock);
313 kmem_free(mc, sizeof (metaslab_class_t));
314}
315
316int
317metaslab_class_validate(metaslab_class_t *mc)
318{
319 metaslab_group_t *mg;

--- 207 unchanged lines hidden (view full) ---

527}
528
529static int
530metaslab_compare(const void *x1, const void *x2)
531{
532 const metaslab_t *m1 = x1;
533 const metaslab_t *m2 = x2;
534
324 mutex_destroy(&mc->mc_lock);
325 kmem_free(mc, sizeof (metaslab_class_t));
326}
327
328int
329metaslab_class_validate(metaslab_class_t *mc)
330{
331 metaslab_group_t *mg;

--- 207 unchanged lines hidden (view full) ---

539}
540
541static int
542metaslab_compare(const void *x1, const void *x2)
543{
544 const metaslab_t *m1 = x1;
545 const metaslab_t *m2 = x2;
546
547 int sort1 = 0;
548 int sort2 = 0;
549 if (m1->ms_allocator != -1 && m1->ms_primary)
550 sort1 = 1;
551 else if (m1->ms_allocator != -1 && !m1->ms_primary)
552 sort1 = 2;
553 if (m2->ms_allocator != -1 && m2->ms_primary)
554 sort2 = 1;
555 else if (m2->ms_allocator != -1 && !m2->ms_primary)
556 sort2 = 2;
557
558 /*
559 * Sort inactive metaslabs first, then primaries, then secondaries. When
560 * selecting a metaslab to allocate from, an allocator first tries its
561 * primary, then secondary active metaslab. If it doesn't have active
562 * metaslabs, or can't allocate from them, it searches for an inactive
563 * metaslab to activate. If it can't find a suitable one, it will steal
564 * a primary or secondary metaslab from another allocator.
565 */
566 if (sort1 < sort2)
567 return (-1);
568 if (sort1 > sort2)
569 return (1);
570
535 if (m1->ms_weight < m2->ms_weight)
536 return (1);
537 if (m1->ms_weight > m2->ms_weight)
538 return (-1);
539
540 /*
541 * If the weights are identical, use the offset to force uniqueness.
542 */

--- 135 unchanged lines hidden (view full) ---

678 else if (!was_allocatable && mg->mg_allocatable)
679 mc->mc_alloc_groups++;
680 mutex_exit(&mc->mc_lock);
681
682 mutex_exit(&mg->mg_lock);
683}
684
685metaslab_group_t *
571 if (m1->ms_weight < m2->ms_weight)
572 return (1);
573 if (m1->ms_weight > m2->ms_weight)
574 return (-1);
575
576 /*
577 * If the weights are identical, use the offset to force uniqueness.
578 */

--- 135 unchanged lines hidden (view full) ---

714 else if (!was_allocatable && mg->mg_allocatable)
715 mc->mc_alloc_groups++;
716 mutex_exit(&mc->mc_lock);
717
718 mutex_exit(&mg->mg_lock);
719}
720
721metaslab_group_t *
686metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
722metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
687{
688 metaslab_group_t *mg;
689
690 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
691 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
723{
724 metaslab_group_t *mg;
725
726 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
727 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
728 mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
729 KM_SLEEP);
730 mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
731 KM_SLEEP);
692 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
693 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
694 mg->mg_vd = vd;
695 mg->mg_class = mc;
696 mg->mg_activation_count = 0;
697 mg->mg_initialized = B_FALSE;
698 mg->mg_no_free_space = B_TRUE;
732 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
733 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
734 mg->mg_vd = vd;
735 mg->mg_class = mc;
736 mg->mg_activation_count = 0;
737 mg->mg_initialized = B_FALSE;
738 mg->mg_no_free_space = B_TRUE;
699 refcount_create_tracked(&mg->mg_alloc_queue_depth);
739 mg->mg_allocators = allocators;
700
740
741 mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t),
742 KM_SLEEP);
743 mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
744 sizeof (uint64_t), KM_SLEEP);
745 for (int i = 0; i < allocators; i++) {
746 refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
747 mg->mg_cur_max_alloc_queue_depth[i] = 0;
748 }
749
701 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
702 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
703
704 return (mg);
705}
706
707void
708metaslab_group_destroy(metaslab_group_t *mg)

--- 4 unchanged lines hidden (view full) ---

713 * We may have gone below zero with the activation count
714 * either because we never activated in the first place or
715 * because we're done, and possibly removing the vdev.
716 */
717 ASSERT(mg->mg_activation_count <= 0);
718
719 taskq_destroy(mg->mg_taskq);
720 avl_destroy(&mg->mg_metaslab_tree);
750 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
751 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
752
753 return (mg);
754}
755
756void
757metaslab_group_destroy(metaslab_group_t *mg)

--- 4 unchanged lines hidden (view full) ---

762 * We may have gone below zero with the activation count
763 * either because we never activated in the first place or
764 * because we're done, and possibly removing the vdev.
765 */
766 ASSERT(mg->mg_activation_count <= 0);
767
768 taskq_destroy(mg->mg_taskq);
769 avl_destroy(&mg->mg_metaslab_tree);
770 kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
771 kmem_free(mg->mg_secondaries, mg->mg_allocators *
772 sizeof (metaslab_t *));
721 mutex_destroy(&mg->mg_lock);
773 mutex_destroy(&mg->mg_lock);
722 refcount_destroy(&mg->mg_alloc_queue_depth);
774
775 for (int i = 0; i < mg->mg_allocators; i++) {
776 refcount_destroy(&mg->mg_alloc_queue_depth[i]);
777 mg->mg_cur_max_alloc_queue_depth[i] = 0;
778 }
779 kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
780 sizeof (refcount_t));
781 kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
782 sizeof (uint64_t));
783
723 kmem_free(mg, sizeof (metaslab_group_t));
724}
725
726void
727metaslab_group_activate(metaslab_group_t *mg)
728{
729 metaslab_class_t *mc = mg->mg_class;
730 metaslab_group_t *mgprev, *mgnext;

--- 63 unchanged lines hidden (view full) ---

794 * lower locks to allow the I/O to complete. At a minimum,
795 * we continue to hold the SCL_ALLOC lock, which prevents any future
796 * allocations from taking place and any changes to the vdev tree.
797 */
798 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
799 taskq_wait(mg->mg_taskq);
800 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
801 metaslab_group_alloc_update(mg);
784 kmem_free(mg, sizeof (metaslab_group_t));
785}
786
787void
788metaslab_group_activate(metaslab_group_t *mg)
789{
790 metaslab_class_t *mc = mg->mg_class;
791 metaslab_group_t *mgprev, *mgnext;

--- 63 unchanged lines hidden (view full) ---

855 * lower locks to allow the I/O to complete. At a minimum,
856 * we continue to hold the SCL_ALLOC lock, which prevents any future
857 * allocations from taking place and any changes to the vdev tree.
858 */
859 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
860 taskq_wait(mg->mg_taskq);
861 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
862 metaslab_group_alloc_update(mg);
863 for (int i = 0; i < mg->mg_allocators; i++) {
864 metaslab_t *msp = mg->mg_primaries[i];
865 if (msp != NULL) {
866 mutex_enter(&msp->ms_lock);
867 metaslab_passivate(msp,
868 metaslab_weight_from_range_tree(msp));
869 mutex_exit(&msp->ms_lock);
870 }
871 msp = mg->mg_secondaries[i];
872 if (msp != NULL) {
873 mutex_enter(&msp->ms_lock);
874 metaslab_passivate(msp,
875 metaslab_weight_from_range_tree(msp));
876 mutex_exit(&msp->ms_lock);
877 }
878 }
802
803 mgprev = mg->mg_prev;
804 mgnext = mg->mg_next;
805
806 if (mg == mgnext) {
807 mc->mc_rotor = NULL;
808 } else {
809 mc->mc_rotor = mgnext;

--- 125 unchanged lines hidden (view full) ---

935 mutex_enter(&mg->mg_lock);
936 ASSERT(msp->ms_group == mg);
937 avl_remove(&mg->mg_metaslab_tree, msp);
938 msp->ms_group = NULL;
939 mutex_exit(&mg->mg_lock);
940}
941
942static void
879
880 mgprev = mg->mg_prev;
881 mgnext = mg->mg_next;
882
883 if (mg == mgnext) {
884 mc->mc_rotor = NULL;
885 } else {
886 mc->mc_rotor = mgnext;

--- 125 unchanged lines hidden (view full) ---

1012 mutex_enter(&mg->mg_lock);
1013 ASSERT(msp->ms_group == mg);
1014 avl_remove(&mg->mg_metaslab_tree, msp);
1015 msp->ms_group = NULL;
1016 mutex_exit(&mg->mg_lock);
1017}
1018
1019static void
1020metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1021{
1022 ASSERT(MUTEX_HELD(&mg->mg_lock));
1023 ASSERT(msp->ms_group == mg);
1024 avl_remove(&mg->mg_metaslab_tree, msp);
1025 msp->ms_weight = weight;
1026 avl_add(&mg->mg_metaslab_tree, msp);
1027
1028}
1029
1030static void
943metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
944{
945 /*
946 * Although in principle the weight can be any value, in
947 * practice we do not use values in the range [1, 511].
948 */
949 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
950 ASSERT(MUTEX_HELD(&msp->ms_lock));
951
952 mutex_enter(&mg->mg_lock);
1031metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1032{
1033 /*
1034 * Although in principle the weight can be any value, in
1035 * practice we do not use values in the range [1, 511].
1036 */
1037 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
1038 ASSERT(MUTEX_HELD(&msp->ms_lock));
1039
1040 mutex_enter(&mg->mg_lock);
953 ASSERT(msp->ms_group == mg);
954 avl_remove(&mg->mg_metaslab_tree, msp);
955 msp->ms_weight = weight;
956 avl_add(&mg->mg_metaslab_tree, msp);
1041 metaslab_group_sort_impl(mg, msp, weight);
957 mutex_exit(&mg->mg_lock);
958}
959
960/*
961 * Calculate the fragmentation for a given metaslab group. We can use
962 * a simple average here since all metaslabs within the group must have
963 * the same size. The return value will be a value between 0 and 100
964 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this

--- 31 unchanged lines hidden (view full) ---

996 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
997 * that can still handle allocations. If the allocation throttle is enabled
998 * then we skip allocations to devices that have reached their maximum
999 * allocation queue depth unless the selected metaslab group is the only
1000 * eligible group remaining.
1001 */
1002static boolean_t
1003metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1042 mutex_exit(&mg->mg_lock);
1043}
1044
1045/*
1046 * Calculate the fragmentation for a given metaslab group. We can use
1047 * a simple average here since all metaslabs within the group must have
1048 * the same size. The return value will be a value between 0 and 100
1049 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this

--- 31 unchanged lines hidden (view full) ---

1081 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
1082 * that can still handle allocations. If the allocation throttle is enabled
1083 * then we skip allocations to devices that have reached their maximum
1084 * allocation queue depth unless the selected metaslab group is the only
1085 * eligible group remaining.
1086 */
1087static boolean_t
1088metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1004 uint64_t psize)
1089 uint64_t psize, int allocator)
1005{
1006 spa_t *spa = mg->mg_vd->vdev_spa;
1007 metaslab_class_t *mc = mg->mg_class;
1008
1009 /*
1010 * We can only consider skipping this metaslab group if it's
1011 * in the normal metaslab class and there are other metaslab
1012 * groups to select from. Otherwise, we always consider it eligible

--- 12 unchanged lines hidden (view full) ---

1025 * If all metaslab groups are no longer considered allocatable
1026 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1027 * gang block size then we allow allocations on this metaslab group
1028 * regardless of the mg_allocatable or throttle settings.
1029 */
1030 if (mg->mg_allocatable) {
1031 metaslab_group_t *mgp;
1032 int64_t qdepth;
1090{
1091 spa_t *spa = mg->mg_vd->vdev_spa;
1092 metaslab_class_t *mc = mg->mg_class;
1093
1094 /*
1095 * We can only consider skipping this metaslab group if it's
1096 * in the normal metaslab class and there are other metaslab
1097 * groups to select from. Otherwise, we always consider it eligible

--- 12 unchanged lines hidden (view full) ---

1110 * If all metaslab groups are no longer considered allocatable
1111 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1112 * gang block size then we allow allocations on this metaslab group
1113 * regardless of the mg_allocatable or throttle settings.
1114 */
1115 if (mg->mg_allocatable) {
1116 metaslab_group_t *mgp;
1117 int64_t qdepth;
1033 uint64_t qmax = mg->mg_max_alloc_queue_depth;
1118 uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
1034
1035 if (!mc->mc_alloc_throttle_enabled)
1036 return (B_TRUE);
1037
1038 /*
1039 * If this metaslab group does not have any free space, then
1040 * there is no point in looking further.
1041 */
1042 if (mg->mg_no_free_space)
1043 return (B_FALSE);
1044
1119
1120 if (!mc->mc_alloc_throttle_enabled)
1121 return (B_TRUE);
1122
1123 /*
1124 * If this metaslab group does not have any free space, then
1125 * there is no point in looking further.
1126 */
1127 if (mg->mg_no_free_space)
1128 return (B_FALSE);
1129
1045 qdepth = refcount_count(&mg->mg_alloc_queue_depth);
1130 qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]);
1046
1047 /*
1048 * If this metaslab group is below its qmax or it's
1049 * the only allocatable metaslab group, then attempt
1050 * to allocate from it.
1051 */
1052 if (qdepth < qmax || mc->mc_alloc_groups == 1)
1053 return (B_TRUE);
1054 ASSERT3U(mc->mc_alloc_groups, >, 1);
1055
1056 /*
1057 * Since this metaslab group is at or over its qmax, we
1058 * need to determine if there are metaslab groups after this
1059 * one that might be able to handle this allocation. This is
1060 * racy since we can't hold the locks for all metaslab
1061 * groups at the same time when we make this check.
1062 */
1063 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
1131
1132 /*
1133 * If this metaslab group is below its qmax or it's
1134 * the only allocatable metaslab group, then attempt
1135 * to allocate from it.
1136 */
1137 if (qdepth < qmax || mc->mc_alloc_groups == 1)
1138 return (B_TRUE);
1139 ASSERT3U(mc->mc_alloc_groups, >, 1);
1140
1141 /*
1142 * Since this metaslab group is at or over its qmax, we
1143 * need to determine if there are metaslab groups after this
1144 * one that might be able to handle this allocation. This is
1145 * racy since we can't hold the locks for all metaslab
1146 * groups at the same time when we make this check.
1147 */
1148 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
1064 qmax = mgp->mg_max_alloc_queue_depth;
1149 qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
1065
1150
1066 qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
1151 qdepth = refcount_count(
1152 &mgp->mg_alloc_queue_depth[allocator]);
1067
1068 /*
1069 * If there is another metaslab group that
1070 * might be able to handle the allocation, then
1071 * we return false so that we skip this group.
1072 */
1073 if (qdepth < qmax && !mgp->mg_no_free_space)
1074 return (B_FALSE);

--- 391 unchanged lines hidden (view full) ---

1466
1467 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1468 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1469 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1470 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1471 ms->ms_id = id;
1472 ms->ms_start = id << vd->vdev_ms_shift;
1473 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1153
1154 /*
1155 * If there is another metaslab group that
1156 * might be able to handle the allocation, then
1157 * we return false so that we skip this group.
1158 */
1159 if (qdepth < qmax && !mgp->mg_no_free_space)
1160 return (B_FALSE);

--- 391 unchanged lines hidden (view full) ---

1552
1553 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1554 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1555 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1556 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1557 ms->ms_id = id;
1558 ms->ms_start = id << vd->vdev_ms_shift;
1559 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1560 ms->ms_allocator = -1;
1561 ms->ms_new = B_TRUE;
1474
1475 /*
1476 * We only open space map objects that already exist. All others
1477 * will be opened when we finally allocate an object for it.
1478 */
1479 if (object != 0) {
1480 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1481 ms->ms_size, vd->vdev_ashift);

--- 80 unchanged lines hidden (view full) ---

1562 ASSERT0(msp->ms_deferspace);
1563
1564 range_tree_destroy(msp->ms_checkpointing);
1565
1566 mutex_exit(&msp->ms_lock);
1567 cv_destroy(&msp->ms_load_cv);
1568 mutex_destroy(&msp->ms_lock);
1569 mutex_destroy(&msp->ms_sync_lock);
1562
1563 /*
1564 * We only open space map objects that already exist. All others
1565 * will be opened when we finally allocate an object for it.
1566 */
1567 if (object != 0) {
1568 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1569 ms->ms_size, vd->vdev_ashift);

--- 80 unchanged lines hidden (view full) ---

1650 ASSERT0(msp->ms_deferspace);
1651
1652 range_tree_destroy(msp->ms_checkpointing);
1653
1654 mutex_exit(&msp->ms_lock);
1655 cv_destroy(&msp->ms_load_cv);
1656 mutex_destroy(&msp->ms_lock);
1657 mutex_destroy(&msp->ms_sync_lock);
1658 ASSERT3U(msp->ms_allocator, ==, -1);
1570
1571 kmem_free(msp, sizeof (metaslab_t));
1572}
1573
1574#define FRAGMENTATION_TABLE_SIZE 17
1575
1576/*
1577 * This table defines a segment size based fragmentation metric that will

--- 380 unchanged lines hidden (view full) ---

1958 weight = metaslab_segment_weight(msp);
1959 } else {
1960 weight = metaslab_space_weight(msp);
1961 }
1962 return (weight);
1963}
1964
1965static int
1659
1660 kmem_free(msp, sizeof (metaslab_t));
1661}
1662
1663#define FRAGMENTATION_TABLE_SIZE 17
1664
1665/*
1666 * This table defines a segment size based fragmentation metric that will

--- 380 unchanged lines hidden (view full) ---

2047 weight = metaslab_segment_weight(msp);
2048 } else {
2049 weight = metaslab_space_weight(msp);
2050 }
2051 return (weight);
2052}
2053
2054static int
1966metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
2055metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2056 int allocator, uint64_t activation_weight)
1967{
2057{
2058 /*
2059 * If we're activating for the claim code, we don't want to actually
2060 * set the metaslab up for a specific allocator.
2061 */
2062 if (activation_weight == METASLAB_WEIGHT_CLAIM)
2063 return (0);
2064 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2065 mg->mg_primaries : mg->mg_secondaries);
2066
1968 ASSERT(MUTEX_HELD(&msp->ms_lock));
2067 ASSERT(MUTEX_HELD(&msp->ms_lock));
2068 mutex_enter(&mg->mg_lock);
2069 if (arr[allocator] != NULL) {
2070 mutex_exit(&mg->mg_lock);
2071 return (EEXIST);
2072 }
1969
2073
2074 arr[allocator] = msp;
2075 ASSERT3S(msp->ms_allocator, ==, -1);
2076 msp->ms_allocator = allocator;
2077 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
2078 mutex_exit(&mg->mg_lock);
2079
2080 return (0);
2081}
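metaslab_activate_allocator() is the one place a loaded metaslab gets bound to a specific allocator: the mg_primaries[]/mg_secondaries[] slot is claimed under mg_lock, and an EEXIST return tells the caller that another thread won the race and it should reselect. Stripped of the metaslab details, the claim half of that pattern reduces to roughly the following (hypothetical names, pthreads standing in for the kernel mutex):

#include <errno.h>
#include <pthread.h>
#include <stddef.h>

typedef struct slot_table {
	pthread_mutex_t	st_lock;
	void		**st_slot;	/* one owner pointer per allocator */
} slot_table_t;

/*
 * Bind 'owner' to per-allocator slot 'idx'.  EEXIST means some other
 * thread claimed the slot first and the caller should pick again.
 */
static int
slot_claim(slot_table_t *st, int idx, void *owner)
{
	int error = 0;

	pthread_mutex_lock(&st->st_lock);
	if (st->st_slot[idx] != NULL)
		error = EEXIST;
	else
		st->st_slot[idx] = owner;
	pthread_mutex_unlock(&st->st_lock);
	return (error);
}

The release side is the mirror image, which is what metaslab_passivate_allocator() further down does when the metaslab is handed back.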
2082
2083static int
2084metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
2085{
2086 ASSERT(MUTEX_HELD(&msp->ms_lock));
2087
1970 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
2088 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
2089 int error = 0;
1971 metaslab_load_wait(msp);
1972 if (!msp->ms_loaded) {
2090 metaslab_load_wait(msp);
2091 if (!msp->ms_loaded) {
1973 int error = metaslab_load(msp);
1974 if (error) {
2092 if ((error = metaslab_load(msp)) != 0) {
1975 metaslab_group_sort(msp->ms_group, msp, 0);
1976 return (error);
1977 }
1978 }
2093 metaslab_group_sort(msp->ms_group, msp, 0);
2094 return (error);
2095 }
2096 }
2097 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
2098 /*
2099 * The metaslab was activated for another allocator
2100 * while we were waiting, we should reselect.
2101 */
2102 return (EBUSY);
2103 }
2104 if ((error = metaslab_activate_allocator(msp->ms_group, msp,
2105 allocator, activation_weight)) != 0) {
2106 return (error);
2107 }
1979
1980 msp->ms_activation_weight = msp->ms_weight;
1981 metaslab_group_sort(msp->ms_group, msp,
1982 msp->ms_weight | activation_weight);
1983 }
1984 ASSERT(msp->ms_loaded);
1985 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1986
1987 return (0);
1988}
1989
1990static void
2108
2109 msp->ms_activation_weight = msp->ms_weight;
2110 metaslab_group_sort(msp->ms_group, msp,
2111 msp->ms_weight | activation_weight);
2112 }
2113 ASSERT(msp->ms_loaded);
2114 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
2115
2116 return (0);
2117}
2118
2119static void
2120metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2121 uint64_t weight)
2122{
2123 ASSERT(MUTEX_HELD(&msp->ms_lock));
2124 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
2125 metaslab_group_sort(mg, msp, weight);
2126 return;
2127 }
2128
2129 mutex_enter(&mg->mg_lock);
2130 ASSERT3P(msp->ms_group, ==, mg);
2131 if (msp->ms_primary) {
2132 ASSERT3U(0, <=, msp->ms_allocator);
2133 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
2134 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
2135 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
2136 mg->mg_primaries[msp->ms_allocator] = NULL;
2137 } else {
2138 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
2139 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
2140 mg->mg_secondaries[msp->ms_allocator] = NULL;
2141 }
2142 msp->ms_allocator = -1;
2143 metaslab_group_sort_impl(mg, msp, weight);
2144 mutex_exit(&mg->mg_lock);
2145}
2146
2147static void
1991metaslab_passivate(metaslab_t *msp, uint64_t weight)
1992{
1993 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
1994
1995 /*
1996 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1997 * this metaslab again. In that case, it had better be empty,
1998 * or we would be leaving space on the table.
1999 */
2000 ASSERT(size >= SPA_MINBLOCKSIZE ||
2001 range_tree_is_empty(msp->ms_allocatable));
2002 ASSERT0(weight & METASLAB_ACTIVE_MASK);
2003
2004 msp->ms_activation_weight = 0;
2148metaslab_passivate(metaslab_t *msp, uint64_t weight)
2149{
2150 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
2151
2152 /*
2153 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
2154 * this metaslab again. In that case, it had better be empty,
2155 * or we would be leaving space on the table.
2156 */
2157 ASSERT(size >= SPA_MINBLOCKSIZE ||
2158 range_tree_is_empty(msp->ms_allocatable));
2159 ASSERT0(weight & METASLAB_ACTIVE_MASK);
2160
2161 msp->ms_activation_weight = 0;
2005 metaslab_group_sort(msp->ms_group, msp, weight);
2162 metaslab_passivate_allocator(msp->ms_group, msp, weight);
2006 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
2007}
2008
2009/*
2010 * Segment-based metaslabs are activated once and remain active until
2011 * we either fail an allocation attempt (similar to space-based metaslabs)
2012 * or have exhausted the free space in zfs_metaslab_switch_threshold
2013 * buckets since the metaslab was activated. This function checks to see

--- 537 unchanged lines hidden (view full) ---

2551 if (msp->ms_deferspace != 0) {
2552 /*
2553 * Keep syncing this metaslab until all deferred frees
2554 * are back in circulation.
2555 */
2556 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2557 }
2558
2163 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
2164}
2165
2166/*
2167 * Segment-based metaslabs are activated once and remain active until
2168 * we either fail an allocation attempt (similar to space-based metaslabs)
2169 * or have exhausted the free space in zfs_metaslab_switch_threshold
2170 * buckets since the metaslab was activated. This function checks to see

--- 537 unchanged lines hidden (view full) ---

2708 if (msp->ms_deferspace != 0) {
2709 /*
2710 * Keep syncing this metaslab until all deferred frees
2711 * are back in circulation.
2712 */
2713 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2714 }
2715
2716 if (msp->ms_new) {
2717 msp->ms_new = B_FALSE;
2718 mutex_enter(&mg->mg_lock);
2719 mg->mg_ms_ready++;
2720 mutex_exit(&mg->mg_lock);
2721 }
2559 /*
2560 * Calculate the new weights before unloading any metaslabs.
2561 * This will give us the most accurate weighting.
2562 */
2722 /*
2723 * Calculate the new weights before unloading any metaslabs.
2724 * This will give us the most accurate weighting.
2725 */
2563 metaslab_group_sort(mg, msp, metaslab_weight(msp));
2726 metaslab_group_sort(mg, msp, metaslab_weight(msp) |
2727 (msp->ms_weight & METASLAB_ACTIVE_MASK));
2564
2565 /*
2566 * If the metaslab is loaded and we've not tried to load or allocate
2567 * from it in 'metaslab_unload_delay' txgs, then unload it.
2568 */
2569 if (msp->ms_loaded &&
2570 msp->ms_selected_txg + metaslab_unload_delay < txg) {
2571 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2572 VERIFY0(range_tree_space(
2573 msp->ms_allocating[(txg + t) & TXG_MASK]));
2574 }
2728
2729 /*
2730 * If the metaslab is loaded and we've not tried to load or allocate
2731 * from it in 'metaslab_unload_delay' txgs, then unload it.
2732 */
2733 if (msp->ms_loaded &&
2734 msp->ms_selected_txg + metaslab_unload_delay < txg) {
2735 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2736 VERIFY0(range_tree_space(
2737 msp->ms_allocating[(txg + t) & TXG_MASK]));
2738 }
2739 if (msp->ms_allocator != -1) {
2740 metaslab_passivate(msp, msp->ms_weight &
2741 ~METASLAB_ACTIVE_MASK);
2742 }
2575
2576 if (!metaslab_debug_unload)
2577 metaslab_unload(msp);
2578 }
2579
2580 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2581 ASSERT0(range_tree_space(msp->ms_freeing));
2582 ASSERT0(range_tree_space(msp->ms_freed));

--- 77 unchanged lines hidden (view full) ---

2660 metaslab_alloc_trace_cache = NULL;
2661}
2662
2663/*
2664 * Add an allocation trace element to the allocation tracing list.
2665 */
2666static void
2667metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2743
2744 if (!metaslab_debug_unload)
2745 metaslab_unload(msp);
2746 }
2747
2748 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2749 ASSERT0(range_tree_space(msp->ms_freeing));
2750 ASSERT0(range_tree_space(msp->ms_freed));

--- 77 unchanged lines hidden (view full) ---

2828 metaslab_alloc_trace_cache = NULL;
2829}
2830
2831/*
2832 * Add an allocation trace element to the allocation tracing list.
2833 */
2834static void
2835metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2668 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2836 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
2837 int allocator)
2669{
2670 if (!metaslab_trace_enabled)
2671 return;
2672
2673 /*
2674 * When the tracing list reaches its maximum we remove
2675 * the second element in the list before adding a new one.
2676 * By removing the second element we preserve the original

--- 16 unchanged lines hidden (view full) ---

2693 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2694 list_link_init(&mat->mat_list_node);
2695 mat->mat_mg = mg;
2696 mat->mat_msp = msp;
2697 mat->mat_size = psize;
2698 mat->mat_dva_id = dva_id;
2699 mat->mat_offset = offset;
2700 mat->mat_weight = 0;
2838{
2839 if (!metaslab_trace_enabled)
2840 return;
2841
2842 /*
2843 * When the tracing list reaches its maximum we remove
2844 * the second element in the list before adding a new one.
2845 * By removing the second element we preserve the original

--- 16 unchanged lines hidden (view full) ---

2862 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2863 list_link_init(&mat->mat_list_node);
2864 mat->mat_mg = mg;
2865 mat->mat_msp = msp;
2866 mat->mat_size = psize;
2867 mat->mat_dva_id = dva_id;
2868 mat->mat_offset = offset;
2869 mat->mat_weight = 0;
2870 mat->mat_allocator = allocator;
2701
2702 if (msp != NULL)
2703 mat->mat_weight = msp->ms_weight;
2704
2705 /*
2706 * The list is part of the zio so locking is not required. Only
2707 * a single thread will perform allocations for a given zio.
2708 */

--- 24 unchanged lines hidden (view full) ---

2733
2734/*
2735 * ==========================================================================
2736 * Metaslab block operations
2737 * ==========================================================================
2738 */
2739
2740static void
2871
2872 if (msp != NULL)
2873 mat->mat_weight = msp->ms_weight;
2874
2875 /*
2876 * The list is part of the zio so locking is not required. Only
2877 * a single thread will perform allocations for a given zio.
2878 */

--- 24 unchanged lines hidden (view full) ---

2903
2904/*
2905 * ==========================================================================
2906 * Metaslab block operations
2907 * ==========================================================================
2908 */
2909
2910static void
2741metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2911metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
2912 int allocator)
2742{
2743 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2913{
2914 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2744 flags & METASLAB_DONT_THROTTLE)
2915 (flags & METASLAB_DONT_THROTTLE))
2745 return;
2746
2747 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2748 if (!mg->mg_class->mc_alloc_throttle_enabled)
2749 return;
2750
2916 return;
2917
2918 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2919 if (!mg->mg_class->mc_alloc_throttle_enabled)
2920 return;
2921
2751 (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2922 (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
2752}
2753
2923}
2924
2925static void
2926metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
2927{
2928 uint64_t max = mg->mg_max_alloc_queue_depth;
2929 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
2930 while (cur < max) {
2931 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
2932 cur, cur + 1) == cur) {
2933 atomic_inc_64(
2934 &mg->mg_class->mc_alloc_max_slots[allocator]);
2935 return;
2936 }
2937 cur = mg->mg_cur_max_alloc_queue_depth[allocator];
2938 }
2939}
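metaslab_group_increment_qdepth() ratchets the per-allocator queue-depth ceiling up by one, but never past mg_max_alloc_queue_depth, using a compare-and-swap loop so that completing I/Os can grow the limit without taking a lock. The same bounded lock-free increment, written against C11 atomics rather than the illumos atomic_cas_64() (a sketch, not the kernel code):

#include <stdatomic.h>
#include <stdint.h>

/*
 * Raise *cur by one, but never past 'max'.  If another thread changes the
 * value between our read and our CAS, the failed CAS reloads 'seen' and we
 * retry; once the ceiling is reached we return without looping.
 */
static void
bounded_increment(_Atomic uint64_t *cur, uint64_t max)
{
	uint64_t seen = atomic_load(cur);

	while (seen < max) {
		if (atomic_compare_exchange_weak(cur, &seen, seen + 1))
			return;	/* we won the race */
	}
}

In the kernel version a successful bump also raises the class-wide mc_alloc_max_slots[allocator] ceiling, which is what metaslab_class_throttle_reserve() later checks reservations against.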
2940
2754void
2941void
2755metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2942metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
2943 int allocator, boolean_t io_complete)
2756{
2757 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2944{
2945 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2758 flags & METASLAB_DONT_THROTTLE)
2946 (flags & METASLAB_DONT_THROTTLE))
2759 return;
2760
2761 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2762 if (!mg->mg_class->mc_alloc_throttle_enabled)
2763 return;
2764
2947 return;
2948
2949 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2950 if (!mg->mg_class->mc_alloc_throttle_enabled)
2951 return;
2952
2765 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2953 (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
2954 if (io_complete)
2955 metaslab_group_increment_qdepth(mg, allocator);
2766}
2767
2768void
2956}
2957
2958void
2769metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2959metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
2960 int allocator)
2770{
2771#ifdef ZFS_DEBUG
2772 const dva_t *dva = bp->blk_dva;
2773 int ndvas = BP_GET_NDVAS(bp);
2774
2775 for (int d = 0; d < ndvas; d++) {
2776 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2777 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2961{
2962#ifdef ZFS_DEBUG
2963 const dva_t *dva = bp->blk_dva;
2964 int ndvas = BP_GET_NDVAS(bp);
2965
2966 for (int d = 0; d < ndvas; d++) {
2967 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2968 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2778 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2969 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator],
2970 tag));
2779 }
2780#endif
2781}
2782
2783static uint64_t
2784metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2785{
2786 uint64_t start;

--- 25 unchanged lines hidden (view full) ---

2812 /*
2813 * Now that we've attempted the allocation we need to update the
2814 * metaslab's maximum block size since it may have changed.
2815 */
2816 msp->ms_max_size = metaslab_block_maxsize(msp);
2817 return (start);
2818}
2819
2971 }
2972#endif
2973}
2974
2975static uint64_t
2976metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2977{
2978 uint64_t start;

--- 25 unchanged lines hidden (view full) ---

3004 /*
3005 * Now that we've attempted the allocation we need to update the
3006 * metaslab's maximum block size since it may have changed.
3007 */
3008 msp->ms_max_size = metaslab_block_maxsize(msp);
3009 return (start);
3010}
3011
3012/*
3013 * Find the metaslab with the highest weight that is less than what we've
3014 * already tried. In the common case, this means that we will examine each
3015 * metaslab at most once. Note that concurrent callers could reorder metaslabs
3016 * by activation/passivation once we have dropped the mg_lock. If a metaslab is
3017 * activated by another thread, and we fail to allocate from the metaslab we
3018 * have selected, we may not try the newly-activated metaslab, and instead
3019 * activate another metaslab. This is not optimal, but generally does not cause
3020 * any problems (a possible exception being if every metaslab is completely full
3021 * except for the newly-activated metaslab which we fail to examine).
3022 */
3023static metaslab_t *
3024find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
3025 dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
3026 zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
3027{
3028 avl_index_t idx;
3029 avl_tree_t *t = &mg->mg_metaslab_tree;
3030 metaslab_t *msp = avl_find(t, search, &idx);
3031 if (msp == NULL)
3032 msp = avl_nearest(t, idx, AVL_AFTER);
3033
3034 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
3035 int i;
3036 if (!metaslab_should_allocate(msp, asize)) {
3037 metaslab_trace_add(zal, mg, msp, asize, d,
3038 TRACE_TOO_SMALL, allocator);
3039 continue;
3040 }
3041
3042 /*
3043 * If the selected metaslab is condensing, skip it.
3044 */
3045 if (msp->ms_condensing)
3046 continue;
3047
3048 *was_active = msp->ms_allocator != -1;
3049 /*
3050 * If we're activating as primary, this is our first allocation
3051 * from this disk, so we don't need to check how close we are.
3052 * If the metaslab under consideration was already active,
3053 * we're getting desperate enough to steal another allocator's
3054 * metaslab, so we still don't care about distances.
3055 */
3056 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
3057 break;
3058
3059 uint64_t target_distance = min_distance
3060 + (space_map_allocated(msp->ms_sm) != 0 ? 0 :
3061 min_distance >> 1);
3062
3063 for (i = 0; i < d; i++) {
3064 if (metaslab_distance(msp, &dva[i]) < target_distance)
3065 break;
3066 }
3067 if (i == d)
3068 break;
3069 }
3070
3071 if (msp != NULL) {
3072 search->ms_weight = msp->ms_weight;
3073 search->ms_start = msp->ms_start + 1;
3074 search->ms_allocator = msp->ms_allocator;
3075 search->ms_primary = msp->ms_primary;
3076 }
3077 return (msp);
3078}
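find_valid_metaslab() factors the tree walk out of metaslab_group_alloc_normal(): the caller passes a throwaway 'search' metaslab whose sort keys (ms_weight, ms_start, and now ms_allocator/ms_primary) are bumped just past the last candidate tried, so each pass resumes where the previous one stopped even though mg_lock is dropped in between attempts. The resume-from-a-sentinel idiom on an illumos AVL tree looks roughly like this (assuming the avl(9F) interfaces; simplified keys):

#include <sys/avl.h>
#include <stdint.h>

typedef struct item {
	uint64_t	i_weight;	/* primary key, sorted descending */
	uint64_t	i_id;		/* tie-breaker, keeps keys unique */
	avl_node_t	i_node;
} item_t;

/*
 * Return the first item whose keys sort at or after 'search'.  The caller
 * copies the keys of whatever it just tried into 'search' (nudged past the
 * exact value) before the next call, so repeated passes never revisit an
 * entry even though the tree lock is dropped between attempts.
 */
static item_t *
next_candidate(avl_tree_t *t, item_t *search)
{
	avl_index_t where;
	item_t *it = avl_find(t, search, &where);

	if (it == NULL)
		it = avl_nearest(t, where, AVL_AFTER);
	return (it);
}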
3079
3080/* ARGSUSED */
2820static uint64_t
2821metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
3081static uint64_t
3082metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2822 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
3083 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
3084 int allocator)
2823{
2824 metaslab_t *msp = NULL;
2825 uint64_t offset = -1ULL;
2826 uint64_t activation_weight;
3085{
3086 metaslab_t *msp = NULL;
3087 uint64_t offset = -1ULL;
3088 uint64_t activation_weight;
2827 uint64_t target_distance;
2828 int i;
3089 boolean_t tertiary = B_FALSE;
2829
2830 activation_weight = METASLAB_WEIGHT_PRIMARY;
3090
3091 activation_weight = METASLAB_WEIGHT_PRIMARY;
2831 for (i = 0; i < d; i++) {
2832 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
3092 for (int i = 0; i < d; i++) {
3093 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
3094 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2833 activation_weight = METASLAB_WEIGHT_SECONDARY;
3095 activation_weight = METASLAB_WEIGHT_SECONDARY;
3096 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
3097 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
3098 tertiary = B_TRUE;
2834 break;
2835 }
2836 }
2837
3099 break;
3100 }
3101 }
3102
3103 /*
3104 * If we don't have enough metaslabs active to fill the entire array, we
3105 * just use the 0th slot.
3106 */
3107 if (mg->mg_ms_ready < mg->mg_allocators * 2) {
3108 tertiary = B_FALSE;
3109 allocator = 0;
3110 }
3111
3112 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
3113
2838 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2839 search->ms_weight = UINT64_MAX;
2840 search->ms_start = 0;
3114 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
3115 search->ms_weight = UINT64_MAX;
3116 search->ms_start = 0;
3117 /*
3118 * At the end of the metaslab tree are the already-active metaslabs,
3119 * first the primaries, then the secondaries. When we resume searching
3120 * through the tree, we need to consider ms_allocator and ms_primary so
3121 * we start in the location right after where we left off, and don't
3122 * accidentally loop forever considering the same metaslabs.
3123 */
3124 search->ms_allocator = -1;
3125 search->ms_primary = B_TRUE;
2841 for (;;) {
3126 for (;;) {
2842 boolean_t was_active;
2843 avl_tree_t *t = &mg->mg_metaslab_tree;
2844 avl_index_t idx;
3127 boolean_t was_active = B_FALSE;
2845
2846 mutex_enter(&mg->mg_lock);
2847
3128
3129 mutex_enter(&mg->mg_lock);
3130
2848 /*
2849 * Find the metaslab with the highest weight that is less
2850 * than what we've already tried. In the common case, this
2851 * means that we will examine each metaslab at most once.
2852 * Note that concurrent callers could reorder metaslabs
2853 * by activation/passivation once we have dropped the mg_lock.
2854 * If a metaslab is activated by another thread, and we fail
2855 * to allocate from the metaslab we have selected, we may
2856 * not try the newly-activated metaslab, and instead activate
2857 * another metaslab. This is not optimal, but generally
2858 * does not cause any problems (a possible exception being
2859 * if every metaslab is completely full except for the
2861 * newly-activated metaslab which we fail to examine).
2861 */
2862 msp = avl_find(t, search, &idx);
2863 if (msp == NULL)
2864 msp = avl_nearest(t, idx, AVL_AFTER);
2865 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2866
2867 if (!metaslab_should_allocate(msp, asize)) {
2868 metaslab_trace_add(zal, mg, msp, asize, d,
2869 TRACE_TOO_SMALL);
2870 continue;
2871 }
2872
2873 /*
2874 * If the selected metaslab is condensing, skip it.
2875 */
2876 if (msp->ms_condensing)
2877 continue;
2878
2879 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2880 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2881 break;
2882
2883 target_distance = min_distance +
2884 (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2885 min_distance >> 1);
2886
2887 for (i = 0; i < d; i++) {
2888 if (metaslab_distance(msp, &dva[i]) <
2889 target_distance)
2890 break;
2891 }
2892 if (i == d)
2893 break;
3131 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
3132 mg->mg_primaries[allocator] != NULL) {
3133 msp = mg->mg_primaries[allocator];
3134 was_active = B_TRUE;
3135 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
3136 mg->mg_secondaries[allocator] != NULL && !tertiary) {
3137 msp = mg->mg_secondaries[allocator];
3138 was_active = B_TRUE;
3139 } else {
3140 msp = find_valid_metaslab(mg, activation_weight, dva, d,
3141 min_distance, asize, allocator, zal, search,
3142 &was_active);
2894 }
3143 }
3144
2895 mutex_exit(&mg->mg_lock);
2896 if (msp == NULL) {
2897 kmem_free(search, sizeof (*search));
2898 return (-1ULL);
2899 }
3145 mutex_exit(&mg->mg_lock);
3146 if (msp == NULL) {
3147 kmem_free(search, sizeof (*search));
3148 return (-1ULL);
3149 }
2900 search->ms_weight = msp->ms_weight;
2901 search->ms_start = msp->ms_start + 1;
2902
2903 mutex_enter(&msp->ms_lock);
3150
3151 mutex_enter(&msp->ms_lock);
2904
2905 /*
2906 * Ensure that the metaslab we have selected is still
2907 * capable of handling our request. It's possible that
2908 * another thread may have changed the weight while we
2909 * were blocked on the metaslab lock. We check the
2910 * active status first to see if we need to reselect
2911 * a new metaslab.
2912 */
2913 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2914 mutex_exit(&msp->ms_lock);
2915 continue;
2916 }
2917
3152 /*
3153 * Ensure that the metaslab we have selected is still
3154 * capable of handling our request. It's possible that
3155 * another thread may have changed the weight while we
3156 * were blocked on the metaslab lock. We check the
3157 * active status first to see if we need to reselect
3158 * a new metaslab.
3159 */
3160 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
3161 mutex_exit(&msp->ms_lock);
3162 continue;
3163 }
3164
2918 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2919 activation_weight == METASLAB_WEIGHT_PRIMARY) {
2920 metaslab_passivate(msp,
2921 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
3165 /*
3166 * If the metaslab is freshly activated for an allocator that
3167 * isn't the one we're allocating from, or if it's a primary and
3168 * we're seeking a secondary (or vice versa), we go back and
3169 * select a new metaslab.
3170 */
3171 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
3172 (msp->ms_allocator != -1) &&
3173 (msp->ms_allocator != allocator || ((activation_weight ==
3174 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
2922 mutex_exit(&msp->ms_lock);
2923 continue;
2924 }
2925
3175 mutex_exit(&msp->ms_lock);
3176 continue;
3177 }
3178
2926 if (metaslab_activate(msp, activation_weight) != 0) {
3179 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
3180 metaslab_passivate(msp, msp->ms_weight &
3181 ~METASLAB_WEIGHT_CLAIM);
2927 mutex_exit(&msp->ms_lock);
2928 continue;
2929 }
3182 mutex_exit(&msp->ms_lock);
3183 continue;
3184 }
3185
3186 if (metaslab_activate(msp, allocator, activation_weight) != 0) {
3187 mutex_exit(&msp->ms_lock);
3188 continue;
3189 }
3190
2930 msp->ms_selected_txg = txg;
2931
2932 /*
2933 * Now that we have the lock, recheck to see if we should
2934 * continue to use this metaslab for this allocation. The
2935 * the metaslab is now loaded so metaslab_should_allocate() can
2936 * accurately determine if the allocation attempt should
2937 * proceed.
2938 */
2939 if (!metaslab_should_allocate(msp, asize)) {
2940 /* Passivate this metaslab and select a new one. */
2941 metaslab_trace_add(zal, mg, msp, asize, d,
3191 msp->ms_selected_txg = txg;
3192
3193 /*
3194 * Now that we have the lock, recheck to see if we should
3195 * continue to use this metaslab for this allocation. The
3196 * metaslab is now loaded so metaslab_should_allocate() can
3197 * accurately determine if the allocation attempt should
3198 * proceed.
3199 */
3200 if (!metaslab_should_allocate(msp, asize)) {
3201 /* Passivate this metaslab and select a new one. */
3202 metaslab_trace_add(zal, mg, msp, asize, d,
2942 TRACE_TOO_SMALL);
3203 TRACE_TOO_SMALL, allocator);
2943 goto next;
2944 }
2945
2946 /*
2947 * If this metaslab is currently condensing then pick again as
2948 * we can't manipulate this metaslab until it's committed
2949 * to disk.
2950 */
2951 if (msp->ms_condensing) {
2952 metaslab_trace_add(zal, mg, msp, asize, d,
3204 goto next;
3205 }
3206
3207 /*
3208 * If this metaslab is currently condensing then pick again as
3209 * we can't manipulate this metaslab until it's committed
3210 * to disk.
3211 */
3212 if (msp->ms_condensing) {
3213 metaslab_trace_add(zal, mg, msp, asize, d,
2953 TRACE_CONDENSING);
3214 TRACE_CONDENSING, allocator);
3215 metaslab_passivate(msp, msp->ms_weight &
3216 ~METASLAB_ACTIVE_MASK);
2954 mutex_exit(&msp->ms_lock);
2955 continue;
2956 }
2957
2958 offset = metaslab_block_alloc(msp, asize, txg);
3217 mutex_exit(&msp->ms_lock);
3218 continue;
3219 }
3220
3221 offset = metaslab_block_alloc(msp, asize, txg);
2959 metaslab_trace_add(zal, mg, msp, asize, d, offset);
3222 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
2960
2961 if (offset != -1ULL) {
2962 /* Proactively passivate the metaslab, if needed */
2963 metaslab_segment_may_passivate(msp);
2964 break;
2965 }
2966next:
2967 ASSERT(msp->ms_loaded);

--- 39 unchanged lines hidden (view full) ---

3007 }
3008 mutex_exit(&msp->ms_lock);
3009 kmem_free(search, sizeof (*search));
3010 return (offset);
3011}
3012
3013static uint64_t
3014metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
3223
3224 if (offset != -1ULL) {
3225 /* Proactively passivate the metaslab, if needed */
3226 metaslab_segment_may_passivate(msp);
3227 break;
3228 }
3229next:
3230 ASSERT(msp->ms_loaded);

--- 39 unchanged lines hidden (view full) ---

3270 }
3271 mutex_exit(&msp->ms_lock);
3272 kmem_free(search, sizeof (*search));
3273 return (offset);
3274}
3275
3276static uint64_t
3277metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
3015 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
3278 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
3279 int allocator)
3016{
3017 uint64_t offset;
3018 ASSERT(mg->mg_initialized);
3019
3020 offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
3280{
3281 uint64_t offset;
3282 ASSERT(mg->mg_initialized);
3283
3284 offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
3021 min_distance, dva, d);
3285 min_distance, dva, d, allocator);
3022
3023 mutex_enter(&mg->mg_lock);
3024 if (offset == -1ULL) {
3025 mg->mg_failed_allocations++;
3026 metaslab_trace_add(zal, mg, NULL, asize, d,
3286
3287 mutex_enter(&mg->mg_lock);
3288 if (offset == -1ULL) {
3289 mg->mg_failed_allocations++;
3290 metaslab_trace_add(zal, mg, NULL, asize, d,
3027 TRACE_GROUP_FAILURE);
3291 TRACE_GROUP_FAILURE, allocator);
3028 if (asize == SPA_GANGBLOCKSIZE) {
3029 /*
3030 * This metaslab group was unable to allocate
3031 * the minimum gang block size so it must be out of
3032 * space. We must notify the allocation throttle
3033 * to start skipping allocation attempts to this
3034 * metaslab group until more space becomes available.
3035 * Note: this failure cannot be caused by the

--- 18 unchanged lines hidden (view full) ---

3054int ditto_same_vdev_distance_shift = 3;
3055
3056/*
3057 * Allocate a block for the specified i/o.
3058 */
3059int
3060metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
3061 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
3292 if (asize == SPA_GANGBLOCKSIZE) {
3293 /*
3294 * This metaslab group was unable to allocate
3295 * the minimum gang block size so it must be out of
3296 * space. We must notify the allocation throttle
3297 * to start skipping allocation attempts to this
3298 * metaslab group until more space becomes available.
3299 * Note: this failure cannot be caused by the

--- 18 unchanged lines hidden (view full) ---

3318int ditto_same_vdev_distance_shift = 3;
3319
3320/*
3321 * Allocate a block for the specified i/o.
3322 */
3323int
3324metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
3325 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
3062 zio_alloc_list_t *zal)
3326 zio_alloc_list_t *zal, int allocator)
3063{
3064 metaslab_group_t *mg, *rotor;
3065 vdev_t *vd;
3066 boolean_t try_hard = B_FALSE;
3067
3068 ASSERT(!DVA_IS_VALID(&dva[d]));
3069
3070 /*
3071 * For testing, make some blocks above a certain size be gang blocks.
3072 */
3073 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
3327{
3328 metaslab_group_t *mg, *rotor;
3329 vdev_t *vd;
3330 boolean_t try_hard = B_FALSE;
3331
3332 ASSERT(!DVA_IS_VALID(&dva[d]));
3333
3334 /*
3335 * For testing, make some blocks above a certain size be gang blocks.
3336 */
3337 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
3074 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
3338 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
3339 allocator);
3075 return (SET_ERROR(ENOSPC));
3076 }
3077
3078 /*
3079 * Start at the rotor and loop through all mgs until we find something.
3080 * Note that there's no locking on mc_rotor or mc_aliquot because
3081 * nothing actually breaks if we miss a few updates -- we just won't
3082 * allocate quite as evenly. It all balances out over time.

--- 69 unchanged lines hidden (view full) ---

3152 * Determine if the selected metaslab group is eligible
3153 * for allocations. If we're ganging then don't allow
3154 * this metaslab group to skip allocations since that would
3155 * inadvertently return ENOSPC and suspend the pool
3156 * even though space is still available.
3157 */
3158 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3159 allocatable = metaslab_group_allocatable(mg, rotor,
3340 return (SET_ERROR(ENOSPC));
3341 }
3342
3343 /*
3344 * Start at the rotor and loop through all mgs until we find something.
3345 * Note that there's no locking on mc_rotor or mc_aliquot because
3346 * nothing actually breaks if we miss a few updates -- we just won't
3347 * allocate quite as evenly. It all balances out over time.

--- 69 unchanged lines hidden (view full) ---

3417 * Determine if the selected metaslab group is eligible
3418 * for allocations. If we're ganging then don't allow
3419 * this metaslab group to skip allocations since that would
3420 * inadvertently return ENOSPC and suspend the pool
3421 * even though space is still available.
3422 */
3423 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3424 allocatable = metaslab_group_allocatable(mg, rotor,
3160 psize);
3425 psize, allocator);
3161 }
3162
3163 if (!allocatable) {
3164 metaslab_trace_add(zal, mg, NULL, psize, d,
3426 }
3427
3428 if (!allocatable) {
3429 metaslab_trace_add(zal, mg, NULL, psize, d,
3165 TRACE_NOT_ALLOCATABLE);
3430 TRACE_NOT_ALLOCATABLE, allocator);
3166 goto next;
3167 }
3168
3169 ASSERT(mg->mg_initialized);
3170
3171 /*
3172 * Avoid writing single-copy data to a failing,
3173 * non-redundant vdev, unless we've already tried all
3174 * other vdevs.
3175 */
3176 if ((vd->vdev_stat.vs_write_errors > 0 ||
3177 vd->vdev_state < VDEV_STATE_HEALTHY) &&
3178 d == 0 && !try_hard && vd->vdev_children == 0) {
3179 metaslab_trace_add(zal, mg, NULL, psize, d,
3431 goto next;
3432 }
3433
3434 ASSERT(mg->mg_initialized);
3435
3436 /*
3437 * Avoid writing single-copy data to a failing,
3438 * non-redundant vdev, unless we've already tried all
3439 * other vdevs.
3440 */
3441 if ((vd->vdev_stat.vs_write_errors > 0 ||
3442 vd->vdev_state < VDEV_STATE_HEALTHY) &&
3443 d == 0 && !try_hard && vd->vdev_children == 0) {
3444 metaslab_trace_add(zal, mg, NULL, psize, d,
3180 TRACE_VDEV_ERROR);
3445 TRACE_VDEV_ERROR, allocator);
3181 goto next;
3182 }
3183
3184 ASSERT(mg->mg_class == mc);
3185
3186 /*
3187 * If we don't need to try hard, then require that the
3188 * block be 1/8th of the device away from any other DVAs

--- 7 unchanged lines hidden (view full) ---

3196 if (distance <= (1ULL << vd->vdev_ms_shift))
3197 distance = 0;
3198 }
3199
3200 uint64_t asize = vdev_psize_to_asize(vd, psize);
3201 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3202
3203 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3446 goto next;
3447 }
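/*
 * Editor's note -- illustrative sketch.  The test above declines to place
 * the first DVA of a block (d == 0, possibly its only copy) on a leaf vdev
 * that has logged write errors or is not fully healthy, unless the try_hard
 * pass has already forced every vdev to be considered.  Hypothetical
 * standalone restatement of the predicate:
 */
#include <stdint.h>
#include <stdbool.h>

static bool
avoid_failing_vdev_sketch(uint64_t write_errors, bool vdev_healthy,
    int dva_index, bool try_hard, int vdev_children)
{
	bool vdev_suspect = (write_errors > 0 || !vdev_healthy);

	/* Only the first copy on a non-redundant (leaf) vdev is redirected. */
	return (vdev_suspect && dva_index == 0 && !try_hard &&
	    vdev_children == 0);
}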
3448
3449 ASSERT(mg->mg_class == mc);
3450
3451 /*
3452 * If we don't need to try hard, then require that the
3453 * block be 1/8th of the device away from any other DVAs

--- 7 unchanged lines hidden (view full) ---

3461 if (distance <= (1ULL << vd->vdev_ms_shift))
3462 distance = 0;
3463 }
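/*
 * Editor's note -- illustrative sketch.  The hidden lines compute "distance"
 * so that, on the fast pass, additional DVAs of the same block land roughly
 * 1/8th of the device away from each other; the visible code then drops the
 * constraint when it is no larger than a single metaslab, where it would buy
 * nothing.  The hypothetical helper below captures that shape; the exact
 * computation in the hidden lines may differ.
 */
#include <stdint.h>

static uint64_t
dva_spread_distance_sketch(uint64_t vdev_asize, uint64_t ms_shift,
    int try_hard)
{
	uint64_t distance = 0;

	/* Only the fast pass constrains placement; try_hard takes anything. */
	if (!try_hard)
		distance = vdev_asize >> 3;	/* 1/8th of the device */

	/* A constraint no larger than one metaslab is not worth keeping. */
	if (distance <= (1ULL << ms_shift))
		distance = 0;

	return (distance);
}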
3464
3465 uint64_t asize = vdev_psize_to_asize(vd, psize);
3466 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3467
3468 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3204 distance, dva, d);
3469 distance, dva, d, allocator);
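/*
 * Editor's note.  vdev_psize_to_asize() converts the I/O size into the space
 * actually consumed on this vdev (on raidz, for example, that includes
 * parity and padding), and the ASSERT above checks that the result is a
 * multiple of the vdev's sector size (1 << ashift) -- with ashift = 12 every
 * allocated size must be 4 KB aligned.  P2PHASE(x, align) is simply
 * x % align for power-of-two alignments, as the hypothetical check below
 * restates:
 */
#include <stdint.h>
#include <assert.h>

static void
check_asize_alignment_sketch(uint64_t asize, uint64_t ashift)
{
	uint64_t align = 1ULL << ashift;

	/* Equivalent to P2PHASE(asize, 1ULL << ashift) == 0. */
	assert((asize & (align - 1)) == 0);
}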
3205
3206 if (offset != -1ULL) {
3207 /*
3208 * If we've just selected this metaslab group,
3209 * figure out whether the corresponding vdev is
3210 * over- or under-used relative to the pool,
3211 * and set an allocation bias to even it out.
3212 */
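/*
 * Editor's note -- illustrative sketch.  The hidden lines compare how full
 * this vdev is with how full the class is overall and derive a bias that
 * grows or shrinks the group's share of the rotor, steering new writes
 * toward emptier vdevs.  The hypothetical helper below expresses that idea
 * with utilization in percent; the exact formula in the hidden code may
 * differ.
 */
#include <stdint.h>

static int64_t
allocation_bias_sketch(uint64_t vdev_alloc, uint64_t vdev_space,
    uint64_t class_alloc, uint64_t class_space, uint64_t mg_aliquot)
{
	/* The +1 avoids division by zero on brand-new, empty devices. */
	int64_t vdev_used = (int64_t)((vdev_alloc * 100) / (vdev_space + 1));
	int64_t class_used = (int64_t)((class_alloc * 100) / (class_space + 1));

	/* Under-used vdevs (vdev_used < class_used) get a positive bias. */
	return (((class_used - vdev_used) * (int64_t)mg_aliquot) / 100);
}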

--- 46 unchanged lines hidden (view full) ---

3259 */
3260 if (!try_hard) {
3261 try_hard = B_TRUE;
3262 goto top;
3263 }
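/*
 * Editor's note -- illustrative sketch.  DVA allocation is two-pass: the
 * first pass applies the optional restrictions seen above (skip throttled
 * groups, avoid suspect vdevs, keep copies 1/8th of a device apart), and
 * only if every group fails does the code set try_hard and retry from the
 * rotor with those restrictions relaxed before returning ENOSPC.  Skeleton
 * of that control flow, with the per-group walk abstracted into a
 * hypothetical callback:
 */
#include <stdbool.h>

static int
two_pass_alloc_sketch(bool (*try_all_groups)(bool try_hard, void *arg),
    void *arg)
{
	if (try_all_groups(false, arg))	/* fast pass, restrictions applied */
		return (0);
	if (try_all_groups(true, arg))	/* try_hard pass, relaxed */
		return (0);
	return (-1);			/* caller maps this to ENOSPC */
}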
3264
3265 bzero(&dva[d], sizeof (dva_t));
3266
3470
3471 if (offset != -1ULL) {
3472 /*
3473 * If we've just selected this metaslab group,
3474 * figure out whether the corresponding vdev is
3475 * over- or under-used relative to the pool,
3476 * and set an allocation bias to even it out.
3477 */

--- 46 unchanged lines hidden (view full) ---

3524 */
3525 if (!try_hard) {
3526 try_hard = B_TRUE;
3527 goto top;
3528 }
3529
3530 bzero(&dva[d], sizeof (dva_t));
3531
3267 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3532 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
3268 return (SET_ERROR(ENOSPC));
3269}
3270
3271void
3272metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
3273 boolean_t checkpoint)
3274{
3275 metaslab_t *msp;

--- 284 unchanged lines hidden (view full) ---

3560/*
3561 * Reserve some allocation slots. The reservation system must be called
3562 * before we call into the allocator. If there aren't any available slots
3563 * then the I/O will be throttled until an I/O completes and its slots are
3564 * freed up. The function returns true if it was successful in placing
3565 * the reservation.
3566 */
3567boolean_t
3533 return (SET_ERROR(ENOSPC));
3534}
3535
3536void
3537metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
3538 boolean_t checkpoint)
3539{
3540 metaslab_t *msp;

--- 284 unchanged lines hidden (view full) ---

3825/*
3826 * Reserve some allocation slots. The reservation system must be called
3827 * before we call into the allocator. If there aren't any available slots
3828 * then the I/O will be throttled until an I/O completes and its slots are
3829 * freed up. The function returns true if it was successful in placing
3830 * the reservation.
3831 */
3832boolean_t
3568metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3569 int flags)
3833metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
3834 zio_t *zio, int flags)
3570{
3571 uint64_t available_slots = 0;
3572 boolean_t slot_reserved = B_FALSE;
3835{
3836 uint64_t available_slots = 0;
3837 boolean_t slot_reserved = B_FALSE;
3838 uint64_t max = mc->mc_alloc_max_slots[allocator];
3573
3574 ASSERT(mc->mc_alloc_throttle_enabled);
3575 mutex_enter(&mc->mc_lock);
3576
3839
3840 ASSERT(mc->mc_alloc_throttle_enabled);
3841 mutex_enter(&mc->mc_lock);
3842
3577 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3578 if (reserved_slots < mc->mc_alloc_max_slots)
3579 available_slots = mc->mc_alloc_max_slots - reserved_slots;
3843 uint64_t reserved_slots =
3844 refcount_count(&mc->mc_alloc_slots[allocator]);
3845 if (reserved_slots < max)
3846 available_slots = max - reserved_slots;
3580
3581 if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3582 /*
3583 * We reserve the slots individually so that we can unreserve
3584 * them individually when an I/O completes.
3585 */
3586 for (int d = 0; d < slots; d++) {
3847
3848 if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3849 /*
3850 * We reserve the slots individually so that we can unreserve
3851 * them individually when an I/O completes.
3852 */
3853 for (int d = 0; d < slots; d++) {
3587 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3854 reserved_slots =
3855 refcount_add(&mc->mc_alloc_slots[allocator],
3856 zio);
3588 }
3589 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3590 slot_reserved = B_TRUE;
3591 }
3592
3593 mutex_exit(&mc->mc_lock);
3594 return (slot_reserved);
3595}
3596
3597void
3857 }
3858 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3859 slot_reserved = B_TRUE;
3860 }
3861
3862 mutex_exit(&mc->mc_lock);
3863 return (slot_reserved);
3864}
3865
3866void
3598metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3867metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
3868 int allocator, zio_t *zio)
3599{
3600 ASSERT(mc->mc_alloc_throttle_enabled);
3601 mutex_enter(&mc->mc_lock);
3602 for (int d = 0; d < slots; d++) {
3869{
3870 ASSERT(mc->mc_alloc_throttle_enabled);
3871 mutex_enter(&mc->mc_lock);
3872 for (int d = 0; d < slots; d++) {
3603 (void) refcount_remove(&mc->mc_alloc_slots, zio);
3873 (void) refcount_remove(&mc->mc_alloc_slots[allocator],
3874 zio);
3604 }
3605 mutex_exit(&mc->mc_lock);
3606}
3607
3608static int
3609metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
3610 uint64_t txg)
3611{

--- 5 unchanged lines hidden (view full) ---

3617 return (ENXIO);
3618
3619 ASSERT3P(vd->vdev_ms, !=, NULL);
3620 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3621
3622 mutex_enter(&msp->ms_lock);
3623
3624 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3875 }
3876 mutex_exit(&mc->mc_lock);
3877}
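/*
 * Editor's note -- illustrative sketch.  Both throttle functions now take an
 * allocator index, so each allocator has its own slot limit
 * (mc_alloc_max_slots[allocator]) and its own refcount of outstanding slots
 * (mc_alloc_slots[allocator]); the refcount is taken once per slot so slots
 * can be released one at a time as the I/O completes.  The hypothetical
 * model below uses a plain counter per allocator and omits the mc_lock that
 * the real code holds around both operations:
 */
#include <stdint.h>
#include <stdbool.h>

struct throttle_sketch {
	uint64_t max_slots[4];	/* per-allocator limits (array size arbitrary) */
	uint64_t used_slots[4];	/* per-allocator outstanding reservations */
};

static bool
throttle_reserve_sketch(struct throttle_sketch *t, int allocator, int slots,
    bool is_gang)
{
	uint64_t avail = 0;

	if (t->used_slots[allocator] < t->max_slots[allocator])
		avail = t->max_slots[allocator] - t->used_slots[allocator];

	/* Gang allocations are always allowed to reserve. */
	if ((uint64_t)slots <= avail || is_gang) {
		t->used_slots[allocator] += slots;
		return (true);
	}
	return (false);
}

static void
throttle_unreserve_sketch(struct throttle_sketch *t, int allocator, int slots)
{
	t->used_slots[allocator] -= slots;
}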
3878
3879static int
3880metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
3881 uint64_t txg)
3882{

--- 5 unchanged lines hidden (view full) ---

3888 return (ENXIO);
3889
3890 ASSERT3P(vd->vdev_ms, !=, NULL);
3891 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3892
3893 mutex_enter(&msp->ms_lock);
3894
3895 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3625 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3896 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
3897 /*
3898 * No need to fail in that case; someone else has activated the
3899 * metaslab, but that doesn't preclude us from using it.
3900 */
3901 if (error == EBUSY)
3902 error = 0;
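/*
 * Editor's note.  metaslab_activate() now takes an allocator index, and the
 * claim path activates with the new METASLAB_WEIGHT_CLAIM flag instead of
 * METASLAB_WEIGHT_SECONDARY.  As the comment above notes, EBUSY only means
 * someone else already activated the metaslab; a claim merely needs the
 * metaslab loaded so it can verify the range below, so the error is dropped.
 * Hypothetical restatement of that error collapse:
 */
#include <errno.h>

static int
claim_activate_result_sketch(int activate_error)
{
	/* "Already active" is benign for a claim; everything else is not. */
	return (activate_error == EBUSY ? 0 : activate_error);
}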
3626
3627 if (error == 0 &&
3628 !range_tree_contains(msp->ms_allocatable, offset, size))
3629 error = SET_ERROR(ENOENT);
3630
3631 if (error || txg == 0) { /* txg == 0 indicates dry run */
3632 mutex_exit(&msp->ms_lock);
3633 return (error);

--- 88 unchanged lines hidden (view full) ---

3722 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3723
3724 return (metaslab_claim_impl(vd, offset, size, txg));
3725}
3726
3727int
3728metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3729 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3903
3904 if (error == 0 &&
3905 !range_tree_contains(msp->ms_allocatable, offset, size))
3906 error = SET_ERROR(ENOENT);
3907
3908 if (error || txg == 0) { /* txg == 0 indicates dry run */
3909 mutex_exit(&msp->ms_lock);
3910 return (error);

--- 88 unchanged lines hidden (view full) ---

3999 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4000
4001 return (metaslab_claim_impl(vd, offset, size, txg));
4002}
4003
4004int
4005metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
4006 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3730 zio_alloc_list_t *zal, zio_t *zio)
4007 zio_alloc_list_t *zal, zio_t *zio, int allocator)
3731{
3732 dva_t *dva = bp->blk_dva;
3733 dva_t *hintdva = hintbp->blk_dva;
3734 int error = 0;
3735
3736 ASSERT(bp->blk_birth == 0);
3737 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3738

--- 6 unchanged lines hidden (view full) ---

3745
3746 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3747 ASSERT(BP_GET_NDVAS(bp) == 0);
3748 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3749 ASSERT3P(zal, !=, NULL);
3750
3751 for (int d = 0; d < ndvas; d++) {
3752 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
4008{
4009 dva_t *dva = bp->blk_dva;
4010 dva_t *hintdva = hintbp->blk_dva;
4011 int error = 0;
4012
4013 ASSERT(bp->blk_birth == 0);
4014 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
4015

--- 6 unchanged lines hidden (view full) ---

4022
4023 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
4024 ASSERT(BP_GET_NDVAS(bp) == 0);
4025 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
4026 ASSERT3P(zal, !=, NULL);
4027
4028 for (int d = 0; d < ndvas; d++) {
4029 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
3753 txg, flags, zal);
4030 txg, flags, zal, allocator);
3754 if (error != 0) {
3755 for (d--; d >= 0; d--) {
3756 metaslab_unalloc_dva(spa, &dva[d], txg);
3757 metaslab_group_alloc_decrement(spa,
4031 if (error != 0) {
4032 for (d--; d >= 0; d--) {
4033 metaslab_unalloc_dva(spa, &dva[d], txg);
4034 metaslab_group_alloc_decrement(spa,
3758 DVA_GET_VDEV(&dva[d]), zio, flags);
4035 DVA_GET_VDEV(&dva[d]), zio, flags,
4036 allocator, B_FALSE);
3759 bzero(&dva[d], sizeof (dva_t));
3760 }
3761 spa_config_exit(spa, SCL_ALLOC, FTAG);
3762 return (error);
3763 } else {
3764 /*
3765 * Update the metaslab group's queue depth
3766 * based on the newly allocated dva.
3767 */
3768 metaslab_group_alloc_increment(spa,
4037 bzero(&dva[d], sizeof (dva_t));
4038 }
4039 spa_config_exit(spa, SCL_ALLOC, FTAG);
4040 return (error);
4041 } else {
4042 /*
4043 * Update the metaslab group's queue depth
4044 * based on the newly allocated dva.
4045 */
4046 metaslab_group_alloc_increment(spa,
3769 DVA_GET_VDEV(&dva[d]), zio, flags);
4047 DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
3770 }
3771
3772 }
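/*
 * Editor's note -- illustrative sketch.  The loop above allocates one DVA
 * per requested copy, threading the zio's allocator index through each call.
 * On failure it unwinds every DVA already placed (freeing the space and
 * decrementing that vdev's per-allocator queue depth); on success it
 * increments the queue depth so the allocation throttle sees the in-flight
 * I/O.  Skeleton of the all-or-nothing pattern, with the per-DVA work
 * abstracted into hypothetical callbacks:
 */
static int
alloc_all_or_nothing_sketch(int ndvas, int (*alloc_one)(int d, void *arg),
    void (*undo_one)(int d, void *arg), void *arg)
{
	for (int d = 0; d < ndvas; d++) {
		int error = alloc_one(d, arg);

		if (error != 0) {
			/* Roll back the copies that did succeed. */
			for (d--; d >= 0; d--)
				undo_one(d, arg);
			return (error);
		}
	}
	return (0);
}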
3773 ASSERT(error == 0);
3774 ASSERT(BP_GET_NDVAS(bp) == ndvas);
3775
3776 spa_config_exit(spa, SCL_ALLOC, FTAG);
3777

--- 152 unchanged lines hidden ---
4048 }
4049
4050 }
4051 ASSERT(error == 0);
4052 ASSERT(BP_GET_NDVAS(bp) == ndvas);
4053
4054 spa_config_exit(spa, SCL_ALLOC, FTAG);
4055

--- 152 unchanged lines hidden ---