metaslab.c (339104) | metaslab.c (339105) |
---|---|
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 6 unchanged lines hidden (view full) --- 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | 1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 6 unchanged lines hidden (view full) --- 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved. | 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. |
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28#include <sys/zfs_context.h> 29#include <sys/dmu.h> 30#include <sys/dmu_tx.h> 31#include <sys/space_map.h> --- 238 unchanged lines hidden (view full) --- 270 * limit is ever reached allowing for further investigation. 271 */ 272uint64_t metaslab_trace_max_entries = 5000; 273 274static uint64_t metaslab_weight(metaslab_t *); 275static void metaslab_set_fragmentation(metaslab_t *); 276static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); 277static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); | 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28#include <sys/zfs_context.h> 29#include <sys/dmu.h> 30#include <sys/dmu_tx.h> 31#include <sys/space_map.h> --- 238 unchanged lines hidden (view full) --- 270 * limit is ever reached allowing for further investigation. 271 */ 272uint64_t metaslab_trace_max_entries = 5000; 273 274static uint64_t metaslab_weight(metaslab_t *); 275static void metaslab_set_fragmentation(metaslab_t *); 276static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); 277static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); |
| 278static void metaslab_passivate(metaslab_t *msp, uint64_t weight); 279static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); |
278 279kmem_cache_t *metaslab_alloc_trace_cache; 280 281/* 282 * ========================================================================== 283 * Metaslab classes 284 * ========================================================================== 285 */ 286metaslab_class_t * 287metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 288{ 289 metaslab_class_t *mc; 290 291 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 292 293 mc->mc_spa = spa; 294 mc->mc_rotor = NULL; 295 mc->mc_ops = ops; 296 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); | 280 281kmem_cache_t *metaslab_alloc_trace_cache; 282 283/* 284 * ========================================================================== 285 * Metaslab classes 286 * ========================================================================== 287 */ 288metaslab_class_t * 289metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 290{ 291 metaslab_class_t *mc; 292 293 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 294 295 mc->mc_spa = spa; 296 mc->mc_rotor = NULL; 297 mc->mc_ops = ops; 298 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); |
297 refcount_create_tracked(&mc->mc_alloc_slots); | 299 mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * 300 sizeof (refcount_t), KM_SLEEP); 301 mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * 302 sizeof (uint64_t), KM_SLEEP); 303 for (int i = 0; i < spa->spa_alloc_count; i++) 304 refcount_create_tracked(&mc->mc_alloc_slots[i]); |
298 299 return (mc); 300} 301 302void 303metaslab_class_destroy(metaslab_class_t *mc) 304{ 305 ASSERT(mc->mc_rotor == NULL); 306 ASSERT(mc->mc_alloc == 0); 307 ASSERT(mc->mc_deferred == 0); 308 ASSERT(mc->mc_space == 0); 309 ASSERT(mc->mc_dspace == 0); 310 | 305 306 return (mc); 307} 308 309void 310metaslab_class_destroy(metaslab_class_t *mc) 311{ 312 ASSERT(mc->mc_rotor == NULL); 313 ASSERT(mc->mc_alloc == 0); 314 ASSERT(mc->mc_deferred == 0); 315 ASSERT(mc->mc_space == 0); 316 ASSERT(mc->mc_dspace == 0); 317 |
311 refcount_destroy(&mc->mc_alloc_slots); | 318 for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) 319 refcount_destroy(&mc->mc_alloc_slots[i]); 320 kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * 321 sizeof (refcount_t)); 322 kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * 323 sizeof (uint64_t)); |
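The two hunks above turn the class-wide throttle state into per-allocator arrays: one tracked refcount and one slot ceiling per allocator, sized by spa_alloc_count and torn down symmetrically in metaslab_class_destroy(). A minimal stand-alone sketch of the pattern, assuming simplified types (the stub counter stands in for the kernel's tracked refcount_t; the struct and function names below are illustrative, not ZFS APIs):

```c
#include <stdint.h>
#include <stdlib.h>

/* Stand-in for the kernel's tracked refcount_t. */
typedef struct { uint64_t rc_count; } refcount_stub_t;

typedef struct {
	int		 tc_alloc_count;     /* cf. spa->spa_alloc_count */
	refcount_stub_t	*tc_alloc_slots;     /* reserved slots, per allocator */
	uint64_t	*tc_alloc_max_slots; /* slot ceiling, per allocator */
} throttle_class_t;

static void
throttle_class_init(throttle_class_t *tc, int count)
{
	tc->tc_alloc_count = count;
	/* One counter and one ceiling per allocator, zero-initialized. */
	tc->tc_alloc_slots = calloc(count, sizeof (refcount_stub_t));
	tc->tc_alloc_max_slots = calloc(count, sizeof (uint64_t));
}

static void
throttle_class_fini(throttle_class_t *tc)
{
	free(tc->tc_alloc_slots);
	free(tc->tc_alloc_max_slots);
	tc->tc_alloc_slots = NULL;
	tc->tc_alloc_max_slots = NULL;
}
```

The same allocate-per-allocator/teardown pattern repeats below for the per-group queue-depth counters in metaslab_group_create() and metaslab_group_destroy().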
312 mutex_destroy(&mc->mc_lock); 313 kmem_free(mc, sizeof (metaslab_class_t)); 314} 315 316int 317metaslab_class_validate(metaslab_class_t *mc) 318{ 319 metaslab_group_t *mg; --- 207 unchanged lines hidden (view full) --- 527} 528 529static int 530metaslab_compare(const void *x1, const void *x2) 531{ 532 const metaslab_t *m1 = x1; 533 const metaslab_t *m2 = x2; 534 | 324 mutex_destroy(&mc->mc_lock); 325 kmem_free(mc, sizeof (metaslab_class_t)); 326} 327 328int 329metaslab_class_validate(metaslab_class_t *mc) 330{ 331 metaslab_group_t *mg; --- 207 unchanged lines hidden (view full) --- 539} 540 541static int 542metaslab_compare(const void *x1, const void *x2) 543{ 544 const metaslab_t *m1 = x1; 545 const metaslab_t *m2 = x2; 546 |
| 547 int sort1 = 0; 548 int sort2 = 0; 549 if (m1->ms_allocator != -1 && m1->ms_primary) 550 sort1 = 1; 551 else if (m1->ms_allocator != -1 && !m1->ms_primary) 552 sort1 = 2; 553 if (m2->ms_allocator != -1 && m2->ms_primary) 554 sort2 = 1; 555 else if (m2->ms_allocator != -1 && !m2->ms_primary) 556 sort2 = 2; 557 558 /* 559 * Sort inactive metaslabs first, then primaries, then secondaries. When 560 * selecting a metaslab to allocate from, an allocator first tries its 561 * primary, then secondary active metaslab. If it doesn't have active 562 * metaslabs, or can't allocate from them, it searches for an inactive 563 * metaslab to activate. If it can't find a suitable one, it will steal 564 * a primary or secondary metaslab from another allocator. 565 */ 566 if (sort1 < sort2) 567 return (-1); 568 if (sort1 > sort2) 569 return (1); 570 |
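The new comparator sorts by activation class before weight. A compile-and-run toy (simplified struct; the field names echo ms_allocator, ms_primary, and ms_weight) showing the resulting order: inactive metaslabs first, heaviest first within a class, and each allocator's active primaries and secondaries pushed to the end of the tree:

```c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Simplified metaslab for the sort demo. */
typedef struct {
	const char *name;
	int allocator;		/* -1 = inactive */
	int primary;
	uint64_t weight;
} ms_demo_t;

static int
ms_demo_compare(const void *x1, const void *x2)
{
	const ms_demo_t *m1 = x1, *m2 = x2;
	int sort1 = 0, sort2 = 0;

	if (m1->allocator != -1)
		sort1 = m1->primary ? 1 : 2;
	if (m2->allocator != -1)
		sort2 = m2->primary ? 1 : 2;

	if (sort1 != sort2)
		return (sort1 < sort2 ? -1 : 1);
	if (m1->weight != m2->weight)
		return (m1->weight < m2->weight ? 1 : -1); /* heavier first */
	return (0);	/* the real code breaks ties on ms_start */
}

int
main(void)
{
	ms_demo_t ms[] = {
		{ "secondary", 0, 0, 900 },
		{ "idle-light", -1, 0, 100 },
		{ "primary", 1, 1, 500 },
		{ "idle-heavy", -1, 0, 800 },
	};
	qsort(ms, 4, sizeof (ms_demo_t), ms_demo_compare);
	/* Prints: idle-heavy idle-light primary secondary */
	for (int i = 0; i < 4; i++)
		printf("%s ", ms[i].name);
	printf("\n");
	return (0);
}
```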
535 if (m1->ms_weight < m2->ms_weight) 536 return (1); 537 if (m1->ms_weight > m2->ms_weight) 538 return (-1); 539 540 /* 541 * If the weights are identical, use the offset to force uniqueness. 542 */ --- 135 unchanged lines hidden (view full) --- 678 else if (!was_allocatable && mg->mg_allocatable) 679 mc->mc_alloc_groups++; 680 mutex_exit(&mc->mc_lock); 681 682 mutex_exit(&mg->mg_lock); 683} 684 685metaslab_group_t * | 571 if (m1->ms_weight < m2->ms_weight) 572 return (1); 573 if (m1->ms_weight > m2->ms_weight) 574 return (-1); 575 576 /* 577 * If the weights are identical, use the offset to force uniqueness. 578 */ --- 135 unchanged lines hidden (view full) --- 714 else if (!was_allocatable && mg->mg_allocatable) 715 mc->mc_alloc_groups++; 716 mutex_exit(&mc->mc_lock); 717 718 mutex_exit(&mg->mg_lock); 719} 720 721metaslab_group_t * |
686metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) | 722metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) |
687{ 688 metaslab_group_t *mg; 689 690 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 691 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); | 723{ 724 metaslab_group_t *mg; 725 726 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 727 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); |
| 728 mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 729 KM_SLEEP); 730 mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), 731 KM_SLEEP); |
692 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 693 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 694 mg->mg_vd = vd; 695 mg->mg_class = mc; 696 mg->mg_activation_count = 0; 697 mg->mg_initialized = B_FALSE; 698 mg->mg_no_free_space = B_TRUE; | 732 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 733 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 734 mg->mg_vd = vd; 735 mg->mg_class = mc; 736 mg->mg_activation_count = 0; 737 mg->mg_initialized = B_FALSE; 738 mg->mg_no_free_space = B_TRUE; |
699 refcount_create_tracked(&mg->mg_alloc_queue_depth); | 739 mg->mg_allocators = allocators; |
700 | 740 |
| 741 mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t), 742 KM_SLEEP); 743 mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * 744 sizeof (uint64_t), KM_SLEEP); 745 for (int i = 0; i < allocators; i++) { 746 refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); 747 mg->mg_cur_max_alloc_queue_depth[i] = 0; 748 } 749 |
701 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 702 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 703 704 return (mg); 705} 706 707void 708metaslab_group_destroy(metaslab_group_t *mg) --- 4 unchanged lines hidden (view full) --- 713 * We may have gone below zero with the activation count 714 * either because we never activated in the first place or 715 * because we're done, and possibly removing the vdev. 716 */ 717 ASSERT(mg->mg_activation_count <= 0); 718 719 taskq_destroy(mg->mg_taskq); 720 avl_destroy(&mg->mg_metaslab_tree); | 750 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 751 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 752 753 return (mg); 754} 755 756void 757metaslab_group_destroy(metaslab_group_t *mg) --- 4 unchanged lines hidden (view full) --- 762 * We may have gone below zero with the activation count 763 * either because we never activated in the first place or 764 * because we're done, and possibly removing the vdev. 765 */ 766 ASSERT(mg->mg_activation_count <= 0); 767 768 taskq_destroy(mg->mg_taskq); 769 avl_destroy(&mg->mg_metaslab_tree); |
| 770 kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); 771 kmem_free(mg->mg_secondaries, mg->mg_allocators * 772 sizeof (metaslab_t *)); |
721 mutex_destroy(&mg->mg_lock); | 773 mutex_destroy(&mg->mg_lock); |
722 refcount_destroy(&mg->mg_alloc_queue_depth); | 774 775 for (int i = 0; i < mg->mg_allocators; i++) { 776 refcount_destroy(&mg->mg_alloc_queue_depth[i]); 777 mg->mg_cur_max_alloc_queue_depth[i] = 0; 778 } 779 kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * 780 sizeof (refcount_t)); 781 kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * 782 sizeof (uint64_t)); 783 |
723 kmem_free(mg, sizeof (metaslab_group_t)); 724} 725 726void 727metaslab_group_activate(metaslab_group_t *mg) 728{ 729 metaslab_class_t *mc = mg->mg_class; 730 metaslab_group_t *mgprev, *mgnext; --- 63 unchanged lines hidden (view full) --- 794 * lower locks to allow the I/O to complete. At a minimum, 795 * we continue to hold the SCL_ALLOC lock, which prevents any future 796 * allocations from taking place and any changes to the vdev tree. 797 */ 798 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); 799 taskq_wait(mg->mg_taskq); 800 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); 801 metaslab_group_alloc_update(mg); | 784 kmem_free(mg, sizeof (metaslab_group_t)); 785} 786 787void 788metaslab_group_activate(metaslab_group_t *mg) 789{ 790 metaslab_class_t *mc = mg->mg_class; 791 metaslab_group_t *mgprev, *mgnext; --- 63 unchanged lines hidden (view full) --- 855 * lower locks to allow the I/O to complete. At a minimum, 856 * we continue to hold the SCL_ALLOC lock, which prevents any future 857 * allocations from taking place and any changes to the vdev tree. 858 */ 859 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); 860 taskq_wait(mg->mg_taskq); 861 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); 862 metaslab_group_alloc_update(mg); |
| 863 for (int i = 0; i < mg->mg_allocators; i++) { 864 metaslab_t *msp = mg->mg_primaries[i]; 865 if (msp != NULL) { 866 mutex_enter(&msp->ms_lock); 867 metaslab_passivate(msp, 868 metaslab_weight_from_range_tree(msp)); 869 mutex_exit(&msp->ms_lock); 870 } 871 msp = mg->mg_secondaries[i]; 872 if (msp != NULL) { 873 mutex_enter(&msp->ms_lock); 874 metaslab_passivate(msp, 875 metaslab_weight_from_range_tree(msp)); 876 mutex_exit(&msp->ms_lock); 877 } 878 } |
802 803 mgprev = mg->mg_prev; 804 mgnext = mg->mg_next; 805 806 if (mg == mgnext) { 807 mc->mc_rotor = NULL; 808 } else { 809 mc->mc_rotor = mgnext; --- 125 unchanged lines hidden (view full) --- 935 mutex_enter(&mg->mg_lock); 936 ASSERT(msp->ms_group == mg); 937 avl_remove(&mg->mg_metaslab_tree, msp); 938 msp->ms_group = NULL; 939 mutex_exit(&mg->mg_lock); 940} 941 942static void | 879 880 mgprev = mg->mg_prev; 881 mgnext = mg->mg_next; 882 883 if (mg == mgnext) { 884 mc->mc_rotor = NULL; 885 } else { 886 mc->mc_rotor = mgnext; --- 125 unchanged lines hidden (view full) --- 1012 mutex_enter(&mg->mg_lock); 1013 ASSERT(msp->ms_group == mg); 1014 avl_remove(&mg->mg_metaslab_tree, msp); 1015 msp->ms_group = NULL; 1016 mutex_exit(&mg->mg_lock); 1017} 1018 1019static void |
| 1020metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 1021{ 1022 ASSERT(MUTEX_HELD(&mg->mg_lock)); 1023 ASSERT(msp->ms_group == mg); 1024 avl_remove(&mg->mg_metaslab_tree, msp); 1025 msp->ms_weight = weight; 1026 avl_add(&mg->mg_metaslab_tree, msp); 1027 1028} 1029 1030static void |
943metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 944{ 945 /* 946 * Although in principle the weight can be any value, in 947 * practice we do not use values in the range [1, 511]. 948 */ 949 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 950 ASSERT(MUTEX_HELD(&msp->ms_lock)); 951 952 mutex_enter(&mg->mg_lock); | 1031metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 1032{ 1033 /* 1034 * Although in principle the weight can be any value, in 1035 * practice we do not use values in the range [1, 511]. 1036 */ 1037 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 1038 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1039 1040 mutex_enter(&mg->mg_lock); |
953 ASSERT(msp->ms_group == mg); 954 avl_remove(&mg->mg_metaslab_tree, msp); 955 msp->ms_weight = weight; 956 avl_add(&mg->mg_metaslab_tree, msp); | 1041 metaslab_group_sort_impl(mg, msp, weight); |
957 mutex_exit(&mg->mg_lock); 958} 959 960/* 961 * Calculate the fragmentation for a given metaslab group. We can use 962 * a simple average here since all metaslabs within the group must have 963 * the same size. The return value will be a value between 0 and 100 964 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this --- 31 unchanged lines hidden (view full) --- 996 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 997 * that can still handle allocations. If the allocation throttle is enabled 998 * then we skip allocations to devices that have reached their maximum 999 * allocation queue depth unless the selected metaslab group is the only 1000 * eligible group remaining. 1001 */ 1002static boolean_t 1003metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, | 1042 mutex_exit(&mg->mg_lock); 1043} 1044 1045/* 1046 * Calculate the fragmentation for a given metaslab group. We can use 1047 * a simple average here since all metaslabs within the group must have 1048 * the same size. The return value will be a value between 0 and 100 1049 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this --- 31 unchanged lines hidden (view full) --- 1081 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 1082 * that can still handle allocations. If the allocation throttle is enabled 1083 * then we skip allocations to devices that have reached their maximum 1084 * allocation queue depth unless the selected metaslab group is the only 1085 * eligible group remaining. 1086 */ 1087static boolean_t 1088metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, |
1004 uint64_t psize) | 1089 uint64_t psize, int allocator) |
1005{ 1006 spa_t *spa = mg->mg_vd->vdev_spa; 1007 metaslab_class_t *mc = mg->mg_class; 1008 1009 /* 1010 * We can only consider skipping this metaslab group if it's 1011 * in the normal metaslab class and there are other metaslab 1012 * groups to select from. Otherwise, we always consider it eligible --- 12 unchanged lines hidden (view full) --- 1025 * If all metaslab groups are no longer considered allocatable 1026 * (mc_alloc_groups == 0) or we're trying to allocate the smallest 1027 * gang block size then we allow allocations on this metaslab group 1028 * regardless of the mg_allocatable or throttle settings. 1029 */ 1030 if (mg->mg_allocatable) { 1031 metaslab_group_t *mgp; 1032 int64_t qdepth; | 1090{ 1091 spa_t *spa = mg->mg_vd->vdev_spa; 1092 metaslab_class_t *mc = mg->mg_class; 1093 1094 /* 1095 * We can only consider skipping this metaslab group if it's 1096 * in the normal metaslab class and there are other metaslab 1097 * groups to select from. Otherwise, we always consider it eligible --- 12 unchanged lines hidden (view full) --- 1110 * If all metaslab groups are no longer considered allocatable 1111 * (mc_alloc_groups == 0) or we're trying to allocate the smallest 1112 * gang block size then we allow allocations on this metaslab group 1113 * regardless of the mg_allocatable or throttle settings. 1114 */ 1115 if (mg->mg_allocatable) { 1116 metaslab_group_t *mgp; 1117 int64_t qdepth; |
1033 uint64_t qmax = mg->mg_max_alloc_queue_depth; | 1118 uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; |
1034 1035 if (!mc->mc_alloc_throttle_enabled) 1036 return (B_TRUE); 1037 1038 /* 1039 * If this metaslab group does not have any free space, then 1040 * there is no point in looking further. 1041 */ 1042 if (mg->mg_no_free_space) 1043 return (B_FALSE); 1044 | 1119 1120 if (!mc->mc_alloc_throttle_enabled) 1121 return (B_TRUE); 1122 1123 /* 1124 * If this metaslab group does not have any free space, then 1125 * there is no point in looking further. 1126 */ 1127 if (mg->mg_no_free_space) 1128 return (B_FALSE); 1129 |
1045 qdepth = refcount_count(&mg->mg_alloc_queue_depth); | 1130 qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]); |
1046 1047 /* 1048 * If this metaslab group is below its qmax or it's 1049 * the only allocatable metaslab group, then attempt 1050 * to allocate from it. 1051 */ 1052 if (qdepth < qmax || mc->mc_alloc_groups == 1) 1053 return (B_TRUE); 1054 ASSERT3U(mc->mc_alloc_groups, >, 1); 1055 1056 /* 1057 * Since this metaslab group is at or over its qmax, we 1058 * need to determine if there are metaslab groups after this 1059 * one that might be able to handle this allocation. This is 1060 * racy since we can't hold the locks for all metaslab 1061 * groups at the same time when we make this check. 1062 */ 1063 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { | 1131 1132 /* 1133 * If this metaslab group is below its qmax or it's 1134 * the only allocatable metaslab group, then attempt 1135 * to allocate from it. 1136 */ 1137 if (qdepth < qmax || mc->mc_alloc_groups == 1) 1138 return (B_TRUE); 1139 ASSERT3U(mc->mc_alloc_groups, >, 1); 1140 1141 /* 1142 * Since this metaslab group is at or over its qmax, we 1143 * need to determine if there are metaslab groups after this 1144 * one that might be able to handle this allocation. This is 1145 * racy since we can't hold the locks for all metaslab 1146 * groups at the same time when we make this check. 1147 */ 1148 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { |
1064 qmax = mgp->mg_max_alloc_queue_depth; | 1149 qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; |
1065 | 1150 |
1066 qdepth = refcount_count(&mgp->mg_alloc_queue_depth); | 1151 qdepth = refcount_count( 1152 &mgp->mg_alloc_queue_depth[allocator]); |
1067 1068 /* 1069 * If there is another metaslab group that 1070 * might be able to handle the allocation, then 1071 * we return false so that we skip this group. 1072 */ 1073 if (qdepth < qmax && !mgp->mg_no_free_space) 1074 return (B_FALSE); --- 391 unchanged lines hidden (view full) --- 1466 1467 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1468 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1469 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 1470 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1471 ms->ms_id = id; 1472 ms->ms_start = id << vd->vdev_ms_shift; 1473 ms->ms_size = 1ULL << vd->vdev_ms_shift; | 1153 1154 /* 1155 * If there is another metaslab group that 1156 * might be able to handle the allocation, then 1157 * we return false so that we skip this group. 1158 */ 1159 if (qdepth < qmax && !mgp->mg_no_free_space) 1160 return (B_FALSE); --- 391 unchanged lines hidden (view full) --- 1552 1553 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1554 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1555 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 1556 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1557 ms->ms_id = id; 1558 ms->ms_start = id << vd->vdev_ms_shift; 1559 ms->ms_size = 1ULL << vd->vdev_ms_shift; |
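With the counters split per allocator, the group-skip decision above now compares one allocator's queue depth against that group's per-allocator ceiling, and skips the group only if some later group on the rotor still has headroom for the same allocator. A stand-alone sketch of that gate, assuming simplified types (the real checks live in metaslab_group_allocatable(); the early outs for a disabled throttle and this group's own mg_no_free_space are omitted):

```c
#include <stdbool.h>
#include <stdint.h>

/* One queue-depth reading and one ceiling per (group, allocator) pair. */
typedef struct group_demo {
	uint64_t *qdepth;		/* refcount_count of queue_depth[i] */
	uint64_t *qmax;			/* mg_cur_max_alloc_queue_depth[i] */
	bool no_free_space;
	struct group_demo *next;	/* next group on the rotor */
} group_demo_t;

static bool
group_allocatable(group_demo_t *mg, group_demo_t *rotor, int allocator,
    int alloc_groups)
{
	if (mg->qdepth[allocator] < mg->qmax[allocator] || alloc_groups == 1)
		return (true);

	/*
	 * Over this group's ceiling: skip it only if a later group on
	 * the rotor still has headroom for the same allocator.
	 */
	for (group_demo_t *mgp = mg->next; mgp != rotor; mgp = mgp->next) {
		if (mgp->qdepth[allocator] < mgp->qmax[allocator] &&
		    !mgp->no_free_space)
			return (false);
	}
	return (true);
}
```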
| 1560 ms->ms_allocator = -1; 1561 ms->ms_new = B_TRUE; |
1474 1475 /* 1476 * We only open space map objects that already exist. All others 1477 * will be opened when we finally allocate an object for it. 1478 */ 1479 if (object != 0) { 1480 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1481 ms->ms_size, vd->vdev_ashift); --- 80 unchanged lines hidden (view full) --- 1562 ASSERT0(msp->ms_deferspace); 1563 1564 range_tree_destroy(msp->ms_checkpointing); 1565 1566 mutex_exit(&msp->ms_lock); 1567 cv_destroy(&msp->ms_load_cv); 1568 mutex_destroy(&msp->ms_lock); 1569 mutex_destroy(&msp->ms_sync_lock); | 1562 1563 /* 1564 * We only open space map objects that already exist. All others 1565 * will be opened when we finally allocate an object for it. 1566 */ 1567 if (object != 0) { 1568 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1569 ms->ms_size, vd->vdev_ashift); --- 80 unchanged lines hidden (view full) --- 1650 ASSERT0(msp->ms_deferspace); 1651 1652 range_tree_destroy(msp->ms_checkpointing); 1653 1654 mutex_exit(&msp->ms_lock); 1655 cv_destroy(&msp->ms_load_cv); 1656 mutex_destroy(&msp->ms_lock); 1657 mutex_destroy(&msp->ms_sync_lock); |
| 1658 ASSERT3U(msp->ms_allocator, ==, -1); |
1570 1571 kmem_free(msp, sizeof (metaslab_t)); 1572} 1573 1574#define FRAGMENTATION_TABLE_SIZE 17 1575 1576/* 1577 * This table defines a segment size based fragmentation metric that will --- 380 unchanged lines hidden (view full) --- 1958 weight = metaslab_segment_weight(msp); 1959 } else { 1960 weight = metaslab_space_weight(msp); 1961 } 1962 return (weight); 1963} 1964 1965static int | 1659 1660 kmem_free(msp, sizeof (metaslab_t)); 1661} 1662 1663#define FRAGMENTATION_TABLE_SIZE 17 1664 1665/* 1666 * This table defines a segment size based fragmentation metric that will --- 380 unchanged lines hidden (view full) --- 2047 weight = metaslab_segment_weight(msp); 2048 } else { 2049 weight = metaslab_space_weight(msp); 2050 } 2051 return (weight); 2052} 2053 2054static int |
1966metaslab_activate(metaslab_t *msp, uint64_t activation_weight) | 2055metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2056 int allocator, uint64_t activation_weight) |
1967{ | 2057{ |
| 2058 /* 2059 * If we're activating for the claim code, we don't want to actually 2060 * set the metaslab up for a specific allocator. 2061 */ 2062 if (activation_weight == METASLAB_WEIGHT_CLAIM) 2063 return (0); 2064 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 2065 mg->mg_primaries : mg->mg_secondaries); 2066 |
1968 ASSERT(MUTEX_HELD(&msp->ms_lock)); | 2067 ASSERT(MUTEX_HELD(&msp->ms_lock)); |
| 2068 mutex_enter(&mg->mg_lock); 2069 if (arr[allocator] != NULL) { 2070 mutex_exit(&mg->mg_lock); 2071 return (EEXIST); 2072 } |
1969 | 2073 |
| 2074 arr[allocator] = msp; 2075 ASSERT3S(msp->ms_allocator, ==, -1); 2076 msp->ms_allocator = allocator; 2077 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 2078 mutex_exit(&mg->mg_lock); 2079 2080 return (0); 2081} 2082 2083static int 2084metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 2085{ 2086 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2087 |
1970 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { | 2088 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { |
| 2089 int error = 0; |
1971 metaslab_load_wait(msp); 1972 if (!msp->ms_loaded) { | 2090 metaslab_load_wait(msp); 2091 if (!msp->ms_loaded) { |
1973 int error = metaslab_load(msp); 1974 if (error) { | 2092 if ((error = metaslab_load(msp)) != 0) { |
1975 metaslab_group_sort(msp->ms_group, msp, 0); 1976 return (error); 1977 } 1978 } | 2093 metaslab_group_sort(msp->ms_group, msp, 0); 2094 return (error); 2095 } 2096 } |
| 2097 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2098 /* 2099 * The metaslab was activated for another allocator 2100 * while we were waiting; we should reselect. 2101 */ 2102 return (EBUSY); 2103 } 2104 if ((error = metaslab_activate_allocator(msp->ms_group, msp, 2105 allocator, activation_weight)) != 0) { 2106 return (error); 2107 } |
1979 1980 msp->ms_activation_weight = msp->ms_weight; 1981 metaslab_group_sort(msp->ms_group, msp, 1982 msp->ms_weight | activation_weight); 1983 } 1984 ASSERT(msp->ms_loaded); 1985 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1986 1987 return (0); 1988} 1989 1990static void | 2108 2109 msp->ms_activation_weight = msp->ms_weight; 2110 metaslab_group_sort(msp->ms_group, msp, 2111 msp->ms_weight | activation_weight); 2112 } 2113 ASSERT(msp->ms_loaded); 2114 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 2115 2116 return (0); 2117} 2118 2119static void |
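Activation now has a per-allocator claim step, so metaslab_activate() can fail even after a successful load: EEXIST when this allocator already owns a primary or secondary (the metaslab_activate_allocator() hunk above), and EBUSY when another allocator activated the metaslab while we slept in metaslab_load_wait(). A toy model of the claim-and-EEXIST contract, assuming nothing from ZFS beyond the idea (no locking shown; the real code holds mg_lock around the claim):

```c
#include <errno.h>
#include <stdio.h>

#define NALLOC 4

/*
 * Toy model of metaslab_activate_allocator(): each allocator owns at
 * most one slot of a given kind; claiming an occupied slot fails with
 * EEXIST and the caller goes back and selects a different metaslab.
 */
static void *slots[NALLOC];

static int
claim_slot(int allocator, void *msp)
{
	if (slots[allocator] != NULL)
		return (EEXIST);	/* caller reselects */
	slots[allocator] = msp;
	return (0);
}

int
main(void)
{
	int a = 1, b = 2;
	printf("%d\n", claim_slot(0, &a));	/* 0: claimed */
	printf("%d\n", claim_slot(0, &b));	/* EEXIST: already owned */
	return (0);
}
```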
| 2120metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2121 uint64_t weight) 2122{ 2123 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2124 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 2125 metaslab_group_sort(mg, msp, weight); 2126 return; 2127 } 2128 2129 mutex_enter(&mg->mg_lock); 2130 ASSERT3P(msp->ms_group, ==, mg); 2131 if (msp->ms_primary) { 2132 ASSERT3U(0, <=, msp->ms_allocator); 2133 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); 2134 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); 2135 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); 2136 mg->mg_primaries[msp->ms_allocator] = NULL; 2137 } else { 2138 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); 2139 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); 2140 mg->mg_secondaries[msp->ms_allocator] = NULL; 2141 } 2142 msp->ms_allocator = -1; 2143 metaslab_group_sort_impl(mg, msp, weight); 2144 mutex_exit(&mg->mg_lock); 2145} 2146 2147static void |
1991metaslab_passivate(metaslab_t *msp, uint64_t weight) 1992{ 1993 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 1994 1995 /* 1996 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1997 * this metaslab again. In that case, it had better be empty, 1998 * or we would be leaving space on the table. 1999 */ 2000 ASSERT(size >= SPA_MINBLOCKSIZE || 2001 range_tree_is_empty(msp->ms_allocatable)); 2002 ASSERT0(weight & METASLAB_ACTIVE_MASK); 2003 2004 msp->ms_activation_weight = 0; | 2148metaslab_passivate(metaslab_t *msp, uint64_t weight) 2149{ 2150 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; 2151 2152 /* 2153 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 2154 * this metaslab again. In that case, it had better be empty, 2155 * or we would be leaving space on the table. 2156 */ 2157 ASSERT(size >= SPA_MINBLOCKSIZE || 2158 range_tree_is_empty(msp->ms_allocatable)); 2159 ASSERT0(weight & METASLAB_ACTIVE_MASK); 2160 2161 msp->ms_activation_weight = 0; |
2005 metaslab_group_sort(msp->ms_group, msp, weight); | 2162 metaslab_passivate_allocator(msp->ms_group, msp, weight); |
2006 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 2007} 2008 2009/* 2010 * Segment-based metaslabs are activated once and remain active until 2011 * we either fail an allocation attempt (similar to space-based metaslabs) 2012 * or have exhausted the free space in zfs_metaslab_switch_threshold 2013 * buckets since the metaslab was activated. This function checks to see --- 537 unchanged lines hidden (view full) --- 2551 if (msp->ms_deferspace != 0) { 2552 /* 2553 * Keep syncing this metaslab until all deferred frees 2554 * are back in circulation. 2555 */ 2556 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2557 } 2558 | 2163 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 2164} 2165 2166/* 2167 * Segment-based metaslabs are activated once and remain active until 2168 * we either fail an allocation attempt (similar to space-based metaslabs) 2169 * or have exhausted the free space in zfs_metaslab_switch_threshold 2170 * buckets since the metaslab was activated. This function checks to see --- 537 unchanged lines hidden (view full) --- 2708 if (msp->ms_deferspace != 0) { 2709 /* 2710 * Keep syncing this metaslab until all deferred frees 2711 * are back in circulation. 2712 */ 2713 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2714 } 2715 |
| 2716 if (msp->ms_new) { 2717 msp->ms_new = B_FALSE; 2718 mutex_enter(&mg->mg_lock); 2719 mg->mg_ms_ready++; 2720 mutex_exit(&mg->mg_lock); 2721 } |
2559 /* 2560 * Calculate the new weights before unloading any metaslabs. 2561 * This will give us the most accurate weighting. 2562 */ | 2722 /* 2723 * Calculate the new weights before unloading any metaslabs. 2724 * This will give us the most accurate weighting. 2725 */ |
2563 metaslab_group_sort(mg, msp, metaslab_weight(msp)); | 2726 metaslab_group_sort(mg, msp, metaslab_weight(msp) | 2727 (msp->ms_weight & METASLAB_ACTIVE_MASK)); |
2564 2565 /* 2566 * If the metaslab is loaded and we've not tried to load or allocate 2567 * from it in 'metaslab_unload_delay' txgs, then unload it. 2568 */ 2569 if (msp->ms_loaded && 2570 msp->ms_selected_txg + metaslab_unload_delay < txg) { 2571 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2572 VERIFY0(range_tree_space( 2573 msp->ms_allocating[(txg + t) & TXG_MASK])); 2574 } | 2728 2729 /* 2730 * If the metaslab is loaded and we've not tried to load or allocate 2731 * from it in 'metaslab_unload_delay' txgs, then unload it. 2732 */ 2733 if (msp->ms_loaded && 2734 msp->ms_selected_txg + metaslab_unload_delay < txg) { 2735 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2736 VERIFY0(range_tree_space( 2737 msp->ms_allocating[(txg + t) & TXG_MASK])); 2738 } |
| 2739 if (msp->ms_allocator != -1) { 2740 metaslab_passivate(msp, msp->ms_weight & 2741 ~METASLAB_ACTIVE_MASK); 2742 } |
2575 2576 if (!metaslab_debug_unload) 2577 metaslab_unload(msp); 2578 } 2579 2580 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2581 ASSERT0(range_tree_space(msp->ms_freeing)); 2582 ASSERT0(range_tree_space(msp->ms_freed)); --- 77 unchanged lines hidden (view full) --- 2660 metaslab_alloc_trace_cache = NULL; 2661} 2662 2663/* 2664 * Add an allocation trace element to the allocation tracing list. 2665 */ 2666static void 2667metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, | 2743 2744 if (!metaslab_debug_unload) 2745 metaslab_unload(msp); 2746 } 2747 2748 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2749 ASSERT0(range_tree_space(msp->ms_freeing)); 2750 ASSERT0(range_tree_space(msp->ms_freed)); --- 77 unchanged lines hidden (view full) --- 2828 metaslab_alloc_trace_cache = NULL; 2829} 2830 2831/* 2832 * Add an allocation trace element to the allocation tracing list. 2833 */ 2834static void 2835metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, |
2668 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset) | 2836 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, 2837 int allocator) |
2669{ 2670 if (!metaslab_trace_enabled) 2671 return; 2672 2673 /* 2674 * When the tracing list reaches its maximum we remove 2675 * the second element in the list before adding a new one. 2676 * By removing the second element we preserve the original --- 16 unchanged lines hidden (view full) --- 2693 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 2694 list_link_init(&mat->mat_list_node); 2695 mat->mat_mg = mg; 2696 mat->mat_msp = msp; 2697 mat->mat_size = psize; 2698 mat->mat_dva_id = dva_id; 2699 mat->mat_offset = offset; 2700 mat->mat_weight = 0; | 2838{ 2839 if (!metaslab_trace_enabled) 2840 return; 2841 2842 /* 2843 * When the tracing list reaches its maximum we remove 2844 * the second element in the list before adding a new one. 2845 * By removing the second element we preserve the original --- 16 unchanged lines hidden (view full) --- 2862 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 2863 list_link_init(&mat->mat_list_node); 2864 mat->mat_mg = mg; 2865 mat->mat_msp = msp; 2866 mat->mat_size = psize; 2867 mat->mat_dva_id = dva_id; 2868 mat->mat_offset = offset; 2869 mat->mat_weight = 0; |
| 2870 mat->mat_allocator = allocator; |
2701 2702 if (msp != NULL) 2703 mat->mat_weight = msp->ms_weight; 2704 2705 /* 2706 * The list is part of the zio so locking is not required. Only 2707 * a single thread will perform allocations for a given zio. 2708 */ --- 24 unchanged lines hidden (view full) --- 2733 2734/* 2735 * ========================================================================== 2736 * Metaslab block operations 2737 * ========================================================================== 2738 */ 2739 2740static void | 2871 2872 if (msp != NULL) 2873 mat->mat_weight = msp->ms_weight; 2874 2875 /* 2876 * The list is part of the zio so locking is not required. Only 2877 * a single thread will perform allocations for a given zio. 2878 */ --- 24 unchanged lines hidden (view full) --- 2903 2904/* 2905 * ========================================================================== 2906 * Metaslab block operations 2907 * ========================================================================== 2908 */ 2909 2910static void |
2741metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) | 2911metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 2912 int allocator) |
2742{ 2743 if (!(flags & METASLAB_ASYNC_ALLOC) || | 2913{ 2914 if (!(flags & METASLAB_ASYNC_ALLOC) || |
2744 flags & METASLAB_DONT_THROTTLE) | 2915 (flags & METASLAB_DONT_THROTTLE)) |
2745 return; 2746 2747 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2748 if (!mg->mg_class->mc_alloc_throttle_enabled) 2749 return; 2750 | 2916 return; 2917 2918 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2919 if (!mg->mg_class->mc_alloc_throttle_enabled) 2920 return; 2921 |
2751 (void) refcount_add(&mg->mg_alloc_queue_depth, tag); | 2922 (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); |
2752} 2753 | 2923} 2924 |
| 2925static void 2926metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 2927{ 2928 uint64_t max = mg->mg_max_alloc_queue_depth; 2929 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 2930 while (cur < max) { 2931 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 2932 cur, cur + 1) == cur) { 2933 atomic_inc_64( 2934 &mg->mg_class->mc_alloc_max_slots[allocator]); 2935 return; 2936 } 2937 cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 2938 } 2939} 2940 |
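metaslab_group_increment_qdepth() above ramps the group's effective queue-depth ceiling toward mg_max_alloc_queue_depth one step at a time, lock-free. A stand-alone C11 rendering of the same loop, as a sketch (the kernel's atomic_cas_64() returns the old value; atomic_compare_exchange_weak() expresses the same retry by reloading the expected value on failure):

```c
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t cur;	/* cf. mg_cur_max_alloc_queue_depth[i] */

/* Raise cur toward max by exactly one, tolerating concurrent callers. */
static int
ramp_one(uint64_t max)
{
	uint64_t old = atomic_load(&cur);
	while (old < max) {
		/* On failure, old is reloaded with the current value. */
		if (atomic_compare_exchange_weak(&cur, &old, old + 1))
			return (1);	/* won the race; raised by one */
	}
	return (0);	/* already at the ceiling */
}
```

In the patch this is invoked from metaslab_group_alloc_decrement() below only when io_complete is set, so the ceiling grows only as allocations actually finish and a throttled allocator earns back queue depth gradually.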
2754void | 2941void |
2755metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) | 2942metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 2943 int allocator, boolean_t io_complete) |
2756{ 2757 if (!(flags & METASLAB_ASYNC_ALLOC) || | 2944{ 2945 if (!(flags & METASLAB_ASYNC_ALLOC) || |
2758 flags & METASLAB_DONT_THROTTLE) | 2946 (flags & METASLAB_DONT_THROTTLE)) |
2759 return; 2760 2761 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2762 if (!mg->mg_class->mc_alloc_throttle_enabled) 2763 return; 2764 | 2947 return; 2948 2949 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2950 if (!mg->mg_class->mc_alloc_throttle_enabled) 2951 return; 2952 |
2765 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); | 2953 (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); 2954 if (io_complete) 2955 metaslab_group_increment_qdepth(mg, allocator); |
2766} 2767 2768void | 2956} 2957 2958void |
2769metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) | 2959metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 2960 int allocator) |
2770{ 2771#ifdef ZFS_DEBUG 2772 const dva_t *dva = bp->blk_dva; 2773 int ndvas = BP_GET_NDVAS(bp); 2774 2775 for (int d = 0; d < ndvas; d++) { 2776 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 2777 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; | 2961{ 2962#ifdef ZFS_DEBUG 2963 const dva_t *dva = bp->blk_dva; 2964 int ndvas = BP_GET_NDVAS(bp); 2965 2966 for (int d = 0; d < ndvas; d++) { 2967 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 2968 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; |
2778 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); | 2969 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator], 2970 tag)); |
2779 } 2780#endif 2781} 2782 2783static uint64_t 2784metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 2785{ 2786 uint64_t start; --- 25 unchanged lines hidden (view full) --- 2812 /* 2813 * Now that we've attempted the allocation we need to update the 2814 * metaslab's maximum block size since it may have changed. 2815 */ 2816 msp->ms_max_size = metaslab_block_maxsize(msp); 2817 return (start); 2818} 2819 | 2971 } 2972#endif 2973} 2974 2975static uint64_t 2976metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 2977{ 2978 uint64_t start; --- 25 unchanged lines hidden (view full) --- 3004 /* 3005 * Now that we've attempted the allocation we need to update the 3006 * metaslab's maximum block size since it may have changed. 3007 */ 3008 msp->ms_max_size = metaslab_block_maxsize(msp); 3009 return (start); 3010} 3011 |
| 3012/* 3013 * Find the metaslab with the highest weight that is less than what we've 3014 * already tried. In the common case, this means that we will examine each 3015 * metaslab at most once. Note that concurrent callers could reorder metaslabs 3016 * by activation/passivation once we have dropped the mg_lock. If a metaslab is 3017 * activated by another thread, and we fail to allocate from the metaslab we 3018 * have selected, we may not try the newly-activated metaslab, and instead 3019 * activate another metaslab. This is not optimal, but generally does not cause 3020 * any problems (a possible exception being if every metaslab is completely full 3021 * except for the newly-activated metaslab which we fail to examine). 3022 */ 3023static metaslab_t * 3024find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, 3025 dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator, 3026 zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) 3027{ 3028 avl_index_t idx; 3029 avl_tree_t *t = &mg->mg_metaslab_tree; 3030 metaslab_t *msp = avl_find(t, search, &idx); 3031 if (msp == NULL) 3032 msp = avl_nearest(t, idx, AVL_AFTER); 3033 3034 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 3035 int i; 3036 if (!metaslab_should_allocate(msp, asize)) { 3037 metaslab_trace_add(zal, mg, msp, asize, d, 3038 TRACE_TOO_SMALL, allocator); 3039 continue; 3040 } 3041 3042 /* 3043 * If the selected metaslab is condensing, skip it. 3044 */ 3045 if (msp->ms_condensing) 3046 continue; 3047 3048 *was_active = msp->ms_allocator != -1; 3049 /* 3050 * If we're activating as primary, this is our first allocation 3051 * from this disk, so we don't need to check how close we are. 3052 * If the metaslab under consideration was already active, 3053 * we're getting desperate enough to steal another allocator's 3054 * metaslab, so we still don't care about distances. 3055 */ 3056 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) 3057 break; 3058 3059 uint64_t target_distance = min_distance 3060 + (space_map_allocated(msp->ms_sm) != 0 ? 0 : 3061 min_distance >> 1); 3062 3063 for (i = 0; i < d; i++) { 3064 if (metaslab_distance(msp, &dva[i]) < target_distance) 3065 break; 3066 } 3067 if (i == d) 3068 break; 3069 } 3070 3071 if (msp != NULL) { 3072 search->ms_weight = msp->ms_weight; 3073 search->ms_start = msp->ms_start + 1; 3074 search->ms_allocator = msp->ms_allocator; 3075 search->ms_primary = msp->ms_primary; 3076 } 3077 return (msp); 3078} 3079 3080/* ARGSUSED */ |
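find_valid_metaslab() resumes its scan with a dummy search node whose key is the last examined metaslab plus one (ms_start + 1; it also copies ms_allocator and ms_primary into the cursor so resumption respects the comparator's new sort keys), which means re-entering the AVL tree after dropping mg_lock never revisits an element or loops. The cursor trick reduced to a sorted array, as a sketch with illustrative names (the real code uses illumos avl_find()/avl_nearest()):

```c
#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t weight, start; } mskey_t;

/* Same ordering as the tree: weight descending, then start ascending. */
static int
key_before(mskey_t a, mskey_t b)
{
	if (a.weight != b.weight)
		return (a.weight > b.weight);
	return (a.start < b.start);
}

/* Linear stand-in for avl_find()/avl_nearest(..., AVL_AFTER). */
static int
first_at_or_after(const mskey_t *v, int n, mskey_t cursor)
{
	for (int i = 0; i < n; i++)
		if (!key_before(v[i], cursor))
			return (i);
	return (-1);
}

int
main(void)
{
	mskey_t v[] = { {9, 4}, {9, 7}, {5, 1} };	/* sorted */
	mskey_t cursor = { 9, 4 + 1 };		/* resume after (9,4) */
	printf("%d\n", first_at_or_after(v, 3, cursor));	/* 1 */
	return (0);
}
```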
2820static uint64_t 2821metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, | 3081static uint64_t 3082metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, |
2822 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) | 3083 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, 3084 int allocator) |
2823{ 2824 metaslab_t *msp = NULL; 2825 uint64_t offset = -1ULL; 2826 uint64_t activation_weight; | 3085{ 3086 metaslab_t *msp = NULL; 3087 uint64_t offset = -1ULL; 3088 uint64_t activation_weight; |
2827 uint64_t target_distance; 2828 int i; | 3089 boolean_t tertiary = B_FALSE; |
2829 2830 activation_weight = METASLAB_WEIGHT_PRIMARY; | 3090 3091 activation_weight = METASLAB_WEIGHT_PRIMARY; |
2831 for (i = 0; i < d; i++) { 2832 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { | 3092 for (int i = 0; i < d; i++) { 3093 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3094 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { |
2833 activation_weight = METASLAB_WEIGHT_SECONDARY; | 3095 activation_weight = METASLAB_WEIGHT_SECONDARY; |
| 3096 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3097 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3098 tertiary = B_TRUE; |
2834 break; 2835 } 2836 } 2837 | 3099 break; 3100 } 3101 } 3102 |
| 3103 /* 3104 * If we don't have enough metaslabs active to fill the entire array, we 3105 * just use the 0th slot. 3106 */ 3107 if (mg->mg_ms_ready < mg->mg_allocators * 2) { 3108 tertiary = B_FALSE; 3109 allocator = 0; 3110 } 3111 3112 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 3113 |
2838 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 2839 search->ms_weight = UINT64_MAX; 2840 search->ms_start = 0; | 3114 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 3115 search->ms_weight = UINT64_MAX; 3116 search->ms_start = 0; |
| 3117 /* 3118 * At the end of the metaslab tree are the already-active metaslabs, 3119 * first the primaries, then the secondaries. When we resume searching 3120 * through the tree, we need to consider ms_allocator and ms_primary so 3121 * we start in the location right after where we left off, and don't 3122 * accidentally loop forever considering the same metaslabs. 3123 */ 3124 search->ms_allocator = -1; 3125 search->ms_primary = B_TRUE; |
2841 for (;;) { | 3126 for (;;) { |
2842 boolean_t was_active; 2843 avl_tree_t *t = &mg->mg_metaslab_tree; 2844 avl_index_t idx; | 3127 boolean_t was_active = B_FALSE; |
2845 2846 mutex_enter(&mg->mg_lock); 2847 | 3128 3129 mutex_enter(&mg->mg_lock); 3130 |
2848 /* 2849 * Find the metaslab with the highest weight that is less 2850 * than what we've already tried. In the common case, this 2851 * means that we will examine each metaslab at most once. 2852 * Note that concurrent callers could reorder metaslabs 2853 * by activation/passivation once we have dropped the mg_lock. 2854 * If a metaslab is activated by another thread, and we fail 2855 * to allocate from the metaslab we have selected, we may 2856 * not try the newly-activated metaslab, and instead activate 2857 * another metaslab. This is not optimal, but generally 2858 * does not cause any problems (a possible exception being 2859 * if every metaslab is completely full except for the 2860 * the newly-activated metaslab which we fail to examine). 2861 */ 2862 msp = avl_find(t, search, &idx); 2863 if (msp == NULL) 2864 msp = avl_nearest(t, idx, AVL_AFTER); 2865 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 2866 2867 if (!metaslab_should_allocate(msp, asize)) { 2868 metaslab_trace_add(zal, mg, msp, asize, d, 2869 TRACE_TOO_SMALL); 2870 continue; 2871 } 2872 2873 /* 2874 * If the selected metaslab is condensing, skip it. 2875 */ 2876 if (msp->ms_condensing) 2877 continue; 2878 2879 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2880 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2881 break; 2882 2883 target_distance = min_distance + 2884 (space_map_allocated(msp->ms_sm) != 0 ? 0 : 2885 min_distance >> 1); 2886 2887 for (i = 0; i < d; i++) { 2888 if (metaslab_distance(msp, &dva[i]) < 2889 target_distance) 2890 break; 2891 } 2892 if (i == d) 2893 break; | 3131 if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3132 mg->mg_primaries[allocator] != NULL) { 3133 msp = mg->mg_primaries[allocator]; 3134 was_active = B_TRUE; 3135 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3136 mg->mg_secondaries[allocator] != NULL && !tertiary) { 3137 msp = mg->mg_secondaries[allocator]; 3138 was_active = B_TRUE; 3139 } else { 3140 msp = find_valid_metaslab(mg, activation_weight, dva, d, 3141 min_distance, asize, allocator, zal, search, 3142 &was_active); |
2894 } | 3143 } |
| 3144 |
2895 mutex_exit(&mg->mg_lock); 2896 if (msp == NULL) { 2897 kmem_free(search, sizeof (*search)); 2898 return (-1ULL); 2899 } | 3145 mutex_exit(&mg->mg_lock); 3146 if (msp == NULL) { 3147 kmem_free(search, sizeof (*search)); 3148 return (-1ULL); 3149 } |
2900 search->ms_weight = msp->ms_weight; 2901 search->ms_start = msp->ms_start + 1; | |
2902 2903 mutex_enter(&msp->ms_lock); | 3150 3151 mutex_enter(&msp->ms_lock); |
2904 | |
2905 /* 2906 * Ensure that the metaslab we have selected is still 2907 * capable of handling our request. It's possible that 2908 * another thread may have changed the weight while we 2909 * were blocked on the metaslab lock. We check the 2910 * active status first to see if we need to reselect 2911 * a new metaslab. 2912 */ 2913 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 2914 mutex_exit(&msp->ms_lock); 2915 continue; 2916 } 2917 | 3152 /* 3153 * Ensure that the metaslab we have selected is still 3154 * capable of handling our request. It's possible that 3155 * another thread may have changed the weight while we 3156 * were blocked on the metaslab lock. We check the 3157 * active status first to see if we need to reselect 3158 * a new metaslab. 3159 */ 3160 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 3161 mutex_exit(&msp->ms_lock); 3162 continue; 3163 } 3164 |
2918 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2919 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2920 metaslab_passivate(msp, 2921 msp->ms_weight & ~METASLAB_ACTIVE_MASK); | 3165 /* 3166 * If the metaslab is freshly activated for an allocator that 3167 * isn't the one we're allocating from, or if it's a primary and 3168 * we're seeking a secondary (or vice versa), we go back and 3169 * select a new metaslab. 3170 */ 3171 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && 3172 (msp->ms_allocator != -1) && 3173 (msp->ms_allocator != allocator || ((activation_weight == 3174 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { |
2922 mutex_exit(&msp->ms_lock); 2923 continue; 2924 } 2925 | 3175 mutex_exit(&msp->ms_lock); 3176 continue; 3177 } 3178 |
2926 if (metaslab_activate(msp, activation_weight) != 0) { | 3179 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { 3180 metaslab_passivate(msp, msp->ms_weight & 3181 ~METASLAB_WEIGHT_CLAIM); |
2927 mutex_exit(&msp->ms_lock); 2928 continue; 2929 } | 3182 mutex_exit(&msp->ms_lock); 3183 continue; 3184 } |
| 3185 3186 if (metaslab_activate(msp, allocator, activation_weight) != 0) { 3187 mutex_exit(&msp->ms_lock); 3188 continue; 3189 } 3190 |
2930 msp->ms_selected_txg = txg; 2931 2932 /* 2933 * Now that we have the lock, recheck to see if we should 2934 * continue to use this metaslab for this allocation. The 2935 * metaslab is now loaded so metaslab_should_allocate() can 2936 * accurately determine if the allocation attempt should 2937 * proceed. 2938 */ 2939 if (!metaslab_should_allocate(msp, asize)) { 2940 /* Passivate this metaslab and select a new one. */ 2941 metaslab_trace_add(zal, mg, msp, asize, d, | 3191 msp->ms_selected_txg = txg; 3192 3193 /* 3194 * Now that we have the lock, recheck to see if we should 3195 * continue to use this metaslab for this allocation. The 3196 * metaslab is now loaded so metaslab_should_allocate() can 3197 * accurately determine if the allocation attempt should 3198 * proceed. 3199 */ 3200 if (!metaslab_should_allocate(msp, asize)) { 3201 /* Passivate this metaslab and select a new one. */ 3202 metaslab_trace_add(zal, mg, msp, asize, d, |
2942 TRACE_TOO_SMALL); | 3203 TRACE_TOO_SMALL, allocator); |
2943 goto next; 2944 } 2945 2946 /* 2947 * If this metaslab is currently condensing then pick again as 2948 * we can't manipulate this metaslab until it's committed 2949 * to disk. 2950 */ 2951 if (msp->ms_condensing) { 2952 metaslab_trace_add(zal, mg, msp, asize, d, | 3204 goto next; 3205 } 3206 3207 /* 3208 * If this metaslab is currently condensing then pick again as 3209 * we can't manipulate this metaslab until it's committed 3210 * to disk. 3211 */ 3212 if (msp->ms_condensing) { 3213 metaslab_trace_add(zal, mg, msp, asize, d, |
2953 TRACE_CONDENSING); | 3214 TRACE_CONDENSING, allocator); 3215 metaslab_passivate(msp, msp->ms_weight & 3216 ~METASLAB_ACTIVE_MASK); |
2954 mutex_exit(&msp->ms_lock); 2955 continue; 2956 } 2957 2958 offset = metaslab_block_alloc(msp, asize, txg); | 3217 mutex_exit(&msp->ms_lock); 3218 continue; 3219 } 3220 3221 offset = metaslab_block_alloc(msp, asize, txg); |
2959 metaslab_trace_add(zal, mg, msp, asize, d, offset); | 3222 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); |
2960 2961 if (offset != -1ULL) { 2962 /* Proactively passivate the metaslab, if needed */ 2963 metaslab_segment_may_passivate(msp); 2964 break; 2965 } 2966next: 2967 ASSERT(msp->ms_loaded); --- 39 unchanged lines hidden (view full) --- 3007 } 3008 mutex_exit(&msp->ms_lock); 3009 kmem_free(search, sizeof (*search)); 3010 return (offset); 3011} 3012 3013static uint64_t 3014metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, | 3223 3224 if (offset != -1ULL) { 3225 /* Proactively passivate the metaslab, if needed */ 3226 metaslab_segment_may_passivate(msp); 3227 break; 3228 } 3229next: 3230 ASSERT(msp->ms_loaded); --- 39 unchanged lines hidden (view full) --- 3270 } 3271 mutex_exit(&msp->ms_lock); 3272 kmem_free(search, sizeof (*search)); 3273 return (offset); 3274} 3275 3276static uint64_t 3277metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, |
3015 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) | 3278 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, 3279 int allocator) |
3016{ 3017 uint64_t offset; 3018 ASSERT(mg->mg_initialized); 3019 3020 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, | 3280{ 3281 uint64_t offset; 3282 ASSERT(mg->mg_initialized); 3283 3284 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, |
3021 min_distance, dva, d); | 3285 min_distance, dva, d, allocator); |
3022 3023 mutex_enter(&mg->mg_lock); 3024 if (offset == -1ULL) { 3025 mg->mg_failed_allocations++; 3026 metaslab_trace_add(zal, mg, NULL, asize, d, | 3286 3287 mutex_enter(&mg->mg_lock); 3288 if (offset == -1ULL) { 3289 mg->mg_failed_allocations++; 3290 metaslab_trace_add(zal, mg, NULL, asize, d, |
3027 TRACE_GROUP_FAILURE); | 3291 TRACE_GROUP_FAILURE, allocator); |
3028 if (asize == SPA_GANGBLOCKSIZE) { 3029 /* 3030 * This metaslab group was unable to allocate 3031 * the minimum gang block size so it must be out of 3032 * space. We must notify the allocation throttle 3033 * to start skipping allocation attempts to this 3034 * metaslab group until more space becomes available. 3035 * Note: this failure cannot be caused by the --- 18 unchanged lines hidden (view full) --- 3054int ditto_same_vdev_distance_shift = 3; 3055 3056/* 3057 * Allocate a block for the specified i/o. 3058 */ 3059int 3060metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 3061 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, | 3292 if (asize == SPA_GANGBLOCKSIZE) { 3293 /* 3294 * This metaslab group was unable to allocate 3295 * the minimum gang block size so it must be out of 3296 * space. We must notify the allocation throttle 3297 * to start skipping allocation attempts to this 3298 * metaslab group until more space becomes available. 3299 * Note: this failure cannot be caused by the --- 18 unchanged lines hidden (view full) --- 3318int ditto_same_vdev_distance_shift = 3; 3319 3320/* 3321 * Allocate a block for the specified i/o. 3322 */ 3323int 3324metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 3325 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, |
3062 zio_alloc_list_t *zal) | 3326 zio_alloc_list_t *zal, int allocator) |
3063{ 3064 metaslab_group_t *mg, *rotor; 3065 vdev_t *vd; 3066 boolean_t try_hard = B_FALSE; 3067 3068 ASSERT(!DVA_IS_VALID(&dva[d])); 3069 3070 /* 3071 * For testing, make some blocks above a certain size be gang blocks. 3072 */ 3073 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { | 3327{ 3328 metaslab_group_t *mg, *rotor; 3329 vdev_t *vd; 3330 boolean_t try_hard = B_FALSE; 3331 3332 ASSERT(!DVA_IS_VALID(&dva[d])); 3333 3334 /* 3335 * For testing, make some blocks above a certain size be gang blocks. 3336 */ 3337 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { |
3074 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG); | 3338 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 3339 allocator); |
3075 return (SET_ERROR(ENOSPC)); 3076 } 3077 3078 /* 3079 * Start at the rotor and loop through all mgs until we find something. 3080 * Note that there's no locking on mc_rotor or mc_aliquot because 3081 * nothing actually breaks if we miss a few updates -- we just won't 3082 * allocate quite as evenly. It all balances out over time. --- 69 unchanged lines hidden (view full) --- 3152 * Determine if the selected metaslab group is eligible 3153 * for allocations. If we're ganging then don't allow 3154 * this metaslab group to skip allocations since that would 3155 * inadvertently return ENOSPC and suspend the pool 3156 * even though space is still available. 3157 */ 3158 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 3159 allocatable = metaslab_group_allocatable(mg, rotor, | 3340 return (SET_ERROR(ENOSPC)); 3341 } 3342 3343 /* 3344 * Start at the rotor and loop through all mgs until we find something. 3345 * Note that there's no locking on mc_rotor or mc_aliquot because 3346 * nothing actually breaks if we miss a few updates -- we just won't 3347 * allocate quite as evenly. It all balances out over time. --- 69 unchanged lines hidden (view full) --- 3417 * Determine if the selected metaslab group is eligible 3418 * for allocations. If we're ganging then don't allow 3419 * this metaslab group to skip allocations since that would 3420 * inadvertently return ENOSPC and suspend the pool 3421 * even though space is still available. 3422 */ 3423 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 3424 allocatable = metaslab_group_allocatable(mg, rotor, |
3160 psize); | 3425 psize, allocator); |
3161 } 3162 3163 if (!allocatable) { 3164 metaslab_trace_add(zal, mg, NULL, psize, d, | 3426 } 3427 3428 if (!allocatable) { 3429 metaslab_trace_add(zal, mg, NULL, psize, d, |
3165 TRACE_NOT_ALLOCATABLE); | 3430 TRACE_NOT_ALLOCATABLE, allocator); |
3166 goto next; 3167 } 3168 3169 ASSERT(mg->mg_initialized); 3170 3171 /* 3172 * Avoid writing single-copy data to a failing, 3173 * non-redundant vdev, unless we've already tried all 3174 * other vdevs. 3175 */ 3176 if ((vd->vdev_stat.vs_write_errors > 0 || 3177 vd->vdev_state < VDEV_STATE_HEALTHY) && 3178 d == 0 && !try_hard && vd->vdev_children == 0) { 3179 metaslab_trace_add(zal, mg, NULL, psize, d, | 3431 goto next; 3432 } 3433 3434 ASSERT(mg->mg_initialized); 3435 3436 /* 3437 * Avoid writing single-copy data to a failing, 3438 * non-redundant vdev, unless we've already tried all 3439 * other vdevs. 3440 */ 3441 if ((vd->vdev_stat.vs_write_errors > 0 || 3442 vd->vdev_state < VDEV_STATE_HEALTHY) && 3443 d == 0 && !try_hard && vd->vdev_children == 0) { 3444 metaslab_trace_add(zal, mg, NULL, psize, d, |
3180 TRACE_VDEV_ERROR); | 3445 TRACE_VDEV_ERROR, allocator); |
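[Editor's note] The TRACE_VDEV_ERROR branch above encodes a deliberately narrow policy. As a hedged standalone predicate (hypothetical helper; conditions copied verbatim from the code): only the first copy of the data (d == 0) is steered away, only from non-redundant leaves (vdev_children == 0), and only until try_hard forces the allocator to accept whatever remains.

    static boolean_t
    metaslab_vdev_looks_risky(vdev_t *vd, int d, boolean_t try_hard)
    {
    	return ((vd->vdev_stat.vs_write_errors > 0 ||
    	    vd->vdev_state < VDEV_STATE_HEALTHY) &&
    	    d == 0 && !try_hard && vd->vdev_children == 0);
    }
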
3181 goto next; 3182 } 3183 3184 ASSERT(mg->mg_class == mc); 3185 3186 /* 3187 * If we don't need to try hard, then require that the 3188 * block be 1/8th of the device away from any other DVAs --- 7 unchanged lines hidden (view full) --- 3196 if (distance <= (1ULL << vd->vdev_ms_shift)) 3197 distance = 0; 3198 } 3199 3200 uint64_t asize = vdev_psize_to_asize(vd, psize); 3201 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 3202 3203 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, | 3446 goto next; 3447 } 3448 3449 ASSERT(mg->mg_class == mc); 3450 3451 /* 3452 * If we don't need to try hard, then require that the 3453 * block be 1/8th of the device away from any other DVAs --- 7 unchanged lines hidden (view full) --- 3461 if (distance <= (1ULL << vd->vdev_ms_shift)) 3462 distance = 0; 3463 } 3464 3465 uint64_t asize = vdev_psize_to_asize(vd, psize); 3466 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 3467 3468 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, |
3204 distance, dva, d); | 3469 distance, dva, d, allocator); |
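[Editor's note] The ditto-block spacing above is worth a worked example. With ditto_same_vdev_distance_shift == 3 (declared earlier in this section), a second DVA on the same vdev must land at least vdev_asize/8 away from the first, unless one eighth of the device is no larger than a single metaslab, in which case the constraint is waived. Hypothetical helper; the logic is inline in the code above.

    static uint64_t
    metaslab_ditto_distance(vdev_t *vd, int d)
    {
    	uint64_t distance = 0;

    	if (d != 0) {
    		distance = vd->vdev_asize >> ditto_same_vdev_distance_shift;
    		if (distance <= (1ULL << vd->vdev_ms_shift))
    			distance = 0;	/* too small to be worth enforcing */
    	}
    	return (distance);
    }
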
3205 3206 if (offset != -1ULL) { 3207 /* 3208 * If we've just selected this metaslab group, 3209 * figure out whether the corresponding vdev is 3210 * over- or under-used relative to the pool, 3211 * and set an allocation bias to even it out. 3212 */ --- 46 unchanged lines hidden (view full) --- 3259 */ 3260 if (!try_hard) { 3261 try_hard = B_TRUE; 3262 goto top; 3263 } 3264 3265 bzero(&dva[d], sizeof (dva_t)); 3266 | 3470 3471 if (offset != -1ULL) { 3472 /* 3473 * If we've just selected this metaslab group, 3474 * figure out whether the corresponding vdev is 3475 * over- or under-used relative to the pool, 3476 * and set an allocation bias to even it out. 3477 */ --- 46 unchanged lines hidden (view full) --- 3524 */ 3525 if (!try_hard) { 3526 try_hard = B_TRUE; 3527 goto top; 3528 } 3529 3530 bzero(&dva[d], sizeof (dva_t)); 3531 |
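[Editor's note] The over/under-use bias mentioned in the comment above is computed inside the hidden hunk. The sketch below is reconstructed from other revisions of this file, so treat it as an approximation rather than a quote: the vdev's percentage-full is compared against the class as a whole, and the difference scales the group's aliquot so emptier vdevs receive proportionally more data.

    static int64_t
    metaslab_group_bias_sketch(metaslab_class_t *mc, metaslab_group_t *mg,
        vdev_t *vd)
    {
    	vdev_stat_t *vs = &vd->vdev_stat;
    	int64_t vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);  /* vdev use % */
    	int64_t cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);  /* class use % */

    	/* Positive bias grows the aliquot of an under-used vdev. */
    	return (((cu - vu) * (int64_t)mg->mg_aliquot) / 100);
    }
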
3267 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC); | 3532 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); |
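[Editor's note] The TRACE_ENOSPC line above is only reached after the two-pass search implied by try_hard in the hidden hunk. A condensed sketch of that shape, where metaslab_search_once() is a hypothetical stand-in for one full walk of the rotor at the given strictness: the first pass honors ditto distance, throttle eligibility, and vdev-health filters; the second drops them before ENOSPC is returned.

    static uint64_t
    metaslab_two_pass_sketch(metaslab_class_t *mc, uint64_t asize)
    {
    	boolean_t try_hard = B_FALSE;
    	uint64_t offset;

    	while ((offset = metaslab_search_once(mc, asize, try_hard)) == -1ULL) {
    		if (try_hard)
    			return (-1ULL);	/* both passes failed: ENOSPC */
    		try_hard = B_TRUE;	/* relax the filters and retry */
    	}
    	return (offset);
    }
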
3268 return (SET_ERROR(ENOSPC)); 3269} 3270 3271void 3272metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 3273 boolean_t checkpoint) 3274{ 3275 metaslab_t *msp; --- 284 unchanged lines hidden (view full) --- 3560/* 3561 * Reserve some allocation slots. The reservation system must be called 3562 * before we call into the allocator. If there aren't any available slots 3563 * then the I/O will be throttled until an I/O completes and its slots are 3564 * freed up. The function returns true if it was successful in placing 3565 * the reservation. 3566 */ 3567boolean_t | 3533 return (SET_ERROR(ENOSPC)); 3534} 3535 3536void 3537metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 3538 boolean_t checkpoint) 3539{ 3540 metaslab_t *msp; --- 284 unchanged lines hidden (view full) --- 3825/* 3826 * Reserve some allocation slots. The reservation system must be called 3827 * before we call into the allocator. If there aren't any available slots 3828 * then the I/O will be throttled until an I/O completes and its slots are 3829 * freed up. The function returns true if it was successful in placing 3830 * the reservation. 3831 */ 3832boolean_t |
3568metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, 3569 int flags) | 3833metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, 3834 zio_t *zio, int flags) |
3570{ 3571 uint64_t available_slots = 0; 3572 boolean_t slot_reserved = B_FALSE; | 3835{ 3836 uint64_t available_slots = 0; 3837 boolean_t slot_reserved = B_FALSE; |
| 3838 uint64_t max = mc->mc_alloc_max_slots[allocator];
3573 3574 ASSERT(mc->mc_alloc_throttle_enabled); 3575 mutex_enter(&mc->mc_lock); 3576 | 3839 3840 ASSERT(mc->mc_alloc_throttle_enabled); 3841 mutex_enter(&mc->mc_lock); 3842 |
3577 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); 3578 if (reserved_slots < mc->mc_alloc_max_slots) 3579 available_slots = mc->mc_alloc_max_slots - reserved_slots; | 3843 uint64_t reserved_slots = 3844 refcount_count(&mc->mc_alloc_slots[allocator]); 3845 if (reserved_slots < max) 3846 available_slots = max - reserved_slots; |
3580 3581 if (slots <= available_slots || GANG_ALLOCATION(flags)) { 3582 /* 3583 * We reserve the slots individually so that we can unreserve 3584 * them individually when an I/O completes. 3585 */ 3586 for (int d = 0; d < slots; d++) { | 3847 3848 if (slots <= available_slots || GANG_ALLOCATION(flags)) { 3849 /* 3850 * We reserve the slots individually so that we can unreserve 3851 * them individually when an I/O completes. 3852 */ 3853 for (int d = 0; d < slots; d++) { |
3587 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); | 3854 reserved_slots = 3855 refcount_add(&mc->mc_alloc_slots[allocator], 3856 zio); |
3588 } 3589 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 3590 slot_reserved = B_TRUE; 3591 } 3592 3593 mutex_exit(&mc->mc_lock); 3594 return (slot_reserved); 3595} 3596 3597void | 3857 } 3858 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 3859 slot_reserved = B_TRUE; 3860 } 3861 3862 mutex_exit(&mc->mc_lock); 3863 return (slot_reserved); 3864} 3865 3866void |
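[Editor's note] A sketch of how a caller pairs the reserve/unreserve functions above, one slot per DVA, all charged against the zio's own allocator index. The zio->io_allocator field and do_allocation() stand-in are assumptions; in the real code the zio pipeline plays these roles.

    static int
    throttled_alloc_sketch(metaslab_class_t *mc, zio_t *zio, int ndvas, int flags)
    {
    	if (!metaslab_class_throttle_reserve(mc, ndvas, zio->io_allocator,
    	    zio, flags))
    		return (SET_ERROR(EAGAIN));	/* throttled: requeue the zio */

    	int error = do_allocation(zio);		/* hypothetical stand-in */
    	if (error != 0) {
    		/* On failure, release the slots immediately ... */
    		metaslab_class_throttle_unreserve(mc, ndvas,
    		    zio->io_allocator, zio);
    	}
    	/* ... on success they are released when the I/O completes. */
    	return (error);
    }
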
3598metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) | 3867metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, 3868 int allocator, zio_t *zio) |
3599{ 3600 ASSERT(mc->mc_alloc_throttle_enabled); 3601 mutex_enter(&mc->mc_lock); 3602 for (int d = 0; d < slots; d++) { | 3869{ 3870 ASSERT(mc->mc_alloc_throttle_enabled); 3871 mutex_enter(&mc->mc_lock); 3872 for (int d = 0; d < slots; d++) { |
3603 (void) refcount_remove(&mc->mc_alloc_slots, zio); | 3873 (void) refcount_remove(&mc->mc_alloc_slots[allocator], 3874 zio); |
3604 } 3605 mutex_exit(&mc->mc_lock); 3606} 3607 3608static int 3609metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 3610 uint64_t txg) 3611{ --- 5 unchanged lines hidden (view full) --- 3617 return (ENXIO); 3618 3619 ASSERT3P(vd->vdev_ms, !=, NULL); 3620 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3621 3622 mutex_enter(&msp->ms_lock); 3623 3624 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) | 3875 } 3876 mutex_exit(&mc->mc_lock); 3877} 3878 3879static int 3880metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 3881 uint64_t txg) 3882{ --- 5 unchanged lines hidden (view full) --- 3888 return (ENXIO); 3889 3890 ASSERT3P(vd->vdev_ms, !=, NULL); 3891 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3892 3893 mutex_enter(&msp->ms_lock); 3894 3895 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) |
3625 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); | 3896 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); 3897 /* 3898 * No need to fail in that case; someone else has activated the 3899 * metaslab, but that doesn't preclude us from using it. 3900 */ 3901 if (error == EBUSY) 3902 error = 0; |
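[Editor's note] The new right-hand side above changes the claim path: activation now uses METASLAB_WEIGHT_CLAIM with allocator 0, and an EBUSY result is downgraded to success because a claim only needs the metaslab loaded, not exclusively owned. A hedged wrapper restating that contract (hypothetical helper name; logic copied from the lines above):

    static int
    metaslab_activate_for_claim(metaslab_t *msp)
    {
    	int error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);

    	if (error == EBUSY)
    		error = 0;	/* already active elsewhere; still usable */
    	return (error);
    }
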
3626 3627 if (error == 0 && 3628 !range_tree_contains(msp->ms_allocatable, offset, size)) 3629 error = SET_ERROR(ENOENT); 3630 3631 if (error || txg == 0) { /* txg == 0 indicates dry run */ 3632 mutex_exit(&msp->ms_lock); 3633 return (error); --- 88 unchanged lines hidden (view full) --- 3722 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3723 3724 return (metaslab_claim_impl(vd, offset, size, txg)); 3725} 3726 3727int 3728metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 3729 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, | 3903 3904 if (error == 0 && 3905 !range_tree_contains(msp->ms_allocatable, offset, size)) 3906 error = SET_ERROR(ENOENT); 3907 3908 if (error || txg == 0) { /* txg == 0 indicates dry run */ 3909 mutex_exit(&msp->ms_lock); 3910 return (error); --- 88 unchanged lines hidden (view full) --- 3999 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 4000 4001 return (metaslab_claim_impl(vd, offset, size, txg)); 4002} 4003 4004int 4005metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 4006 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, |
3730 zio_alloc_list_t *zal, zio_t *zio) | 4007 zio_alloc_list_t *zal, zio_t *zio, int allocator) |
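[Editor's note] metaslab_alloc() now takes an allocator index, so every caller must pick one. A sketch of how a caller might do so; the cityhash4() expression and spa_alloc_count below are our assumptions about this change's scheme (hashing the block's identity so related writes land on the same allocator), not a quote from the hidden code.

    static int
    pick_allocator_sketch(spa_t *spa, const zbookmark_phys_t *zb)
    {
    	return ((int)(cityhash4(zb->zb_objset, zb->zb_object,
    	    zb->zb_level, zb->zb_blkid) % spa->spa_alloc_count));
    }
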
3731{ 3732 dva_t *dva = bp->blk_dva; 3733 dva_t *hintdva = hintbp->blk_dva; 3734 int error = 0; 3735 3736 ASSERT(bp->blk_birth == 0); 3737 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 3738 --- 6 unchanged lines hidden (view full) --- 3745 3746 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 3747 ASSERT(BP_GET_NDVAS(bp) == 0); 3748 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 3749 ASSERT3P(zal, !=, NULL); 3750 3751 for (int d = 0; d < ndvas; d++) { 3752 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, | 4008{ 4009 dva_t *dva = bp->blk_dva; 4010 dva_t *hintdva = hintbp->blk_dva; 4011 int error = 0; 4012 4013 ASSERT(bp->blk_birth == 0); 4014 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 4015 --- 6 unchanged lines hidden (view full) --- 4022 4023 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 4024 ASSERT(BP_GET_NDVAS(bp) == 0); 4025 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 4026 ASSERT3P(zal, !=, NULL); 4027 4028 for (int d = 0; d < ndvas; d++) { 4029 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, |
3753 txg, flags, zal); | 4030 txg, flags, zal, allocator); |
3754 if (error != 0) { 3755 for (d--; d >= 0; d--) { 3756 metaslab_unalloc_dva(spa, &dva[d], txg); 3757 metaslab_group_alloc_decrement(spa, | 4031 if (error != 0) { 4032 for (d--; d >= 0; d--) { 4033 metaslab_unalloc_dva(spa, &dva[d], txg); 4034 metaslab_group_alloc_decrement(spa, |
3758 DVA_GET_VDEV(&dva[d]), zio, flags); | 4035 DVA_GET_VDEV(&dva[d]), zio, flags, 4036 allocator, B_FALSE); |
3759 bzero(&dva[d], sizeof (dva_t)); 3760 } 3761 spa_config_exit(spa, SCL_ALLOC, FTAG); 3762 return (error); 3763 } else { 3764 /* 3765 * Update the metaslab group's queue depth 3766 * based on the newly allocated dva. 3767 */ 3768 metaslab_group_alloc_increment(spa, | 4037 bzero(&dva[d], sizeof (dva_t)); 4038 } 4039 spa_config_exit(spa, SCL_ALLOC, FTAG); 4040 return (error); 4041 } else { 4042 /* 4043 * Update the metaslab group's queue depth 4044 * based on the newly allocated dva. 4045 */ 4046 metaslab_group_alloc_increment(spa, |
3769 DVA_GET_VDEV(&dva[d]), zio, flags); | 4047 DVA_GET_VDEV(&dva[d]), zio, flags, allocator); |
3770 } 3771 3772 } 3773 ASSERT(error == 0); 3774 ASSERT(BP_GET_NDVAS(bp) == ndvas); 3775 3776 spa_config_exit(spa, SCL_ALLOC, FTAG); 3777 --- 152 unchanged lines hidden --- | 4048 } 4049 4050 } 4051 ASSERT(error == 0); 4052 ASSERT(BP_GET_NDVAS(bp) == ndvas); 4053 4054 spa_config_exit(spa, SCL_ALLOC, FTAG); 4055 --- 152 unchanged lines hidden --- |
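[Editor's note] The error path in the loop above rolls back every DVA placed so far, so the caller never sees a partially filled block pointer. Condensed as a hedged per-DVA helper (hypothetical name; the meaning of the new trailing B_FALSE — rolled back rather than completed — is our reading of the call sites):

    static void
    metaslab_undo_dva_sketch(spa_t *spa, dva_t *dva, uint64_t txg, zio_t *zio,
        int flags, int allocator)
    {
    	metaslab_unalloc_dva(spa, dva, txg);
    	/* B_FALSE: this allocation is being rolled back, not completed. */
    	metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(dva), zio,
    	    flags, allocator, B_FALSE);
    	bzero(dva, sizeof (dva_t));
    }
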