metaslab.c (307267) vs. metaslab.c (307277)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 24 unchanged lines hidden ---

33#include <sys/vdev_impl.h>
34#include <sys/zio.h>
35#include <sys/spa_impl.h>
36#include <sys/zfeature.h>
37
38SYSCTL_DECL(_vfs_zfs);
39SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
40
41/*
42 * Allow allocations to switch to gang blocks quickly. We do this to
43 * avoid having to load lots of space_maps in a given txg. There are,
44 * however, some cases where we want to avoid "fast" ganging and instead
45 * we want to do an exhaustive search of all metaslabs on this device.
46 * Currently we don't allow any gang, slog, or dump device related allocations
47 * to "fast" gang.
48 */
49#define CAN_FASTGANG(flags) \
50 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
51 METASLAB_GANG_AVOID)))
41#define GANG_ALLOCATION(flags) \
42 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
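A brief illustrative note (not part of this change): the removed CAN_FASTGANG() asked whether an allocation was allowed to give up quickly and fall back to gang blocks, while GANG_ALLOCATION() only asks whether the allocation at hand is itself for a gang header or gang child. A minimal sketch of how the callers further below use it:

	/* Sketch only: both macros reduce to a simple flag test. */
	if (!GANG_ALLOCATION(flags)) {
		/* normal allocation: group-skipping heuristics may apply */
	}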
52
53#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
54#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
55#define METASLAB_ACTIVE_MASK \
56 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
57
58uint64_t metaslab_aliquot = 512ULL << 10;
59uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */

--- 191 unchanged lines hidden ---

251{
252 metaslab_class_t *mc;
253
254 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
255
256 mc->mc_spa = spa;
257 mc->mc_rotor = NULL;
258 mc->mc_ops = ops;
43
44#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
45#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
46#define METASLAB_ACTIVE_MASK \
47 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
48
49uint64_t metaslab_aliquot = 512ULL << 10;
50uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */

--- 191 unchanged lines hidden ---

242{
243 metaslab_class_t *mc;
244
245 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
246
247 mc->mc_spa = spa;
248 mc->mc_rotor = NULL;
249 mc->mc_ops = ops;
250 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
251 refcount_create_tracked(&mc->mc_alloc_slots);
259
260 return (mc);
261}
262
263void
264metaslab_class_destroy(metaslab_class_t *mc)
265{
266 ASSERT(mc->mc_rotor == NULL);
267 ASSERT(mc->mc_alloc == 0);
268 ASSERT(mc->mc_deferred == 0);
269 ASSERT(mc->mc_space == 0);
270 ASSERT(mc->mc_dspace == 0);
271
252
253 return (mc);
254}
255
256void
257metaslab_class_destroy(metaslab_class_t *mc)
258{
259 ASSERT(mc->mc_rotor == NULL);
260 ASSERT(mc->mc_alloc == 0);
261 ASSERT(mc->mc_deferred == 0);
262 ASSERT(mc->mc_space == 0);
263 ASSERT(mc->mc_dspace == 0);
264
265 refcount_destroy(&mc->mc_alloc_slots);
266 mutex_destroy(&mc->mc_lock);
272 kmem_free(mc, sizeof (metaslab_class_t));
273}
274
275int
276metaslab_class_validate(metaslab_class_t *mc)
277{
278 metaslab_group_t *mg;
279 vdev_t *vd;

--- 227 unchanged lines hidden ---

507 ASSERT3P(m1, ==, m2);
508
509 return (0);
510}
511
512/*
513 * Update the allocatable flag and the metaslab group's capacity.
514 * The allocatable flag is set to true if the capacity is below
267 kmem_free(mc, sizeof (metaslab_class_t));
268}
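The class now carries its own lock and a tracked reference count for allocation-throttle slots. A minimal sketch of the tagged-holder pattern the refcount_*() routines provide (illustrative standalone use; the choice of curthread as the tag is an assumption of the sketch):

	refcount_t rc;
	void *tag = curthread;			/* any stable pointer serves as a tag */

	refcount_create_tracked(&rc);		/* remembers every holder for debugging */
	(void) refcount_add(&rc, tag);		/* take one reference on behalf of tag */
	ASSERT3U(refcount_count(&rc), ==, 1);
	(void) refcount_remove(&rc, tag);	/* must be released with the same tag */
	refcount_destroy(&rc);			/* asserts the count is back to zero */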
269
270int
271metaslab_class_validate(metaslab_class_t *mc)
272{
273 metaslab_group_t *mg;
274 vdev_t *vd;

--- 227 unchanged lines hidden ---

502 ASSERT3P(m1, ==, m2);
503
504 return (0);
505}
506
507/*
508 * Update the allocatable flag and the metaslab group's capacity.
509 * The allocatable flag is set to true if the capacity is below
515 * the zfs_mg_noalloc_threshold. If a metaslab group transitions
516 * from allocatable to non-allocatable or vice versa then the metaslab
517 * group's class is updated to reflect the transition.
510 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
511 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
512 * transitions from allocatable to non-allocatable or vice versa then the
513 * metaslab group's class is updated to reflect the transition.
518 */
519static void
520metaslab_group_alloc_update(metaslab_group_t *mg)
521{
522 vdev_t *vd = mg->mg_vd;
523 metaslab_class_t *mc = mg->mg_class;
524 vdev_stat_t *vs = &vd->vdev_stat;
525 boolean_t was_allocatable;
514 */
515static void
516metaslab_group_alloc_update(metaslab_group_t *mg)
517{
518 vdev_t *vd = mg->mg_vd;
519 metaslab_class_t *mc = mg->mg_class;
520 vdev_stat_t *vs = &vd->vdev_stat;
521 boolean_t was_allocatable;
522 boolean_t was_initialized;
526
527 ASSERT(vd == vd->vdev_top);
528
529 mutex_enter(&mg->mg_lock);
530 was_allocatable = mg->mg_allocatable;
523
524 ASSERT(vd == vd->vdev_top);
525
526 mutex_enter(&mg->mg_lock);
527 was_allocatable = mg->mg_allocatable;
528 was_initialized = mg->mg_initialized;
531
532 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
533 (vs->vs_space + 1);
534
529
530 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
531 (vs->vs_space + 1);
532
533 mutex_enter(&mc->mc_lock);
534
535 /*
535 /*
536 * If the metaslab group was just added then it won't
537 * have any space until we finish syncing out this txg.
538 * At that point we will consider it initialized and available
539 * for allocations. We also don't consider non-activated
540 * metaslab groups (e.g. vdevs that are in the middle of being removed)
541 * to be initialized, because they can't be used for allocation.
542 */
543 mg->mg_initialized = metaslab_group_initialized(mg);
544 if (!was_initialized && mg->mg_initialized) {
545 mc->mc_groups++;
546 } else if (was_initialized && !mg->mg_initialized) {
547 ASSERT3U(mc->mc_groups, >, 0);
548 mc->mc_groups--;
549 }
550 if (mg->mg_initialized)
551 mg->mg_no_free_space = B_FALSE;
552
553 /*
536 * A metaslab group is considered allocatable if it has plenty
537 * of free space or is not heavily fragmented. We only take
538 * fragmentation into account if the metaslab group has a valid
539 * fragmentation metric (i.e. a value between 0 and 100).
540 */
554 * A metaslab group is considered allocatable if it has plenty
555 * of free space or is not heavily fragmented. We only take
556 * fragmentation into account if the metaslab group has a valid
557 * fragmentation metric (i.e. a value between 0 and 100).
558 */
541 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
559 mg->mg_allocatable = (mg->mg_activation_count > 0 &&
560 mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
542 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
543 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
544
545 /*
546 * The mc_alloc_groups maintains a count of the number of
547 * groups in this metaslab class that are still above the
548 * zfs_mg_noalloc_threshold. This is used by the allocating
549 * threads to determine if they should avoid allocations to

--- 6 unchanged lines hidden ---

556 * groups have reached the zfs_mg_noalloc_threshold making all groups
557 * eligible for allocations. This effectively means that all devices
558 * are balanced again.
559 */
560 if (was_allocatable && !mg->mg_allocatable)
561 mc->mc_alloc_groups--;
562 else if (!was_allocatable && mg->mg_allocatable)
563 mc->mc_alloc_groups++;
561 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
562 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
563
564 /*
565 * The mc_alloc_groups maintains a count of the number of
566 * groups in this metaslab class that are still above the
567 * zfs_mg_noalloc_threshold. This is used by the allocating
568 * threads to determine if they should avoid allocations to

--- 6 unchanged lines hidden ---

575 * groups have reached the zfs_mg_noalloc_threshold making all groups
576 * eligible for allocations. This effectively means that all devices
577 * are balanced again.
578 */
579 if (was_allocatable && !mg->mg_allocatable)
580 mc->mc_alloc_groups--;
581 else if (!was_allocatable && mg->mg_allocatable)
582 mc->mc_alloc_groups++;
583 mutex_exit(&mc->mc_lock);
564
565 mutex_exit(&mg->mg_lock);
566}
567
568metaslab_group_t *
569metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
570{
571 metaslab_group_t *mg;
572
573 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
574 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
575 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
576 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
577 mg->mg_vd = vd;
578 mg->mg_class = mc;
579 mg->mg_activation_count = 0;
584
585 mutex_exit(&mg->mg_lock);
586}
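A small worked example (hypothetical numbers, and assuming zfs_mg_noalloc_threshold were tuned to 10) of the capacity and bookkeeping logic above:

	/*
	 * vs_space = 1000, vs_alloc = 940:
	 *	mg_free_capacity = ((1000 - 940) * 100) / (1000 + 1) = 5
	 * With zfs_mg_noalloc_threshold = 10 the capacity test fails, so
	 * mg_allocatable becomes B_FALSE (even for an activated group) and
	 * mc_alloc_groups is decremented.  mc_groups, by contrast, only counts
	 * groups that are initialized at all (synced space and activated),
	 * regardless of whether they are currently good allocation targets.
	 */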
587
588metaslab_group_t *
589metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
590{
591 metaslab_group_t *mg;
592
593 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
594 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
595 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
596 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
597 mg->mg_vd = vd;
598 mg->mg_class = mc;
599 mg->mg_activation_count = 0;
600 mg->mg_initialized = B_FALSE;
601 mg->mg_no_free_space = B_TRUE;
602 refcount_create_tracked(&mg->mg_alloc_queue_depth);
580
581 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
582 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
583
584 return (mg);
585}
586
587void

--- 6 unchanged lines hidden ---

594 * either because we never activated in the first place or
595 * because we're done, and possibly removing the vdev.
596 */
597 ASSERT(mg->mg_activation_count <= 0);
598
599 taskq_destroy(mg->mg_taskq);
600 avl_destroy(&mg->mg_metaslab_tree);
601 mutex_destroy(&mg->mg_lock);
603
604 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
605 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
606
607 return (mg);
608}
609
610void

--- 6 unchanged lines hidden ---

617 * either because we never activated in the first place or
618 * because we're done, and possibly removing the vdev.
619 */
620 ASSERT(mg->mg_activation_count <= 0);
621
622 taskq_destroy(mg->mg_taskq);
623 avl_destroy(&mg->mg_metaslab_tree);
624 mutex_destroy(&mg->mg_lock);
625 refcount_destroy(&mg->mg_alloc_queue_depth);
602 kmem_free(mg, sizeof (metaslab_group_t));
603}
604
605void
606metaslab_group_activate(metaslab_group_t *mg)
607{
608 metaslab_class_t *mc = mg->mg_class;
609 metaslab_group_t *mgprev, *mgnext;

--- 55 unchanged lines hidden ---

665 mgnext->mg_prev = mgprev;
666 }
667
668 mg->mg_prev = NULL;
669 mg->mg_next = NULL;
670 metaslab_class_minblocksize_update(mc);
671}
672
626 kmem_free(mg, sizeof (metaslab_group_t));
627}
628
629void
630metaslab_group_activate(metaslab_group_t *mg)
631{
632 metaslab_class_t *mc = mg->mg_class;
633 metaslab_group_t *mgprev, *mgnext;

--- 55 unchanged lines hidden ---

689 mgnext->mg_prev = mgprev;
690 }
691
692 mg->mg_prev = NULL;
693 mg->mg_next = NULL;
694 metaslab_class_minblocksize_update(mc);
695}
696
697boolean_t
698metaslab_group_initialized(metaslab_group_t *mg)
699{
700 vdev_t *vd = mg->mg_vd;
701 vdev_stat_t *vs = &vd->vdev_stat;
702
703 return (vs->vs_space != 0 && mg->mg_activation_count > 0);
704}
705
673uint64_t
674metaslab_group_get_space(metaslab_group_t *mg)
675{
676 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
677}
678
679void
680metaslab_group_histogram_verify(metaslab_group_t *mg)

--- 153 unchanged lines hidden ---

834 return (fragmentation);
835}
836
837/*
838 * Determine if a given metaslab group should skip allocations. A metaslab
839 * group should avoid allocations if its free capacity is less than the
840 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
841 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
706uint64_t
707metaslab_group_get_space(metaslab_group_t *mg)
708{
709 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
710}
711
712void
713metaslab_group_histogram_verify(metaslab_group_t *mg)

--- 153 unchanged lines hidden ---

867 return (fragmentation);
868}
869
870/*
871 * Determine if a given metaslab group should skip allocations. A metaslab
872 * group should avoid allocations if its free capacity is less than the
873 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
874 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
842 * that can still handle allocations.
875 * that can still handle allocations. If the allocation throttle is enabled
876 * then we skip allocations to devices that have reached their maximum
877 * allocation queue depth unless the selected metaslab group is the only
878 * eligible group remaining.
843 */
844static boolean_t
879 */
880static boolean_t
845metaslab_group_allocatable(metaslab_group_t *mg)
881metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
882 uint64_t psize)
846{
883{
847 vdev_t *vd = mg->mg_vd;
848 spa_t *spa = vd->vdev_spa;
884 spa_t *spa = mg->mg_vd->vdev_spa;
849 metaslab_class_t *mc = mg->mg_class;
850
851 /*
885 metaslab_class_t *mc = mg->mg_class;
886
887 /*
852 * We use two key metrics to determine if a metaslab group is
853 * considered allocatable -- free space and fragmentation. If
854 * the free space is greater than the free space threshold and
855 * the fragmentation is less than the fragmentation threshold then
856 * consider the group allocatable. There are two cases when we will
857 * not consider these key metrics. The first is if the group is
858 * associated with a slog device and the second is if all groups
859 * in this metaslab class have already been considered ineligible
888 * We can only consider skipping this metaslab group if it's
889 * in the normal metaslab class and there are other metaslab
890 * groups to select from. Otherwise, we always consider it eligible
860 * for allocations.
861 */
891 * for allocations.
892 */
862 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
863 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
864 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) ||
865 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
893 if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
894 return (B_TRUE);
895
896 /*
897 * If the metaslab group's mg_allocatable flag is set (see comments
898 * in metaslab_group_alloc_update() for more information) and
899 * the allocation throttle is disabled then allow allocations to this
900 * device. However, if the allocation throttle is enabled then
901 * check if we have reached our allocation limit (mg_alloc_queue_depth)
902 * to determine if we should allow allocations to this metaslab group.
903 * If all metaslab groups are no longer considered allocatable
904 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
905 * gang block size then we allow allocations on this metaslab group
906 * regardless of the mg_allocatable or throttle settings.
907 */
908 if (mg->mg_allocatable) {
909 metaslab_group_t *mgp;
910 int64_t qdepth;
911 uint64_t qmax = mg->mg_max_alloc_queue_depth;
912
913 if (!mc->mc_alloc_throttle_enabled)
914 return (B_TRUE);
915
916 /*
917 * If this metaslab group does not have any free space, then
918 * there is no point in looking further.
919 */
920 if (mg->mg_no_free_space)
921 return (B_FALSE);
922
923 qdepth = refcount_count(&mg->mg_alloc_queue_depth);
924
925 /*
926 * If this metaslab group is below its qmax or it's
927 * the only allocatable metaslab group, then attempt
928 * to allocate from it.
929 */
930 if (qdepth < qmax || mc->mc_alloc_groups == 1)
931 return (B_TRUE);
932 ASSERT3U(mc->mc_alloc_groups, >, 1);
933
934 /*
935 * Since this metaslab group is at or over its qmax, we
936 * need to determine if there are metaslab groups after this
937 * one that might be able to handle this allocation. This is
938 * racy since we can't hold the locks for all metaslab
939 * groups at the same time when we make this check.
940 */
941 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
942 qmax = mgp->mg_max_alloc_queue_depth;
943
944 qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
945
946 /*
947 * If there is another metaslab group that
948 * might be able to handle the allocation, then
949 * we return false so that we skip this group.
950 */
951 if (qdepth < qmax && !mgp->mg_no_free_space)
952 return (B_FALSE);
953 }
954
955 /*
956 * We didn't find another group to handle the allocation
957 * so we can't skip this metaslab group even though
958 * we are at or over our qmax.
959 */
960 return (B_TRUE);
961
962 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
963 return (B_TRUE);
964 }
965 return (B_FALSE);
866}
867
868/*
869 * ==========================================================================
870 * Range tree callbacks
871 * ==========================================================================
872 */
873

--- 1251 unchanged lines hidden ---

2125
2126 if (offset < start)
2127 return ((start - offset) << ms_shift);
2128 if (offset > start)
2129 return ((offset - start) << ms_shift);
2130 return (0);
2131}
2132
966}
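A worked example (hypothetical queue depths, three initialized groups in the normal class, rotor at group A) of the skip decision implemented above:

	/*
	 *	A (current): qdepth = 12, qmax = 10, mg_no_free_space = B_FALSE
	 *	B:           qdepth =  3, qmax = 10, mg_no_free_space = B_FALSE
	 *	C:           qdepth = 10, qmax = 10, mg_no_free_space = B_FALSE
	 *
	 * A is at or over its qmax, but the walk along mg_next finds B with
	 * headroom, so metaslab_group_allocatable(A, rotor, psize) returns
	 * B_FALSE and the caller rotates on to B.  If B and C were also at
	 * their qmax (or had no free space), A would return B_TRUE instead,
	 * since skipping it could only produce a spurious ENOSPC.
	 */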
967
968/*
969 * ==========================================================================
970 * Range tree callbacks
971 * ==========================================================================
972 */
973

--- 1251 unchanged lines hidden ---

2225
2226 if (offset < start)
2227 return ((start - offset) << ms_shift);
2228 if (offset > start)
2229 return ((offset - start) << ms_shift);
2230 return (0);
2231}
2232
2233/*
2234 * ==========================================================================
2235 * Metaslab block operations
2236 * ==========================================================================
2237 */
2238
2239static void
2240metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2241{
2242 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2243 flags & METASLAB_DONT_THROTTLE)
2244 return;
2245
2246 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2247 if (!mg->mg_class->mc_alloc_throttle_enabled)
2248 return;
2249
2250 (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2251}
2252
2253void
2254metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2255{
2256 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2257 flags & METASLAB_DONT_THROTTLE)
2258 return;
2259
2260 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2261 if (!mg->mg_class->mc_alloc_throttle_enabled)
2262 return;
2263
2264 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2265}
2266
2267void
2268metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2269{
2270#ifdef ZFS_DEBUG
2271 const dva_t *dva = bp->blk_dva;
2272 int ndvas = BP_GET_NDVAS(bp);
2273
2274 for (int d = 0; d < ndvas; d++) {
2275 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2276 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2277 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2278 }
2279#endif
2280}
2281
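A minimal sketch (hypothetical completion path, not part of this diff) of how the queue-depth accounting pairs up: the static increment is issued from metaslab_alloc() further below once a DVA lands on a vdev, and whatever completes or abandons that I/O is expected to drop the reference using the same zio tag:

	/* For each DVA the zio allocated, release its queue-depth reference. */
	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
		metaslab_group_alloc_decrement(spa,
		    DVA_GET_VDEV(&bp->blk_dva[d]), zio, METASLAB_ASYNC_ALLOC);
	}

	/* With ZFS_DEBUG, confirm nothing is still charged against this zio. */
	metaslab_group_alloc_verify(spa, bp, zio);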
2133static uint64_t
2282static uint64_t
2134metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
2283metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
2135 uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2136{
2137 spa_t *spa = mg->mg_vd->vdev_spa;
2138 metaslab_t *msp = NULL;
2139 uint64_t offset = -1ULL;
2140 avl_tree_t *t = &mg->mg_metaslab_tree;
2141 uint64_t activation_weight;
2142 uint64_t target_distance;

--- 10 unchanged lines hidden ---

2153 for (;;) {
2154 boolean_t was_active;
2155
2156 mutex_enter(&mg->mg_lock);
2157 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
2158 if (msp->ms_weight < asize) {
2159 spa_dbgmsg(spa, "%s: failed to meet weight "
2160 "requirement: vdev %llu, txg %llu, mg %p, "
2284 uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2285{
2286 spa_t *spa = mg->mg_vd->vdev_spa;
2287 metaslab_t *msp = NULL;
2288 uint64_t offset = -1ULL;
2289 avl_tree_t *t = &mg->mg_metaslab_tree;
2290 uint64_t activation_weight;
2291 uint64_t target_distance;

--- 10 unchanged lines hidden ---

2302 for (;;) {
2303 boolean_t was_active;
2304
2305 mutex_enter(&mg->mg_lock);
2306 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
2307 if (msp->ms_weight < asize) {
2308 spa_dbgmsg(spa, "%s: failed to meet weight "
2309 "requirement: vdev %llu, txg %llu, mg %p, "
2161 "msp %p, psize %llu, asize %llu, "
2310 "msp %p, asize %llu, "
2162 "weight %llu", spa_name(spa),
2163 mg->mg_vd->vdev_id, txg,
2311 "weight %llu", spa_name(spa),
2312 mg->mg_vd->vdev_id, txg,
2164 mg, msp, psize, asize, msp->ms_weight);
2313 mg, msp, asize, msp->ms_weight);
2165 mutex_exit(&mg->mg_lock);
2166 return (-1ULL);
2167 }
2168
2169 /*
2170 * If the selected metaslab is condensing, skip it.
2171 */
2172 if (msp->ms_condensing)

--- 65 unchanged lines hidden ---

2238
2239 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2240 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2241
2242 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
2243 msp->ms_access_txg = txg + metaslab_unload_delay;
2244
2245 mutex_exit(&msp->ms_lock);
2314 mutex_exit(&mg->mg_lock);
2315 return (-1ULL);
2316 }
2317
2318 /*
2319 * If the selected metaslab is condensing, skip it.
2320 */
2321 if (msp->ms_condensing)

--- 65 unchanged lines hidden ---

2387
2388 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2389 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2390
2391 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
2392 msp->ms_access_txg = txg + metaslab_unload_delay;
2393
2394 mutex_exit(&msp->ms_lock);
2246
2247 return (offset);
2248}
2249
2250/*
2251 * Allocate a block for the specified i/o.
2252 */
2253static int
2254metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2255 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
2256{
2257 metaslab_group_t *mg, *rotor;
2258 vdev_t *vd;
2259 int dshift = 3;
2260 int all_zero;
2261 int zio_lock = B_FALSE;
2262 boolean_t allocatable;
2395 return (offset);
2396}
2397
2398/*
2399 * Allocate a block for the specified i/o.
2400 */
2401static int
2402metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2403 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
2404{
2405 metaslab_group_t *mg, *rotor;
2406 vdev_t *vd;
2407 int dshift = 3;
2408 int all_zero;
2409 int zio_lock = B_FALSE;
2410 boolean_t allocatable;
2263 uint64_t offset = -1ULL;
2264 uint64_t asize;
2265 uint64_t distance;
2266
2267 ASSERT(!DVA_IS_VALID(&dva[d]));
2268
2269 /*
2270 * For testing, make some blocks above a certain size be gang blocks.
2271 */

--- 53 unchanged lines hidden ---

2325 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2326 mg = mc->mc_rotor;
2327
2328 rotor = mg;
2329top:
2330 all_zero = B_TRUE;
2331 do {
2332 ASSERT(mg->mg_activation_count == 1);
2411 uint64_t asize;
2412 uint64_t distance;
2413
2414 ASSERT(!DVA_IS_VALID(&dva[d]));
2415
2416 /*
2417 * For testing, make some blocks above a certain size be gang blocks.
2418 */

--- 53 unchanged lines hidden ---

2472 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2473 mg = mc->mc_rotor;
2474
2475 rotor = mg;
2476top:
2477 all_zero = B_TRUE;
2478 do {
2479 ASSERT(mg->mg_activation_count == 1);
2333
2334 vd = mg->mg_vd;
2335
2336 /*
2337 * Don't allocate from faulted devices.
2338 */
2339 if (zio_lock) {
2340 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2341 allocatable = vdev_allocatable(vd);
2342 spa_config_exit(spa, SCL_ZIO, FTAG);
2343 } else {
2344 allocatable = vdev_allocatable(vd);
2345 }
2346
2347 /*
2348 * Determine if the selected metaslab group is eligible
2480 vd = mg->mg_vd;
2481
2482 /*
2483 * Don't allocate from faulted devices.
2484 */
2485 if (zio_lock) {
2486 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2487 allocatable = vdev_allocatable(vd);
2488 spa_config_exit(spa, SCL_ZIO, FTAG);
2489 } else {
2490 allocatable = vdev_allocatable(vd);
2491 }
2492
2493 /*
2494 * Determine if the selected metaslab group is eligible
2349 * for allocations. If we're ganging or have requested
2350 * an allocation for the smallest gang block size
2351 * then we don't want to avoid allocating to this
2352 * metaslab group. If we're in this condition we should
2353 * try to allocate from any device possible so that we
2354 * don't inadvertently return ENOSPC and suspend the pool
2495 * for allocations. If we're ganging then don't allow
2496 * this metaslab group to skip allocations since that would
2497 * inadvertently return ENOSPC and suspend the pool
2355 * even though space is still available.
2356 */
2498 * even though space is still available.
2499 */
2357 if (allocatable && CAN_FASTGANG(flags) &&
2358 psize > SPA_GANGBLOCKSIZE)
2359 allocatable = metaslab_group_allocatable(mg);
2500 if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) {
2501 allocatable = metaslab_group_allocatable(mg, rotor,
2502 psize);
2503 }
2360
2361 if (!allocatable)
2362 goto next;
2363
2504
2505 if (!allocatable)
2506 goto next;
2507
2508 ASSERT(mg->mg_initialized);
2509
2364 /*
2510 /*
2365 * Avoid writing single-copy data to a failing vdev
2366 * unless the user instructs us that it is okay.
2511 * Avoid writing single-copy data to a failing vdev.
2367 */
2368 if ((vd->vdev_stat.vs_write_errors > 0 ||
2369 vd->vdev_state < VDEV_STATE_HEALTHY) &&
2370 d == 0 && dshift == 3 && vd->vdev_children == 0) {
2371 all_zero = B_FALSE;
2372 goto next;
2373 }
2374
2375 ASSERT(mg->mg_class == mc);
2376
2377 distance = vd->vdev_asize >> dshift;
2378 if (distance <= (1ULL << vd->vdev_ms_shift))
2379 distance = 0;
2380 else
2381 all_zero = B_FALSE;
2382
2383 asize = vdev_psize_to_asize(vd, psize);
2384 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
2385
2512 */
2513 if ((vd->vdev_stat.vs_write_errors > 0 ||
2514 vd->vdev_state < VDEV_STATE_HEALTHY) &&
2515 d == 0 && dshift == 3 && vd->vdev_children == 0) {
2516 all_zero = B_FALSE;
2517 goto next;
2518 }
2519
2520 ASSERT(mg->mg_class == mc);
2521
2522 distance = vd->vdev_asize >> dshift;
2523 if (distance <= (1ULL << vd->vdev_ms_shift))
2524 distance = 0;
2525 else
2526 all_zero = B_FALSE;
2527
2528 asize = vdev_psize_to_asize(vd, psize);
2529 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
2530
2386 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
2387 dva, d);
2531 uint64_t offset = metaslab_group_alloc(mg, asize, txg,
2532 distance, dva, d);
2533
2534 mutex_enter(&mg->mg_lock);
2535 if (offset == -1ULL) {
2536 mg->mg_failed_allocations++;
2537 if (asize == SPA_GANGBLOCKSIZE) {
2538 /*
2539 * This metaslab group was unable to allocate
2540 * the minimum gang block size so it must be
2541 * out of space. We must notify the allocation
2542 * throttle to start skipping allocation
2543 * attempts to this metaslab group until more
2544 * space becomes available.
2545 *
2546 * Note: this failure cannot be caused by the
2547 * allocation throttle since the allocation
2548 * throttle is only responsible for skipping
2549 * devices and not failing block allocations.
2550 */
2551 mg->mg_no_free_space = B_TRUE;
2552 }
2553 }
2554 mg->mg_allocations++;
2555 mutex_exit(&mg->mg_lock);
2556
2388 if (offset != -1ULL) {
2389 /*
2390 * If we've just selected this metaslab group,
2391 * figure out whether the corresponding vdev is
2392 * over- or under-used relative to the pool,
2393 * and set an allocation bias to even it out.
2394 */
2395 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {

--- 164 unchanged lines hidden ---

2560 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2561 }
2562
2563 mutex_exit(&msp->ms_lock);
2564
2565 return (0);
2566}
2567
2557 if (offset != -1ULL) {
2558 /*
2559 * If we've just selected this metaslab group,
2560 * figure out whether the corresponding vdev is
2561 * over- or under-used relative to the pool,
2562 * and set an allocation bias to even it out.
2563 */
2564 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {

--- 164 unchanged lines hidden ---

2729 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2730 }
2731
2732 mutex_exit(&msp->ms_lock);
2733
2734 return (0);
2735}
2736
2737/*
2738 * Reserve some allocation slots. The reservation system must be called
2739 * before we call into the allocator. If there aren't any available slots
2740 * then the I/O will be throttled until an I/O completes and its slots are
2741 * freed up. The function returns true if it was successful in placing
2742 * the reservation.
2743 */
2744boolean_t
2745metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
2746 int flags)
2747{
2748 uint64_t available_slots = 0;
2749 boolean_t slot_reserved = B_FALSE;
2750
2751 ASSERT(mc->mc_alloc_throttle_enabled);
2752 mutex_enter(&mc->mc_lock);
2753
2754 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
2755 if (reserved_slots < mc->mc_alloc_max_slots)
2756 available_slots = mc->mc_alloc_max_slots - reserved_slots;
2757
2758 if (slots <= available_slots || GANG_ALLOCATION(flags)) {
2759 /*
2760 * We reserve the slots individually so that we can unreserve
2761 * them individually when an I/O completes.
2762 */
2763 for (int d = 0; d < slots; d++) {
2764 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
2765 }
2766 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
2767 slot_reserved = B_TRUE;
2768 }
2769
2770 mutex_exit(&mc->mc_lock);
2771 return (slot_reserved);
2772}
2773
2774void
2775metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
2776{
2777 ASSERT(mc->mc_alloc_throttle_enabled);
2778 mutex_enter(&mc->mc_lock);
2779 for (int d = 0; d < slots; d++) {
2780 (void) refcount_remove(&mc->mc_alloc_slots, zio);
2781 }
2782 mutex_exit(&mc->mc_lock);
2783}
2784
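A minimal sketch (simplified, hypothetical caller modeled loosely on an allocating write) of the intended reserve/allocate/unreserve sequence; using zp_copies as the slot count and the zio as the reservation tag are assumptions of the sketch:

	int slots = zio->io_prop.zp_copies;	/* one slot per DVA to allocate */

	if (metaslab_class_throttle_reserve(mc, slots, zio, 0)) {
		error = metaslab_alloc(spa, mc, zio->io_size, bp, slots,
		    txg, NULL, METASLAB_ASYNC_ALLOC, zio);
		/* ... issue the write; once it completes (or on failure): */
		metaslab_class_throttle_unreserve(mc, slots, zio);
	} else {
		/*
		 * No slots available: the zio must wait until another
		 * allocating I/O completes and releases its slots.
		 */
	}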
2568int
2569metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2785int
2786metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2570 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2787 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio)
2571{
2572 dva_t *dva = bp->blk_dva;
2573 dva_t *hintdva = hintbp->blk_dva;
2574 int error = 0;
2575
2576 ASSERT(bp->blk_birth == 0);
2577 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2578

--- 9 unchanged lines hidden ---

2588 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2589
2590 for (int d = 0; d < ndvas; d++) {
2591 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2592 txg, flags);
2593 if (error != 0) {
2594 for (d--; d >= 0; d--) {
2595 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2788{
2789 dva_t *dva = bp->blk_dva;
2790 dva_t *hintdva = hintbp->blk_dva;
2791 int error = 0;
2792
2793 ASSERT(bp->blk_birth == 0);
2794 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2795

--- 9 unchanged lines hidden ---

2805 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2806
2807 for (int d = 0; d < ndvas; d++) {
2808 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2809 txg, flags);
2810 if (error != 0) {
2811 for (d--; d >= 0; d--) {
2812 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2813 metaslab_group_alloc_decrement(spa,
2814 DVA_GET_VDEV(&dva[d]), zio, flags);
2596 bzero(&dva[d], sizeof (dva_t));
2597 }
2598 spa_config_exit(spa, SCL_ALLOC, FTAG);
2599 return (error);
2815 bzero(&dva[d], sizeof (dva_t));
2816 }
2817 spa_config_exit(spa, SCL_ALLOC, FTAG);
2818 return (error);
2819 } else {
2820 /*
2821 * Update the metaslab group's queue depth
2822 * based on the newly allocated dva.
2823 */
2824 metaslab_group_alloc_increment(spa,
2825 DVA_GET_VDEV(&dva[d]), zio, flags);
2600 }
2826 }
2827
2601 }
2602 ASSERT(error == 0);
2603 ASSERT(BP_GET_NDVAS(bp) == ndvas);
2604
2605 spa_config_exit(spa, SCL_ALLOC, FTAG);
2606
2607 BP_SET_BIRTH(bp, txg, txg);
2608

--- 75 unchanged lines hidden ---
2828 }
2829 ASSERT(error == 0);
2830 ASSERT(BP_GET_NDVAS(bp) == ndvas);
2831
2832 spa_config_exit(spa, SCL_ALLOC, FTAG);
2833
2834 BP_SET_BIRTH(bp, txg, txg);
2835

--- 75 unchanged lines hidden ---