metaslab.c (268855) metaslab.c (269118)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 18 unchanged lines hidden (view full) ---

27#include <sys/zfs_context.h>
28#include <sys/dmu.h>
29#include <sys/dmu_tx.h>
30#include <sys/space_map.h>
31#include <sys/metaslab_impl.h>
32#include <sys/vdev_impl.h>
33#include <sys/zio.h>
34#include <sys/spa_impl.h>
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 18 unchanged lines hidden (view full) ---

27#include <sys/zfs_context.h>
28#include <sys/dmu.h>
29#include <sys/dmu_tx.h>
30#include <sys/space_map.h>
31#include <sys/metaslab_impl.h>
32#include <sys/vdev_impl.h>
33#include <sys/zio.h>
34#include <sys/spa_impl.h>
35#include <sys/zfeature.h>
35
36SYSCTL_DECL(_vfs_zfs);
37SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
38
39/*
40 * Allow allocations to switch to gang blocks quickly. We do this to
41 * avoid having to load lots of space_maps in a given txg. There are,
42 * however, some cases where we want to avoid "fast" ganging and instead

--- 41 unchanged lines hidden (view full) ---

84 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
85 * blocks.
86 */
87int zfs_metaslab_condense_block_threshold = 4;
88
89/*
90 * The zfs_mg_noalloc_threshold defines which metaslab groups should
91 * be eligible for allocation. The value is defined as a percentage of
36
37SYSCTL_DECL(_vfs_zfs);
38SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
39
40/*
41 * Allow allocations to switch to gang blocks quickly. We do this to
42 * avoid having to load lots of space_maps in a given txg. There are,
43 * however, some cases where we want to avoid "fast" ganging and instead

--- 41 unchanged lines hidden (view full) ---

85 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
86 * blocks.
87 */
88int zfs_metaslab_condense_block_threshold = 4;
89
90/*
91 * The zfs_mg_noalloc_threshold defines which metaslab groups should
92 * be eligible for allocation. The value is defined as a percentage of
92 * a free space. Metaslab groups that have more free space than
93 * free space. Metaslab groups that have more free space than
93 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
94 * a metaslab group's free space is less than or equal to the
95 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
96 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
97 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
98 * groups are allowed to accept allocations. Gang blocks are always
99 * eligible to allocate on any metaslab group. The default value of 0 means
100 * no metaslab group will be excluded based on this criterion.
101 */
102int zfs_mg_noalloc_threshold = 0;
103SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
104 &zfs_mg_noalloc_threshold, 0,
105 "Percentage of metaslab group size that should be free"
106 " to make it eligible for allocation");
107
108/*
94 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
95 * a metaslab group's free space is less than or equal to the
96 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
97 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
98 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
99 * groups are allowed to accept allocations. Gang blocks are always
100 * eligible to allocate on any metaslab group. The default value of 0 means
101 * no metaslab group will be excluded based on this criterion.
102 */
103int zfs_mg_noalloc_threshold = 0;
104SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
105 &zfs_mg_noalloc_threshold, 0,
106 "Percentage of metaslab group size that should be free"
107 " to make it eligible for allocation");
108
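To make the rule above concrete, here is a minimal sketch using a hypothetical helper (not part of metaslab.c); it reuses the free-capacity calculation that metaslab_group_alloc_update() performs below: a group at or below zfs_mg_noalloc_threshold is skipped only while at least one group in its class is still above the threshold.

static boolean_t
mg_noalloc_rule(uint64_t vs_space, uint64_t vs_alloc,
    int groups_above_threshold)
{
	/* Free capacity as a percentage of the group's total space. */
	uint64_t free_capacity = ((vs_space - vs_alloc) * 100) /
	    (vs_space + 1);

	if (free_capacity > zfs_mg_noalloc_threshold)
		return (B_TRUE);

	/*
	 * Once every group in the class has dropped to or below the
	 * threshold, all groups become eligible again.
	 */
	return (groups_above_threshold == 0);
}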
109/*
110 * Metaslab groups are considered eligible for allocations if their
111 * fragmentation metric (measured as a percentage) is less than or equal to
112 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
113 * then it will be skipped unless all metaslab groups within the metaslab
114 * class have also crossed this threshold.
115 */
116int zfs_mg_fragmentation_threshold = 85;
117
118/*
119 * Allow metaslabs to keep their active state as long as their fragmentation
120 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
121 * active metaslab that exceeds this threshold will no longer keep its active
122 * status allowing better metaslabs to be selected.
123 */
124int zfs_metaslab_fragmentation_threshold = 70;
125
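The two fragmentation thresholds above are consulted in different places. The following hedged sketch (hypothetical helpers that mirror the checks appearing later in metaslab_group_alloc_update()/metaslab_group_allocatable() and metaslab_weight()) shows what each one gates:

/*
 * Group level: skip a heavily fragmented group, but never penalize
 * one that has no valid metric (ZFS_FRAG_INVALID).
 */
static boolean_t
mg_frag_ok(uint64_t mg_fragmentation)
{
	return (mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg_fragmentation <= zfs_mg_fragmentation_threshold);
}

/*
 * Metaslab level: keep the active weight bit only when the metric is
 * known and at or below the threshold.
 */
static boolean_t
ms_may_stay_active(uint64_t ms_fragmentation)
{
	return (ms_fragmentation != ZFS_FRAG_INVALID &&
	    ms_fragmentation <= zfs_metaslab_fragmentation_threshold);
}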
126/*
109 * When set will load all metaslabs when pool is first opened.
110 */
111int metaslab_debug_load = 0;
112SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
113 &metaslab_debug_load, 0,
114 "Load all metaslabs when pool is first opened");
115
116/*

--- 51 unchanged lines hidden (view full) ---

168 * keep it loaded.
169 */
170int metaslab_unload_delay = TXG_SIZE * 2;
171SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
172 &metaslab_unload_delay, 0,
173 "Number of TXGs that an unused metaslab can be kept in memory");
174
175/*
127 * When set will load all metaslabs when pool is first opened.
128 */
129int metaslab_debug_load = 0;
130SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
131 &metaslab_debug_load, 0,
132 "Load all metaslabs when pool is first opened");
133
134/*

--- 51 unchanged lines hidden (view full) ---

186 * keep it loaded.
187 */
188int metaslab_unload_delay = TXG_SIZE * 2;
189SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
190 &metaslab_unload_delay, 0,
191 "Number of TXGs that an unused metaslab can be kept in memory");
192
193/*
176 * Should we be willing to write data to degraded vdevs?
177 */
178boolean_t zfs_write_to_degraded = B_FALSE;
179SYSCTL_INT(_vfs_zfs, OID_AUTO, write_to_degraded, CTLFLAG_RWTUN,
180 &zfs_write_to_degraded, 0, "Allow writing data to degraded vdevs");
181
182/*
183 * Max number of metaslabs per group to preload.
184 */
185int metaslab_preload_limit = SPA_DVAS_PER_BP;
186SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
187 &metaslab_preload_limit, 0,
188 "Max number of metaslabs per group to preload");
189
190/*
191 * Enable/disable preloading of metaslab.
192 */
193boolean_t metaslab_preload_enabled = B_TRUE;
194SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
195 &metaslab_preload_enabled, 0,
196 "Max number of metaslabs per group to preload");
197
198/*
194 * Max number of metaslabs per group to preload.
195 */
196int metaslab_preload_limit = SPA_DVAS_PER_BP;
197SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
198 &metaslab_preload_limit, 0,
199 "Max number of metaslabs per group to preload");
200
201/*
202 * Enable/disable preloading of metaslab.
203 */
204boolean_t metaslab_preload_enabled = B_TRUE;
205SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
206 &metaslab_preload_enabled, 0,
207 "Max number of metaslabs per group to preload");
208
209/*
199 * Enable/disable additional weight factor for each metaslab.
210 * Enable/disable fragmentation weighting on metaslabs.
200 */
211 */
201boolean_t metaslab_weight_factor_enable = B_FALSE;
202SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, weight_factor_enable, CTLFLAG_RWTUN,
203 &metaslab_weight_factor_enable, 0,
204 "Enable additional weight factor for each metaslab");
212boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
213SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
214 &metaslab_fragmentation_factor_enabled, 0,
215 "Enable fragmentation weighting on metaslabs");
205
216
217/*
218 * Enable/disable lba weighting (i.e. outer tracks are given preference).
219 */
220boolean_t metaslab_lba_weighting_enabled = B_TRUE;
221SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
222 &metaslab_lba_weighting_enabled, 0,
223 "Enable LBA weighting (i.e. outer tracks are given preference)");
206
207/*
224
225/*
226 * Enable/disable metaslab group biasing.
227 */
228boolean_t metaslab_bias_enabled = B_TRUE;
229SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
230 &metaslab_bias_enabled, 0,
231 "Enable metaslab group biasing");
232
233static uint64_t metaslab_fragmentation(metaslab_t *);
234
235/*
208 * ==========================================================================
209 * Metaslab classes
210 * ==========================================================================
211 */
212metaslab_class_t *
213metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
214{
215 metaslab_class_t *mc;

--- 101 unchanged lines hidden (view full) ---

317}
318
319uint64_t
320metaslab_class_get_minblocksize(metaslab_class_t *mc)
321{
322 return (mc->mc_minblocksize);
323}
324
236 * ==========================================================================
237 * Metaslab classes
238 * ==========================================================================
239 */
240metaslab_class_t *
241metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
242{
243 metaslab_class_t *mc;

--- 101 unchanged lines hidden (view full) ---

345}
346
347uint64_t
348metaslab_class_get_minblocksize(metaslab_class_t *mc)
349{
350 return (mc->mc_minblocksize);
351}
352
353void
354metaslab_class_histogram_verify(metaslab_class_t *mc)
355{
356 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
357 uint64_t *mc_hist;
358 int i;
359
360 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
361 return;
362
363 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
364 KM_SLEEP);
365
366 for (int c = 0; c < rvd->vdev_children; c++) {
367 vdev_t *tvd = rvd->vdev_child[c];
368 metaslab_group_t *mg = tvd->vdev_mg;
369
370 /*
371 * Skip any holes, uninitialized top-levels, or
372 * vdevs that are not in this metaslab class.
373 */
374 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
375 mg->mg_class != mc) {
376 continue;
377 }
378
379 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
380 mc_hist[i] += mg->mg_histogram[i];
381 }
382
383 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
384 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
385
386 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
387}
388
325/*
389/*
390 * Calculate the metaslab class's fragmentation metric. The metric
391 * is weighted based on the space contribution of each metaslab group.
392 * The return value will be a number between 0 and 100 (inclusive), or
393 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
394 * zfs_frag_table for more information about the metric.
395 */
396uint64_t
397metaslab_class_fragmentation(metaslab_class_t *mc)
398{
399 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
400 uint64_t fragmentation = 0;
401
402 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
403
404 for (int c = 0; c < rvd->vdev_children; c++) {
405 vdev_t *tvd = rvd->vdev_child[c];
406 metaslab_group_t *mg = tvd->vdev_mg;
407
408 /*
409 * Skip any holes, uninitialized top-levels, or
410 * vdevs that are not in this metaslab class.
411 */
412 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
413 mg->mg_class != mc) {
414 continue;
415 }
416
417 /*
418 * If a metaslab group does not contain a fragmentation
419 * metric then just bail out.
420 */
421 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
422 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
423 return (ZFS_FRAG_INVALID);
424 }
425
426 /*
427 * Determine how much this metaslab_group is contributing
428 * to the overall pool fragmentation metric.
429 */
430 fragmentation += mg->mg_fragmentation *
431 metaslab_group_get_space(mg);
432 }
433 fragmentation /= metaslab_class_get_space(mc);
434
435 ASSERT3U(fragmentation, <=, 100);
436 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
437 return (fragmentation);
438}
439
440/*
441 * Calculate the amount of expandable space that is available in
442 * this metaslab class. If a device is expanded then its expandable
443 * space will be the amount of allocatable space that is currently not
444 * part of this metaslab class.
445 */
446uint64_t
447metaslab_class_expandable_space(metaslab_class_t *mc)
448{
449 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
450 uint64_t space = 0;
451
452 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
453 for (int c = 0; c < rvd->vdev_children; c++) {
454 vdev_t *tvd = rvd->vdev_child[c];
455 metaslab_group_t *mg = tvd->vdev_mg;
456
457 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
458 mg->mg_class != mc) {
459 continue;
460 }
461
462 space += tvd->vdev_max_asize - tvd->vdev_asize;
463 }
464 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
465 return (space);
466}
467
468/*
326 * ==========================================================================
327 * Metaslab groups
328 * ==========================================================================
329 */
330static int
331metaslab_compare(const void *x1, const void *x2)
332{
333 const metaslab_t *m1 = x1;

--- 35 unchanged lines hidden (view full) ---

369 ASSERT(vd == vd->vdev_top);
370
371 mutex_enter(&mg->mg_lock);
372 was_allocatable = mg->mg_allocatable;
373
374 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
375 (vs->vs_space + 1);
376
469 * ==========================================================================
470 * Metaslab groups
471 * ==========================================================================
472 */
473static int
474metaslab_compare(const void *x1, const void *x2)
475{
476 const metaslab_t *m1 = x1;

--- 35 unchanged lines hidden (view full) ---

512 ASSERT(vd == vd->vdev_top);
513
514 mutex_enter(&mg->mg_lock);
515 was_allocatable = mg->mg_allocatable;
516
517 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
518 (vs->vs_space + 1);
519
377 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
520 /*
521 * A metaslab group is considered allocatable if it has plenty
522 * of free space or is not heavily fragmented. We only take
523 * fragmentation into account if the metaslab group has a valid
524 * fragmentation metric (i.e. a value between 0 and 100).
525 */
526 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
527 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
528 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
378
379 /*
380 * The mc_alloc_groups maintains a count of the number of
381 * groups in this metaslab class that are still above the
382 * zfs_mg_noalloc_threshold. This is used by the allocating
383 * threads to determine if they should avoid allocations to
384 * a given group. The allocator will avoid allocations to a group
385 * if that group has reached or is below the zfs_mg_noalloc_threshold

--- 4 unchanged lines hidden (view full) ---

390 * groups have reached the zfs_mg_noalloc_threshold making all groups
391 * eligible for allocations. This effectively means that all devices
392 * are balanced again.
393 */
394 if (was_allocatable && !mg->mg_allocatable)
395 mc->mc_alloc_groups--;
396 else if (!was_allocatable && mg->mg_allocatable)
397 mc->mc_alloc_groups++;
529
530 /*
531 * The mc_alloc_groups maintains a count of the number of
532 * groups in this metaslab class that are still above the
533 * zfs_mg_noalloc_threshold. This is used by the allocating
534 * threads to determine if they should avoid allocations to
535 * a given group. The allocator will avoid allocations to a group
536 * if that group has reached or is below the zfs_mg_noalloc_threshold

--- 4 unchanged lines hidden (view full) ---

541 * groups have reached the zfs_mg_noalloc_threshold making all groups
542 * eligible for allocations. This effectively means that all devices
543 * are balanced again.
544 */
545 if (was_allocatable && !mg->mg_allocatable)
546 mc->mc_alloc_groups--;
547 else if (!was_allocatable && mg->mg_allocatable)
548 mc->mc_alloc_groups++;
549
398 mutex_exit(&mg->mg_lock);
399}
400
401metaslab_group_t *
402metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
403{
404 metaslab_group_t *mg;
405

--- 74 unchanged lines hidden (view full) ---

480 ASSERT(mc->mc_rotor != mg);
481 ASSERT(mg->mg_prev == NULL);
482 ASSERT(mg->mg_next == NULL);
483 ASSERT(mg->mg_activation_count < 0);
484 return;
485 }
486
487 taskq_wait(mg->mg_taskq);
550 mutex_exit(&mg->mg_lock);
551}
552
553metaslab_group_t *
554metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
555{
556 metaslab_group_t *mg;
557

--- 74 unchanged lines hidden (view full) ---

632 ASSERT(mc->mc_rotor != mg);
633 ASSERT(mg->mg_prev == NULL);
634 ASSERT(mg->mg_next == NULL);
635 ASSERT(mg->mg_activation_count < 0);
636 return;
637 }
638
639 taskq_wait(mg->mg_taskq);
640 metaslab_group_alloc_update(mg);
488
489 mgprev = mg->mg_prev;
490 mgnext = mg->mg_next;
491
492 if (mg == mgnext) {
493 mc->mc_rotor = NULL;
494 } else {
495 mc->mc_rotor = mgnext;
496 mgprev->mg_next = mgnext;
497 mgnext->mg_prev = mgprev;
498 }
499
500 mg->mg_prev = NULL;
501 mg->mg_next = NULL;
502 metaslab_class_minblocksize_update(mc);
503}
504
641
642 mgprev = mg->mg_prev;
643 mgnext = mg->mg_next;
644
645 if (mg == mgnext) {
646 mc->mc_rotor = NULL;
647 } else {
648 mc->mc_rotor = mgnext;
649 mgprev->mg_next = mgnext;
650 mgnext->mg_prev = mgprev;
651 }
652
653 mg->mg_prev = NULL;
654 mg->mg_next = NULL;
655 metaslab_class_minblocksize_update(mc);
656}
657
658uint64_t
659metaslab_group_get_space(metaslab_group_t *mg)
660{
661 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
662}
663
664void
665metaslab_group_histogram_verify(metaslab_group_t *mg)
666{
667 uint64_t *mg_hist;
668 vdev_t *vd = mg->mg_vd;
669 uint64_t ashift = vd->vdev_ashift;
670 int i;
671
672 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
673 return;
674
675 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
676 KM_SLEEP);
677
678 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
679 SPACE_MAP_HISTOGRAM_SIZE + ashift);
680
681 for (int m = 0; m < vd->vdev_ms_count; m++) {
682 metaslab_t *msp = vd->vdev_ms[m];
683
684 if (msp->ms_sm == NULL)
685 continue;
686
687 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
688 mg_hist[i + ashift] +=
689 msp->ms_sm->sm_phys->smp_histogram[i];
690 }
691
692 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
693 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
694
695 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
696}
697
505static void
698static void
506metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
699metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
507{
700{
701 metaslab_class_t *mc = mg->mg_class;
702 uint64_t ashift = mg->mg_vd->vdev_ashift;
703
704 ASSERT(MUTEX_HELD(&msp->ms_lock));
705 if (msp->ms_sm == NULL)
706 return;
707
508 mutex_enter(&mg->mg_lock);
708 mutex_enter(&mg->mg_lock);
709 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
710 mg->mg_histogram[i + ashift] +=
711 msp->ms_sm->sm_phys->smp_histogram[i];
712 mc->mc_histogram[i + ashift] +=
713 msp->ms_sm->sm_phys->smp_histogram[i];
714 }
715 mutex_exit(&mg->mg_lock);
716}
717
718void
719metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
720{
721 metaslab_class_t *mc = mg->mg_class;
722 uint64_t ashift = mg->mg_vd->vdev_ashift;
723
724 ASSERT(MUTEX_HELD(&msp->ms_lock));
725 if (msp->ms_sm == NULL)
726 return;
727
728 mutex_enter(&mg->mg_lock);
729 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
730 ASSERT3U(mg->mg_histogram[i + ashift], >=,
731 msp->ms_sm->sm_phys->smp_histogram[i]);
732 ASSERT3U(mc->mc_histogram[i + ashift], >=,
733 msp->ms_sm->sm_phys->smp_histogram[i]);
734
735 mg->mg_histogram[i + ashift] -=
736 msp->ms_sm->sm_phys->smp_histogram[i];
737 mc->mc_histogram[i + ashift] -=
738 msp->ms_sm->sm_phys->smp_histogram[i];
739 }
740 mutex_exit(&mg->mg_lock);
741}
742
743static void
744metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
745{
509 ASSERT(msp->ms_group == NULL);
746 ASSERT(msp->ms_group == NULL);
747 mutex_enter(&mg->mg_lock);
510 msp->ms_group = mg;
511 msp->ms_weight = 0;
512 avl_add(&mg->mg_metaslab_tree, msp);
513 mutex_exit(&mg->mg_lock);
748 msp->ms_group = mg;
749 msp->ms_weight = 0;
750 avl_add(&mg->mg_metaslab_tree, msp);
751 mutex_exit(&mg->mg_lock);
752
753 mutex_enter(&msp->ms_lock);
754 metaslab_group_histogram_add(mg, msp);
755 mutex_exit(&msp->ms_lock);
514}
515
516static void
517metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
518{
756}
757
758static void
759metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
760{
761 mutex_enter(&msp->ms_lock);
762 metaslab_group_histogram_remove(mg, msp);
763 mutex_exit(&msp->ms_lock);
764
519 mutex_enter(&mg->mg_lock);
520 ASSERT(msp->ms_group == mg);
521 avl_remove(&mg->mg_metaslab_tree, msp);
522 msp->ms_group = NULL;
523 mutex_exit(&mg->mg_lock);
524}
525
526static void
527metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
528{
529 /*
530 * Although in principle the weight can be any value, in
765 mutex_enter(&mg->mg_lock);
766 ASSERT(msp->ms_group == mg);
767 avl_remove(&mg->mg_metaslab_tree, msp);
768 msp->ms_group = NULL;
769 mutex_exit(&mg->mg_lock);
770}
771
772static void
773metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
774{
775 /*
776 * Although in principle the weight can be any value, in
531 * practice we do not use values in the range [1, 510].
777 * practice we do not use values in the range [1, 511].
532 */
778 */
533 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
779 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
534 ASSERT(MUTEX_HELD(&msp->ms_lock));
535
536 mutex_enter(&mg->mg_lock);
537 ASSERT(msp->ms_group == mg);
538 avl_remove(&mg->mg_metaslab_tree, msp);
539 msp->ms_weight = weight;
540 avl_add(&mg->mg_metaslab_tree, msp);
541 mutex_exit(&mg->mg_lock);
542}
543
544/*
780 ASSERT(MUTEX_HELD(&msp->ms_lock));
781
782 mutex_enter(&mg->mg_lock);
783 ASSERT(msp->ms_group == mg);
784 avl_remove(&mg->mg_metaslab_tree, msp);
785 msp->ms_weight = weight;
786 avl_add(&mg->mg_metaslab_tree, msp);
787 mutex_exit(&mg->mg_lock);
788}
789
790/*
791 * Calculate the fragmentation for a given metaslab group. We can use
792 * a simple average here since all metaslabs within the group must have
793 * the same size. The return value will be a value between 0 and 100
794 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
795 * group have a fragmentation metric.
796 */
797uint64_t
798metaslab_group_fragmentation(metaslab_group_t *mg)
799{
800 vdev_t *vd = mg->mg_vd;
801 uint64_t fragmentation = 0;
802 uint64_t valid_ms = 0;
803
804 for (int m = 0; m < vd->vdev_ms_count; m++) {
805 metaslab_t *msp = vd->vdev_ms[m];
806
807 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
808 continue;
809
810 valid_ms++;
811 fragmentation += msp->ms_fragmentation;
812 }
813
814 if (valid_ms <= vd->vdev_ms_count / 2)
815 return (ZFS_FRAG_INVALID);
816
817 fragmentation /= valid_ms;
818 ASSERT3U(fragmentation, <=, 100);
819 return (fragmentation);
820}
821
822/*
545 * Determine if a given metaslab group should skip allocations. A metaslab
823 * Determine if a given metaslab group should skip allocations. A metaslab
546 * group should avoid allocations if its used capacity has crossed the
547 * zfs_mg_noalloc_threshold and there is at least one metaslab group
824 * group should avoid allocations if its free capacity is less than the
825 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
826 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
548 * that can still handle allocations.
549 */
550static boolean_t
551metaslab_group_allocatable(metaslab_group_t *mg)
552{
553 vdev_t *vd = mg->mg_vd;
554 spa_t *spa = vd->vdev_spa;
555 metaslab_class_t *mc = mg->mg_class;
556
557 /*
827 * that can still handle allocations.
828 */
829static boolean_t
830metaslab_group_allocatable(metaslab_group_t *mg)
831{
832 vdev_t *vd = mg->mg_vd;
833 spa_t *spa = vd->vdev_spa;
834 metaslab_class_t *mc = mg->mg_class;
835
836 /*
558 * A metaslab group is considered allocatable if its free capacity
559 * is greater than the set value of zfs_mg_noalloc_threshold, it's
560 * associated with a slog, or there are no other metaslab groups
561 * with free capacity greater than zfs_mg_noalloc_threshold.
837 * We use two key metrics to determine if a metaslab group is
838 * considered allocatable -- free space and fragmentation. If
839 * the free space is greater than the free space threshold and
840 * the fragmentation is less than the fragmentation threshold then
841 * consider the group allocatable. There are two cases when we will
842 * not consider these key metrics. The first is if the group is
843 * associated with a slog device and the second is if all groups
844 * in this metaslab class have already been considered ineligible
845 * for allocations.
562 */
846 */
563 return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
847 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
848 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
849 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) ||
564 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
565}
566
567/*
568 * ==========================================================================
569 * Range tree callbacks
570 * ==========================================================================
571 */

--- 207 unchanged lines hidden (view full) ---

779 */
780 uint64_t align = size & -size;
781 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
782 avl_tree_t *t = &msp->ms_tree->rt_root;
783
784 return (metaslab_block_picker(t, cursor, size, align));
785}
786
850 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
851}
852
853/*
854 * ==========================================================================
855 * Range tree callbacks
856 * ==========================================================================
857 */

--- 207 unchanged lines hidden (view full) ---

1065 */
1066 uint64_t align = size & -size;
1067 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1068 avl_tree_t *t = &msp->ms_tree->rt_root;
1069
1070 return (metaslab_block_picker(t, cursor, size, align));
1071}
1072
787/* ARGSUSED */
788static boolean_t
789metaslab_ff_fragmented(metaslab_t *msp)
790{
791 return (B_TRUE);
792}
793
794static metaslab_ops_t metaslab_ff_ops = {
1073static metaslab_ops_t metaslab_ff_ops = {
795 metaslab_ff_alloc,
796 metaslab_ff_fragmented
1074 metaslab_ff_alloc
797};
798
799/*
800 * ==========================================================================
801 * Dynamic block allocator -
802 * Uses the first fit allocation scheme until space gets low and then
803 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
804 * and metaslab_df_free_pct to determine when to switch the allocation scheme.

--- 30 unchanged lines hidden (view full) ---

835 free_pct < metaslab_df_free_pct) {
836 t = &msp->ms_size_tree;
837 *cursor = 0;
838 }
839
840 return (metaslab_block_picker(t, cursor, size, 1ULL));
841}
842
1075};
1076
1077/*
1078 * ==========================================================================
1079 * Dynamic block allocator -
1080 * Uses the first fit allocation scheme until space gets low and then
1081 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1082 * and metaslab_df_free_pct to determine when to switch the allocation scheme.

--- 30 unchanged lines hidden (view full) ---

1113 free_pct < metaslab_df_free_pct) {
1114 t = &msp->ms_size_tree;
1115 *cursor = 0;
1116 }
1117
1118 return (metaslab_block_picker(t, cursor, size, 1ULL));
1119}
1120
843static boolean_t
844metaslab_df_fragmented(metaslab_t *msp)
845{
846 range_tree_t *rt = msp->ms_tree;
847 uint64_t max_size = metaslab_block_maxsize(msp);
848 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
849
850 if (max_size >= metaslab_df_alloc_threshold &&
851 free_pct >= metaslab_df_free_pct)
852 return (B_FALSE);
853
854 return (B_TRUE);
855}
856
857static metaslab_ops_t metaslab_df_ops = {
1121static metaslab_ops_t metaslab_df_ops = {
858 metaslab_df_alloc,
859 metaslab_df_fragmented
1122 metaslab_df_alloc
860};
861
862/*
863 * ==========================================================================
864 * Cursor fit block allocator -
865 * Select the largest region in the metaslab, set the cursor to the beginning
866 * of the range and the cursor_end to the end of the range. As allocations
867 * are made advance the cursor. Continue allocating from the cursor until

--- 26 unchanged lines hidden (view full) ---

894 }
895
896 offset = *cursor;
897 *cursor += size;
898
899 return (offset);
900}
901
1123};
1124
1125/*
1126 * ==========================================================================
1127 * Cursor fit block allocator -
1128 * Select the largest region in the metaslab, set the cursor to the beginning
1129 * of the range and the cursor_end to the end of the range. As allocations
1130 * are made advance the cursor. Continue allocating from the cursor until

--- 26 unchanged lines hidden (view full) ---

1157 }
1158
1159 offset = *cursor;
1160 *cursor += size;
1161
1162 return (offset);
1163}
1164
902static boolean_t
903metaslab_cf_fragmented(metaslab_t *msp)
904{
905 return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
906}
907
908static metaslab_ops_t metaslab_cf_ops = {
1165static metaslab_ops_t metaslab_cf_ops = {
909 metaslab_cf_alloc,
910 metaslab_cf_fragmented
1166 metaslab_cf_alloc
911};
912
913/*
914 * ==========================================================================
915 * New dynamic fit allocator -
916 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
917 * contiguous blocks. If no region is found then just use the largest segment
918 * that remains.

--- 40 unchanged lines hidden (view full) ---

959
960 if ((rs->rs_end - rs->rs_start) >= size) {
961 *cursor = rs->rs_start + size;
962 return (rs->rs_start);
963 }
964 return (-1ULL);
965}
966
1167};
1168
1169/*
1170 * ==========================================================================
1171 * New dynamic fit allocator -
1172 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1173 * contiguous blocks. If no region is found then just use the largest segment
1174 * that remains.

--- 40 unchanged lines hidden (view full) ---

1215
1216 if ((rs->rs_end - rs->rs_start) >= size) {
1217 *cursor = rs->rs_start + size;
1218 return (rs->rs_start);
1219 }
1220 return (-1ULL);
1221}
1222
967static boolean_t
968metaslab_ndf_fragmented(metaslab_t *msp)
969{
970 return (metaslab_block_maxsize(msp) <=
971 (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
972}
973
974static metaslab_ops_t metaslab_ndf_ops = {
1223static metaslab_ops_t metaslab_ndf_ops = {
975 metaslab_ndf_alloc,
976 metaslab_ndf_fragmented
1224 metaslab_ndf_alloc
977};
978
979metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
980
981/*
982 * ==========================================================================
983 * Metaslabs
984 * ==========================================================================

--- 85 unchanged lines hidden (view full) ---

1070 * alloctree and freetree until metaslab_sync_done(). This serves
1071 * two purposes: it allows metaslab_sync_done() to detect the
1072 * addition of new space; and for debugging, it ensures that we'd
1073 * data fault on any attempt to use this metaslab before it's ready.
1074 */
1075 msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
1076 metaslab_group_add(mg, msp);
1077
1225};
1226
1227metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1228
1229/*
1230 * ==========================================================================
1231 * Metaslabs
1232 * ==========================================================================

--- 85 unchanged lines hidden (view full) ---

1318 * alloctree and freetree until metaslab_sync_done(). This serves
1319 * two purposes: it allows metaslab_sync_done() to detect the
1320 * addition of new space; and for debugging, it ensures that we'd
1321 * data fault on any attempt to use this metaslab before it's ready.
1322 */
1323 msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
1324 metaslab_group_add(mg, msp);
1325
1326 msp->ms_fragmentation = metaslab_fragmentation(msp);
1078 msp->ms_ops = mg->mg_class->mc_ops;
1079
1080 /*
1081 * If we're opening an existing pool (txg == 0) or creating
1082 * a new one (txg == TXG_INITIAL), all space is available now.
1083 * If we're adding space to an existing pool, the new space
1084 * does not become available until after this txg has synced.
1085 */

--- 49 unchanged lines hidden (view full) ---

1135
1136 mutex_exit(&msp->ms_lock);
1137 cv_destroy(&msp->ms_load_cv);
1138 mutex_destroy(&msp->ms_lock);
1139
1140 kmem_free(msp, sizeof (metaslab_t));
1141}
1142
1327 msp->ms_ops = mg->mg_class->mc_ops;
1328
1329 /*
1330 * If we're opening an existing pool (txg == 0) or creating
1331 * a new one (txg == TXG_INITIAL), all space is available now.
1332 * If we're adding space to an existing pool, the new space
1333 * does not become available until after this txg has synced.
1334 */

--- 49 unchanged lines hidden (view full) ---

1384
1385 mutex_exit(&msp->ms_lock);
1386 cv_destroy(&msp->ms_load_cv);
1387 mutex_destroy(&msp->ms_lock);
1388
1389 kmem_free(msp, sizeof (metaslab_t));
1390}
1391
1392#define FRAGMENTATION_TABLE_SIZE 17
1393
1143/*
1394/*
1144 * Apply a weighting factor based on the histogram information for this
1145 * metaslab. The current weighting factor is somewhat arbitrary and requires
1146 * additional investigation. The implementation provides a measure of
1147 * "weighted" free space and gives a higher weighting for larger contiguous
1148 * regions. The weighting factor is determined by counting the number of
1149 * sm_shift sectors that exist in each region represented by the histogram.
1150 * That value is then multiplied by the power of 2 exponent and the sm_shift
1151 * value.
1395 * This table defines a segment size based fragmentation metric that will
1396 * allow each metaslab to derive its own fragmentation value. This is done
1397 * by calculating the space in each bucket of the spacemap histogram and
1398 * multiplying that by the fragmentation metric in this table. Doing
1399 * this for all buckets and dividing it by the total amount of free
1400 * space in this metaslab (i.e. the total free space in all buckets) gives
1401 * us the fragmentation metric. This means that a high fragmentation metric
1402 * equates to most of the free space being comprised of small segments.
1403 * Conversely, if the metric is low, then most of the free space is in
1404 * large segments. A 10% change in fragmentation equates to approximately
1405 * double the number of segments.
1152 *
1406 *
1153 * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1154 * metaslab has an sm_shift value of 9 (512B):
1155 *
1156 * 1) calculate the number of sm_shift sectors in the region:
1157 * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1158 * 2) multiply by the power of 2 exponent and the sm_shift value:
1159 * 16384 * 21 * 9 = 3096576
1160 * This value will be added to the weighting of the metaslab.
1407 * This table defines 0% fragmented space using 16MB segments. Testing has
1408 * shown that segments that are greater than or equal to 16MB do not suffer
1409 * from drastic performance problems. Using this value, we derive the rest
1410 * of the table. Since the fragmentation value is never stored on disk, it
1411 * is possible to change these calculations in the future.
1161 */
1412 */
1413int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1414 100, /* 512B */
1415 100, /* 1K */
1416 98, /* 2K */
1417 95, /* 4K */
1418 90, /* 8K */
1419 80, /* 16K */
1420 70, /* 32K */
1421 60, /* 64K */
1422 50, /* 128K */
1423 40, /* 256K */
1424 30, /* 512K */
1425 20, /* 1M */
1426 15, /* 2M */
1427 10, /* 4M */
1428 5, /* 8M */
1429 0 /* 16M */
1430};
1431
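As a worked illustration of how the table drives the calculation, here is a self-contained sketch with hypothetical inputs (the real routine, metaslab_fragmentation() below, reads the metaslab's space map histogram; this follows the same arithmetic):

static uint64_t
frag_example(const uint64_t *histogram, int nbuckets, uint8_t shift)
{
	uint64_t fragmentation = 0, total = 0;

	for (int i = 0; i < nbuckets; i++) {
		/* Bytes represented by bucket i: segment count << size. */
		uint64_t space = histogram[i] << (i + shift);
		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
		    FRAGMENTATION_TABLE_SIZE - 1);

		if (space == 0)
			continue;
		total += space;
		fragmentation += space * zfs_frag_table[idx];
	}

	/*
	 * Example with shift = 9: 100 4K segments (bucket 3, weight 95)
	 * plus one 1M segment (bucket 11, weight 20) gives
	 * (409600 * 95 + 1048576 * 20) / 1458176 ~= 41, i.e. a single
	 * large segment pulls the metric well below the 95 that a
	 * purely 4K-fragmented metaslab would report.
	 */
	return (total > 0 ? fragmentation / total : 0);
}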
1432/*
1433 * Calculate the metaslab's fragmentation metric. A return value
1434 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1435 * not support this metric. Otherwise, the return value should be in the
1436 * range [0, 100].
1437 */
1162static uint64_t
1438static uint64_t
1163metaslab_weight_factor(metaslab_t *msp)
1439metaslab_fragmentation(metaslab_t *msp)
1164{
1440{
1165 uint64_t factor = 0;
1166 uint64_t sectors;
1167 int i;
1441 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1442 uint64_t fragmentation = 0;
1443 uint64_t total = 0;
1444 boolean_t feature_enabled = spa_feature_is_enabled(spa,
1445 SPA_FEATURE_SPACEMAP_HISTOGRAM);
1168
1446
1447 if (!feature_enabled)
1448 return (ZFS_FRAG_INVALID);
1449
1169 /*
1450 /*
1170 * A null space map means that the entire metaslab is free,
1171 * calculate a weight factor that spans the entire size of the
1172 * metaslab.
1451 * A null space map means that the entire metaslab is free
1452 * and thus is not fragmented.
1173 */
1453 */
1174 if (msp->ms_sm == NULL) {
1454 if (msp->ms_sm == NULL)
1455 return (0);
1456
1457 /*
1458 * If this metaslab's space_map has not been upgraded, flag it
1459 * so that we upgrade next time we encounter it.
1460 */
1461 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1462 uint64_t txg = spa_syncing_txg(spa);
1175 vdev_t *vd = msp->ms_group->mg_vd;
1176
1463 vdev_t *vd = msp->ms_group->mg_vd;
1464
1177 i = highbit64(msp->ms_size) - 1;
1178 sectors = msp->ms_size >> vd->vdev_ashift;
1179 return (sectors * i * vd->vdev_ashift);
1465 msp->ms_condense_wanted = B_TRUE;
1466 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1467 spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1468 "msp %p, vd %p", txg, msp, vd);
1469 return (ZFS_FRAG_INVALID);
1180 }
1181
1470 }
1471
1182 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1183 return (0);
1472 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1473 uint64_t space = 0;
1474 uint8_t shift = msp->ms_sm->sm_shift;
1475 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1476 FRAGMENTATION_TABLE_SIZE - 1);
1184
1477
1185 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1186 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1187 continue;
1188
1478 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1479 continue;
1480
1189 /*
1190 * Determine the number of sm_shift sectors in the region
1191 * indicated by the histogram. For example, given an
1192 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1193 * that we're looking at an 8K region in the histogram
1194 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1195 * number of sm_shift sectors (512 bytes in this example),
1196 * we would take 8192 / 512 = 16. Since the histogram
1197 * is offset by sm_shift we can simply use the value of
1198 * of i to calculate this (i.e. 2^i = 16 where i = 4).
1199 */
1200 sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1201 factor += (i + msp->ms_sm->sm_shift) * sectors;
1481 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1482 total += space;
1483
1484 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1485 fragmentation += space * zfs_frag_table[idx];
1202 }
1486 }
1203 return (factor * msp->ms_sm->sm_shift);
1487
1488 if (total > 0)
1489 fragmentation /= total;
1490 ASSERT3U(fragmentation, <=, 100);
1491 return (fragmentation);
1204}
1205
1492}
1493
1494/*
1495 * Compute a weight -- a selection preference value -- for the given metaslab.
1496 * This is based on the amount of free space, the level of fragmentation,
1497 * the LBA range, and whether the metaslab is loaded.
1498 */
1206static uint64_t
1207metaslab_weight(metaslab_t *msp)
1208{
1209 metaslab_group_t *mg = msp->ms_group;
1210 vdev_t *vd = mg->mg_vd;
1211 uint64_t weight, space;
1212
1213 ASSERT(MUTEX_HELD(&msp->ms_lock));

--- 7 unchanged lines hidden (view full) ---

1221 ASSERT0(vd->vdev_ms_shift);
1222 return (0);
1223 }
1224
1225 /*
1226 * The baseline weight is the metaslab's free space.
1227 */
1228 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1499static uint64_t
1500metaslab_weight(metaslab_t *msp)
1501{
1502 metaslab_group_t *mg = msp->ms_group;
1503 vdev_t *vd = mg->mg_vd;
1504 uint64_t weight, space;
1505
1506 ASSERT(MUTEX_HELD(&msp->ms_lock));

--- 7 unchanged lines hidden (view full) ---

1514 ASSERT0(vd->vdev_ms_shift);
1515 return (0);
1516 }
1517
1518 /*
1519 * The baseline weight is the metaslab's free space.
1520 */
1521 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1522
1523 msp->ms_fragmentation = metaslab_fragmentation(msp);
1524 if (metaslab_fragmentation_factor_enabled &&
1525 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1526 /*
1527 * Use the fragmentation information to inversely scale
1528 * down the baseline weight. We need to ensure that we
1529 * don't exclude this metaslab completely when it's 100%
1530 * fragmented. To avoid this we reduce the fragmented value
1531 * by 1.
1532 */
1533 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1534
1535 /*
1536 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1537 * this metaslab again. The fragmentation metric may have
1538 * decreased the space to something smaller than
1539 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1540 * so that we can consume any remaining space.
1541 */
1542 if (space > 0 && space < SPA_MINBLOCKSIZE)
1543 space = SPA_MINBLOCKSIZE;
1544 }
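A quick numeric illustration of the scaling above (hypothetical figures): with 10 GB of free space and a fragmentation metric of 70, the baseline becomes 10 GB * (100 - 69) / 100, roughly 3.1 GB; at a metric of 100 the "minus 1" adjustment still leaves 1% of the free space, so fragmentation alone does not normally zero out the weight.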
1229 weight = space;
1230
1231 /*
1232 * Modern disks have uniform bit density and constant angular velocity.
1233 * Therefore, the outer recording zones are faster (higher bandwidth)
1234 * than the inner zones by the ratio of outer to inner track diameter,
1235 * which is typically around 2:1. We account for this by assigning
1236 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1237 * In effect, this means that we'll select the metaslab with the most
1238 * free bandwidth rather than simply the one with the most free space.
1239 */
1545 weight = space;
1546
1547 /*
1548 * Modern disks have uniform bit density and constant angular velocity.
1549 * Therefore, the outer recording zones are faster (higher bandwidth)
1550 * than the inner zones by the ratio of outer to inner track diameter,
1551 * which is typically around 2:1. We account for this by assigning
1552 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1553 * In effect, this means that we'll select the metaslab with the most
1554 * free bandwidth rather than simply the one with the most free space.
1555 */
1240 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1241 ASSERT(weight >= space && weight <= 2 * space);
1556 if (metaslab_lba_weighting_enabled) {
1557 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1558 ASSERT(weight >= space && weight <= 2 * space);
1559 }
1242
1560
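As a quick check of the formula above with hypothetical numbers: on a vdev with 200 metaslabs, metaslab 0 is weighted at 2x its free space, metaslab 100 at about 1.5x, and metaslab 199 at just over 1x, matching the stated 2:1 outer-to-inner gradient.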
1243 msp->ms_factor = metaslab_weight_factor(msp);
1244 if (metaslab_weight_factor_enable)
1245 weight += msp->ms_factor;
1246
1247 if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
1248 /*
1249 * If this metaslab is one we're actively using, adjust its
1250 * weight to make it preferable to any inactive metaslab so
1251 * we'll polish it off.
1252 */
1561 /*
1562 * If this metaslab is one we're actively using, adjust its
1563 * weight to make it preferable to any inactive metaslab so
1564 * we'll polish it off. If the fragmentation on this metaslab
1565 * has exceeded our threshold, then don't mark it active.
1566 */
1567 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1568 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1253 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1254 }
1255
1256 return (weight);
1257}
1258
1259static int
1260metaslab_activate(metaslab_t *msp, uint64_t activation_weight)

--- 68 unchanged lines hidden (view full) ---

1329 mutex_enter(&mg->mg_lock);
1330 /*
1331 * Load the next potential metaslabs
1332 */
1333 msp = avl_first(t);
1334 while (msp != NULL) {
1335 metaslab_t *msp_next = AVL_NEXT(t, msp);
1336
1569 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1570 }
1571
1572 return (weight);
1573}
1574
1575static int
1576metaslab_activate(metaslab_t *msp, uint64_t activation_weight)

--- 68 unchanged lines hidden (view full) ---

1645 mutex_enter(&mg->mg_lock);
1646 /*
1647 * Load the next potential metaslabs
1648 */
1649 msp = avl_first(t);
1650 while (msp != NULL) {
1651 metaslab_t *msp_next = AVL_NEXT(t, msp);
1652
1337 /* If we have reached our preload limit then we're done */
1338 if (++m > metaslab_preload_limit)
1339 break;
1653 /*
1654 * We preload only the maximum number of metaslabs specified
1655 * by metaslab_preload_limit. If a metaslab is being forced
1656 * to condense then we preload it too. This will ensure
1657 * that force condensing happens in the next txg.
1658 */
1659 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1660 msp = msp_next;
1661 continue;
1662 }
1340
1341 /*
1342 * We must drop the metaslab group lock here to preserve
1343 * lock ordering with the ms_lock (when grabbing both
1344 * the mg_lock and the ms_lock, the ms_lock must be taken
1345 * first). As a result, it is possible that the ordering
1346 * of the metaslabs within the avl tree may change before
1347 * we reacquire the lock. The metaslab cannot be removed from

--- 51 unchanged lines hidden (view full) ---

1399 dmu_object_info_t doi;
1400 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1401
1402 ASSERT(MUTEX_HELD(&msp->ms_lock));
1403 ASSERT(msp->ms_loaded);
1404
1405 /*
1406 * Use the ms_size_tree range tree, which is ordered by size, to
1663
1664 /*
1665 * We must drop the metaslab group lock here to preserve
1666 * lock ordering with the ms_lock (when grabbing both
1667 * the mg_lock and the ms_lock, the ms_lock must be taken
1668 * first). As a result, it is possible that the ordering
1669 * of the metaslabs within the avl tree may change before
1670 * we reacquire the lock. The metaslab cannot be removed from

--- 51 unchanged lines hidden (view full) ---

1722 dmu_object_info_t doi;
1723 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1724
1725 ASSERT(MUTEX_HELD(&msp->ms_lock));
1726 ASSERT(msp->ms_loaded);
1727
1728 /*
1729 * Use the ms_size_tree range tree, which is ordered by size, to
1407 * obtain the largest segment in the free tree. If the tree is empty
1408 * then we should condense the map.
1730 * obtain the largest segment in the free tree. We always condense
1731 * metaslabs that are empty and metaslabs for which a condense
1732 * request has been made.
1409 */
1410 rs = avl_last(&msp->ms_size_tree);
1733 */
1734 rs = avl_last(&msp->ms_size_tree);
1411 if (rs == NULL)
1735 if (rs == NULL || msp->ms_condense_wanted)
1412 return (B_TRUE);
1413
1414 /*
1415 * Calculate the number of 64-bit entries this segment would
1416 * require when written to disk. If this single segment would be
1417 * larger on-disk than the entire current on-disk structure, then
1418 * clearly condensing will increase the on-disk structure size.
1419 */

--- 24 unchanged lines hidden (view full) ---

1444 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1445 range_tree_t *condense_tree;
1446 space_map_t *sm = msp->ms_sm;
1447
1448 ASSERT(MUTEX_HELD(&msp->ms_lock));
1449 ASSERT3U(spa_sync_pass(spa), ==, 1);
1450 ASSERT(msp->ms_loaded);
1451
1736 return (B_TRUE);
1737
1738 /*
1739 * Calculate the number of 64-bit entries this segment would
1740 * require when written to disk. If this single segment would be
1741 * larger on-disk than the entire current on-disk structure, then
1742 * clearly condensing will increase the on-disk structure size.
1743 */

--- 24 unchanged lines hidden (view full) ---

1768 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1769 range_tree_t *condense_tree;
1770 space_map_t *sm = msp->ms_sm;
1771
1772 ASSERT(MUTEX_HELD(&msp->ms_lock));
1773 ASSERT3U(spa_sync_pass(spa), ==, 1);
1774 ASSERT(msp->ms_loaded);
1775
1776
1452 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1777 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1453 "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1454 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1778 "smp size %llu, segments %lu, forcing condense=%s", txg,
1779 msp->ms_id, msp, space_map_length(msp->ms_sm),
1780 avl_numnodes(&msp->ms_tree->rt_root),
1781 msp->ms_condense_wanted ? "TRUE" : "FALSE");
1455
1782
1783 msp->ms_condense_wanted = B_FALSE;
1784
1456 /*
1457 * Create a range tree that is 100% allocated. We remove segments
1458 * that have been freed in this txg, any deferred frees that exist,
1459 * and any allocation in the future. Removing segments should be
1460 * a relatively inexpensive operation since we expect these trees to
1461 * have a small number of nodes.
1462 */
1463 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);

--- 75 unchanged lines hidden (view full) ---

1539 ASSERT3P(alloctree, ==, NULL);
1540 return;
1541 }
1542
1543 ASSERT3P(alloctree, !=, NULL);
1544 ASSERT3P(*freetree, !=, NULL);
1545 ASSERT3P(*freed_tree, !=, NULL);
1546
1785 /*
1786 * Create a range tree that is 100% allocated. We remove segments
1787 * that have been freed in this txg, any deferred frees that exist,
1788 * and any allocation in the future. Removing segments should be
1789 * a relatively inexpensive operation since we expect these trees to
1790 * have a small number of nodes.
1791 */
1792 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);

--- 75 unchanged lines hidden (view full) ---

1868 ASSERT3P(alloctree, ==, NULL);
1869 return;
1870 }
1871
1872 ASSERT3P(alloctree, !=, NULL);
1873 ASSERT3P(*freetree, !=, NULL);
1874 ASSERT3P(*freed_tree, !=, NULL);
1875
1876 /*
1877 * Normally, we don't want to process a metaslab if there
1878 * are no allocations or frees to perform. However, if the metaslab
1879 * is being forced to condense we need to let it through.
1880 */
1547 if (range_tree_space(alloctree) == 0 &&
1881 if (range_tree_space(alloctree) == 0 &&
1548 range_tree_space(*freetree) == 0)
1882 range_tree_space(*freetree) == 0 &&
1883 !msp->ms_condense_wanted)
1549 return;
1550
1551 /*
1552 * The only state that can actually be changing concurrently with
1553 * metaslab_sync() is the metaslab's ms_tree. No other thread can
1554 * be modifying this txg's alloctree, freetree, freed_tree, or
1555 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1556 * space_map ASSERTs. We drop it whenever we call into the DMU,

--- 20 unchanged lines hidden (view full) ---

1577 if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1578 metaslab_should_condense(msp)) {
1579 metaslab_condense(msp, txg, tx);
1580 } else {
1581 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1582 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1583 }
1584
1884 return;
1885
1886 /*
1887 * The only state that can actually be changing concurrently with
1888 * metaslab_sync() is the metaslab's ms_tree. No other thread can
1889 * be modifying this txg's alloctree, freetree, freed_tree, or
1890 * space_map_phys_t. Therefore, we only hold ms_lock to satify
1890 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy

--- 20 unchanged lines hidden (view full) ---

1912 if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1913 metaslab_should_condense(msp)) {
1914 metaslab_condense(msp, txg, tx);
1915 } else {
1916 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1917 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1918 }
1919
1585 range_tree_vacate(alloctree, NULL, NULL);
1586
1920 metaslab_group_histogram_verify(mg);
1921 metaslab_class_histogram_verify(mg->mg_class);
1922 metaslab_group_histogram_remove(mg, msp);
1587 if (msp->ms_loaded) {
1588 /*
1589 * When the space map is loaded, we have an accurate
1590 * histogram in the range tree. This gives us an opportunity
1591 * to bring the space map's histogram up-to-date so we clear
1592 * it first before updating it.
1593 */
1594 space_map_histogram_clear(msp->ms_sm);
1595 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1596 } else {
1597 /*
1598 * Since the space map is not loaded we simply update the
1599 * existing histogram with what was freed in this txg. This
1600 * means that the on-disk histogram may not have an accurate
1601 * view of the free space but it's close enough to allow
1602 * us to make allocation decisions.
1603 */
1604 space_map_histogram_add(msp->ms_sm, *freetree, tx);
1605 }
1923 if (msp->ms_loaded) {
1924 /*
1925 * When the space map is loaded, we have an accurate
1926 * histogram in the range tree. This gives us an opportunity
1927 * to bring the space map's histogram up-to-date so we clear
1928 * it first before updating it.
1929 */
1930 space_map_histogram_clear(msp->ms_sm);
1931 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1932 } else {
1933 /*
1934 * Since the space map is not loaded we simply update the
1935 * existing histogram with what was freed in this txg. This
1936 * means that the on-disk histogram may not have an accurate
1937 * view of the free space but it's close enough to allow
1938 * us to make allocation decisions.
1939 */
1940 space_map_histogram_add(msp->ms_sm, *freetree, tx);
1941 }
1942 metaslab_group_histogram_add(mg, msp);
1943 metaslab_group_histogram_verify(mg);
1944 metaslab_class_histogram_verify(mg->mg_class);
1606
1607 /*
1608 * For sync pass 1, we avoid traversing this txg's free range tree
1609 * and instead will just swap the pointers for freetree and
1610 * freed_tree. We can safely do this since the freed_tree is
1611 * guaranteed to be empty on the initial pass.
1612 */
1613 if (spa_sync_pass(spa) == 1) {
1614 range_tree_swap(freetree, freed_tree);
1615 } else {
1616 range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1617 }
1945
1946 /*
1947 * For sync pass 1, we avoid traversing this txg's free range tree
1948 * and instead will just swap the pointers for freetree and
1949 * freed_tree. We can safely do this since the freed_tree is
1950 * guaranteed to be empty on the initial pass.
1951 */
1952 if (spa_sync_pass(spa) == 1) {
1953 range_tree_swap(freetree, freed_tree);
1954 } else {
1955 range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1956 }
1957 range_tree_vacate(alloctree, NULL, NULL);
1618
1619 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1620 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1621
1622 mutex_exit(&msp->ms_lock);
1623
1624 if (object != space_map_object(msp->ms_sm)) {
1625 object = space_map_object(msp->ms_sm);

--- 94 unchanged lines hidden (view full) ---

1720 }
1721
1722 if (!metaslab_debug_unload)
1723 metaslab_unload(msp);
1724 }
1725
1726 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1727 mutex_exit(&msp->ms_lock);
1958
1959 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1960 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1961
1962 mutex_exit(&msp->ms_lock);
1963
1964 if (object != space_map_object(msp->ms_sm)) {
1965 object = space_map_object(msp->ms_sm);

--- 94 unchanged lines hidden (view full) ---

2060 }
2061
2062 if (!metaslab_debug_unload)
2063 metaslab_unload(msp);
2064 }
2065
2066 metaslab_group_sort(mg, msp, metaslab_weight(msp));
2067 mutex_exit(&msp->ms_lock);
1728
1729}
1730
1731void
1732metaslab_sync_reassess(metaslab_group_t *mg)
1733{
1734 metaslab_group_alloc_update(mg);
2068}
2069
2070void
2071metaslab_sync_reassess(metaslab_group_t *mg)
2072{
2073 metaslab_group_alloc_update(mg);
2074 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
1735
1736 /*
1737 * Preload the next potential metaslabs
1738 */
1739 metaslab_group_preload(mg);
1740}
1741
1742static uint64_t

--- 245 unchanged lines hidden (view full) ---

1988 goto next;
1989
1990 /*
1991 * Avoid writing single-copy data to a failing vdev
1992 * unless the user instructs us that it is okay.
1993 */
1994 if ((vd->vdev_stat.vs_write_errors > 0 ||
1995 vd->vdev_state < VDEV_STATE_HEALTHY) &&
2075
2076 /*
2077 * Preload the next potential metaslabs
2078 */
2079 metaslab_group_preload(mg);
2080}
2081
2082static uint64_t

--- 245 unchanged lines hidden (view full) ---

2328 goto next;
2329
2330 /*
2331 * Avoid writing single-copy data to a failing vdev
2332 * unless the user instructs us that it is okay.
2333 */
2334 if ((vd->vdev_stat.vs_write_errors > 0 ||
2335 vd->vdev_state < VDEV_STATE_HEALTHY) &&
1996 d == 0 && dshift == 3 &&
1997 !(zfs_write_to_degraded && vd->vdev_state ==
1998 VDEV_STATE_DEGRADED)) {
2336 d == 0 && dshift == 3 && vd->vdev_children == 0) {
1999 all_zero = B_FALSE;
2000 goto next;
2001 }
2002
2003 ASSERT(mg->mg_class == mc);
2004
2005 distance = vd->vdev_asize >> dshift;
2006 if (distance <= (1ULL << vd->vdev_ms_shift))

--- 8 unchanged lines hidden (view full) ---

2015 dva, d);
2016 if (offset != -1ULL) {
2017 /*
2018 * If we've just selected this metaslab group,
2019 * figure out whether the corresponding vdev is
2020 * over- or under-used relative to the pool,
2021 * and set an allocation bias to even it out.
2022 */
2337 all_zero = B_FALSE;
2338 goto next;
2339 }
2340
2341 ASSERT(mg->mg_class == mc);
2342
2343 distance = vd->vdev_asize >> dshift;
2344 if (distance <= (1ULL << vd->vdev_ms_shift))

--- 8 unchanged lines hidden (view full) ---

2353 dva, d);
2354 if (offset != -1ULL) {
2355 /*
2356 * If we've just selected this metaslab group,
2357 * figure out whether the corresponding vdev is
2358 * over- or under-used relative to the pool,
2359 * and set an allocation bias to even it out.
2360 */
2023 if (mc->mc_aliquot == 0) {
2361 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
2024 vdev_stat_t *vs = &vd->vdev_stat;
2025 int64_t vu, cu;
2026
2027 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
2028 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
2029
2030 /*
2031 * Calculate how much more or less we should

--- 5 unchanged lines hidden (view full) ---

2037 *
2038 * mg_bias = (20 - 80) * 512K / 100 = -307K
2039 *
2040 * This reduces allocations by 307K for this
2041 * iteration.
2042 */
2043 mg->mg_bias = ((cu - vu) *
2044 (int64_t)mg->mg_aliquot) / 100;
2362 vdev_stat_t *vs = &vd->vdev_stat;
2363 int64_t vu, cu;
2364
2365 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
2366 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
2367
2368 /*
2369 * Calculate how much more or less we should

--- 5 unchanged lines hidden (view full) ---

2375 *
2376 * mg_bias = (20 - 80) * 512K / 100 = -307K
2377 *
2378 * This reduces allocations by 307K for this
2379 * iteration.
2380 */
2381 mg->mg_bias = ((cu - vu) *
2382 (int64_t)mg->mg_aliquot) / 100;
2383 } else if (!metaslab_bias_enabled) {
2384 mg->mg_bias = 0;
2045 }
2046
2047 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
2048 mg->mg_aliquot + mg->mg_bias) {
2049 mc->mc_rotor = mg->mg_next;
2050 mc->mc_aliquot = 0;
2051 }
2052

--- 257 unchanged lines hidden ---
2385 }
2386
2387 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
2388 mg->mg_aliquot + mg->mg_bias) {
2389 mc->mc_rotor = mg->mg_next;
2390 mc->mc_aliquot = 0;
2391 }
2392

--- 257 unchanged lines hidden ---