metaslab.c (268855) | metaslab.c (269118) |
---|---|
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 18 unchanged lines hidden (view full) --- 27#include <sys/zfs_context.h> 28#include <sys/dmu.h> 29#include <sys/dmu_tx.h> 30#include <sys/space_map.h> 31#include <sys/metaslab_impl.h> 32#include <sys/vdev_impl.h> 33#include <sys/zio.h> 34#include <sys/spa_impl.h> | 1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 18 unchanged lines hidden (view full) --- 27#include <sys/zfs_context.h> 28#include <sys/dmu.h> 29#include <sys/dmu_tx.h> 30#include <sys/space_map.h> 31#include <sys/metaslab_impl.h> 32#include <sys/vdev_impl.h> 33#include <sys/zio.h> 34#include <sys/spa_impl.h> |
35#include <sys/zfeature.h> |
|
35 36SYSCTL_DECL(_vfs_zfs); 37SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); 38 39/* 40 * Allow allocations to switch to gang blocks quickly. We do this to 41 * avoid having to load lots of space_maps in a given txg. There are, 42 * however, some cases where we want to avoid "fast" ganging and instead --- 41 unchanged lines hidden (view full) --- 84 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 85 * blocks. 86 */ 87int zfs_metaslab_condense_block_threshold = 4; 88 89/* 90 * The zfs_mg_noalloc_threshold defines which metaslab groups should 91 * be eligible for allocation. The value is defined as a percentage of | 36 37SYSCTL_DECL(_vfs_zfs); 38SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); 39 40/* 41 * Allow allocations to switch to gang blocks quickly. We do this to 42 * avoid having to load lots of space_maps in a given txg. There are, 43 * however, some cases where we want to avoid "fast" ganging and instead --- 41 unchanged lines hidden (view full) --- 85 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 86 * blocks. 87 */ 88int zfs_metaslab_condense_block_threshold = 4; 89 90/* 91 * The zfs_mg_noalloc_threshold defines which metaslab groups should 92 * be eligible for allocation. The value is defined as a percentage of |
92 * a free space. Metaslab groups that have more free space than | 93 * free space. Metaslab groups that have more free space than |
93 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 94 * a metaslab group's free space is less than or equal to the 95 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 96 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 97 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 98 * groups are allowed to accept allocations. Gang blocks are always 99 * eligible to allocate on any metaslab group. The default value of 0 means 100 * no metaslab group will be excluded based on this criterion. 101 */ 102int zfs_mg_noalloc_threshold = 0; 103SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, 104 &zfs_mg_noalloc_threshold, 0, 105 "Percentage of metaslab group size that should be free" 106 " to make it eligible for allocation"); 107 108/* | 94 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 95 * a metaslab group's free space is less than or equal to the 96 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 97 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 98 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 99 * groups are allowed to accept allocations. Gang blocks are always 100 * eligible to allocate on any metaslab group. The default value of 0 means 101 * no metaslab group will be excluded based on this criterion. 102 */ 103int zfs_mg_noalloc_threshold = 0; 104SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, 105 &zfs_mg_noalloc_threshold, 0, 106 "Percentage of metaslab group size that should be free" 107 " to make it eligible for allocation"); 108 109/* |
110 * Metaslab groups are considered eligible for allocations if their 111 * fragmentation metric (measured as a percentage) is less than or equal to 112 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold 113 * then it will be skipped unless all metaslab groups within the metaslab 114 * class have also crossed this threshold. 115 */ 116int zfs_mg_fragmentation_threshold = 85; 117 118/* 119 * Allow metaslabs to keep their active state as long as their fragmentation 120 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 121 * active metaslab that exceeds this threshold will no longer keep its active 122 * status allowing better metaslabs to be selected. 123 */ 124int zfs_metaslab_fragmentation_threshold = 70; 125 126/*
|
109 * When set will load all metaslabs when pool is first opened. 110 */ 111int metaslab_debug_load = 0; 112SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, 113 &metaslab_debug_load, 0, 114 "Load all metaslabs when pool is first opened"); 115 116/* --- 51 unchanged lines hidden (view full) --- 168 * keep it loaded. 169 */ 170int metaslab_unload_delay = TXG_SIZE * 2; 171SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, 172 &metaslab_unload_delay, 0, 173 "Number of TXGs that an unused metaslab can be kept in memory"); 174 175/* | 127 * When set will load all metaslabs when pool is first opened. 128 */ 129int metaslab_debug_load = 0; 130SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, 131 &metaslab_debug_load, 0, 132 "Load all metaslabs when pool is first opened"); 133 134/* --- 51 unchanged lines hidden (view full) --- 186 * keep it loaded. 187 */ 188int metaslab_unload_delay = TXG_SIZE * 2; 189SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, 190 &metaslab_unload_delay, 0, 191 "Number of TXGs that an unused metaslab can be kept in memory"); 192 193/* |
176 * Should we be willing to write data to degraded vdevs? 177 */ 178boolean_t zfs_write_to_degraded = B_FALSE; 179SYSCTL_INT(_vfs_zfs, OID_AUTO, write_to_degraded, CTLFLAG_RWTUN, 180 &zfs_write_to_degraded, 0, "Allow writing data to degraded vdevs"); 181 182/* | |
183 * Max number of metaslabs per group to preload. 184 */ 185int metaslab_preload_limit = SPA_DVAS_PER_BP; 186SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, 187 &metaslab_preload_limit, 0, 188 "Max number of metaslabs per group to preload"); 189 190/* 191 * Enable/disable preloading of metaslabs. 192 */ 193boolean_t metaslab_preload_enabled = B_TRUE; 194SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, 195 &metaslab_preload_enabled, 0, 196 "Enable/disable preloading of metaslabs"); 197 198/* | 194 * Max number of metaslabs per group to preload. 195 */ 196int metaslab_preload_limit = SPA_DVAS_PER_BP; 197SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, 198 &metaslab_preload_limit, 0, 199 "Max number of metaslabs per group to preload"); 200 201/* 202 * Enable/disable preloading of metaslabs. 203 */ 204boolean_t metaslab_preload_enabled = B_TRUE; 205SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, 206 &metaslab_preload_enabled, 0, 207 "Enable/disable preloading of metaslabs"); 208 209/*
199 * Enable/disable additional weight factor for each metaslab. | 210 * Enable/disable fragmentation weighting on metaslabs. |
200 */ | 211 */ |
201boolean_t metaslab_weight_factor_enable = B_FALSE; 202SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, weight_factor_enable, CTLFLAG_RWTUN, 203 &metaslab_weight_factor_enable, 0, 204 "Enable additional weight factor for each metaslab"); | 212boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 213SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, 214 &metaslab_fragmentation_factor_enabled, 0, 215 "Enable fragmentation weighting on metaslabs"); |
205 | 216 |
217/* 218 * Enable/disable lba weighting (i.e. outer tracks are given preference). 219 */ 220boolean_t metaslab_lba_weighting_enabled = B_TRUE; 221SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, 222 &metaslab_lba_weighting_enabled, 0, 223 "Enable LBA weighting (i.e. outer tracks are given preference)"); |
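The weighting tunables introduced above feed into metaslab_weight() later in this diff. A minimal standalone sketch (all numbers hypothetical, not code from this file) of how the fragmentation and LBA factors scale a metaslab's weight:

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* hypothetical metaslab: 10G free, 30% fragmented */
	uint64_t space = 10ULL << 30;
	uint64_t frag = 30;
	uint64_t ms_id = 50, ms_count = 200;	/* middle of the vdev */
	uint64_t weight;

	/* fragmentation factor: scale free space down by (frag - 1)% */
	weight = (space * (100 - (frag - 1))) / 100;

	/* LBA weighting: 2x at the outer tracks down to 1x at the inner */
	weight = 2 * weight - (ms_id * weight) / ms_count;

	printf("weight %llu from %llu bytes free\n",
	    (unsigned long long)weight, (unsigned long long)space);
	return (0);
}
```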
|
206 207/* | 224 225/* |
226 * Enable/disable metaslab group biasing. 227 */ 228boolean_t metaslab_bias_enabled = B_TRUE; 229SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, 230 &metaslab_bias_enabled, 0, 231 "Enable metaslab group biasing"); 232 233static uint64_t metaslab_fragmentation(metaslab_t *); 234 235/* |
|
208 * ========================================================================== 209 * Metaslab classes 210 * ========================================================================== 211 */ 212metaslab_class_t * 213metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 214{ 215 metaslab_class_t *mc; --- 101 unchanged lines hidden (view full) --- 317} 318 319uint64_t 320metaslab_class_get_minblocksize(metaslab_class_t *mc) 321{ 322 return (mc->mc_minblocksize); 323} 324 | 236 * ========================================================================== 237 * Metaslab classes 238 * ========================================================================== 239 */ 240metaslab_class_t * 241metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 242{ 243 metaslab_class_t *mc; --- 101 unchanged lines hidden (view full) --- 345} 346 347uint64_t 348metaslab_class_get_minblocksize(metaslab_class_t *mc) 349{ 350 return (mc->mc_minblocksize); 351} 352 |
353 void 354 metaslab_class_histogram_verify(metaslab_class_t *mc) 355 { 356 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 357 uint64_t *mc_hist; 358 int i; 359 360 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 361 return; 362 363 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 364 KM_SLEEP); 365 366 for (int c = 0; c < rvd->vdev_children; c++) { 367 vdev_t *tvd = rvd->vdev_child[c]; 368 metaslab_group_t *mg = tvd->vdev_mg; 369 370 /* 371 * Skip any holes, uninitialized top-levels, or 372 * vdevs that are not in this metaslab class. 373 */ 374 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 375 mg->mg_class != mc) { 376 continue; 377 } 378 379 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 380 mc_hist[i] += mg->mg_histogram[i]; 381 } 382 383 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 384 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 385 386 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 387} 388
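metaslab_class_fragmentation() below computes a space-weighted average of the per-group fragmentation values; a minimal standalone sketch of that weighting (group sizes and percentages are hypothetical):

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* hypothetical per-group fragmentation (percent) and sizes */
	uint64_t frag[2]  = { 10, 70 };
	uint64_t space[2] = { 1ULL << 40, 3ULL << 40 };	/* 1T and 3T */
	uint64_t weighted = 0, total = 0;

	for (int g = 0; g < 2; g++) {
		weighted += frag[g] * space[g];
		total += space[g];
	}
	/* (10*1 + 70*3) / 4 = 55, so the larger group dominates */
	printf("class fragmentation = %llu%%\n",
	    (unsigned long long)(weighted / total));
	return (0);
}
```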
|
325/* | 389/* |
390 * Calculate the metaslab class's fragmentation metric. The metric 391 * is weighted based on the space contribution of each metaslab group. 392 * The return value will be a number between 0 and 100 (inclusive), or 393 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 394 * zfs_frag_table for more information about the metric. 395 */ 396uint64_t 397metaslab_class_fragmentation(metaslab_class_t *mc) 398{ 399 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 400 uint64_t fragmentation = 0; 401 402 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 403 404 for (int c = 0; c < rvd->vdev_children; c++) { 405 vdev_t *tvd = rvd->vdev_child[c]; 406 metaslab_group_t *mg = tvd->vdev_mg; 407 408 /* 409 * Skip any holes, uninitialized top-levels, or 410 * vdevs that are not in this metaslab class. 411 */ 412 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 413 mg->mg_class != mc) { 414 continue; 415 } 416 417 /* 418 * If a metaslab group does not contain a fragmentation 419 * metric then just bail out. 420 */ 421 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 422 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 423 return (ZFS_FRAG_INVALID); 424 } 425 426 /* 427 * Determine how much this metaslab_group is contributing 428 * to the overall pool fragmentation metric. 429 */ 430 fragmentation += mg->mg_fragmentation * 431 metaslab_group_get_space(mg); 432 } 433 fragmentation /= metaslab_class_get_space(mc); 434 435 ASSERT3U(fragmentation, <=, 100); 436 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 437 return (fragmentation); 438} 439 440/* 441 * Calculate the amount of expandable space that is available in 442 * this metaslab class. If a device is expanded then its expandable 443 * space will be the amount of allocatable space that is currently not 444 * part of this metaslab class. 445 */ 446uint64_t 447metaslab_class_expandable_space(metaslab_class_t *mc) 448{ 449 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 450 uint64_t space = 0; 451 452 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 453 for (int c = 0; c < rvd->vdev_children; c++) { 454 vdev_t *tvd = rvd->vdev_child[c]; 455 metaslab_group_t *mg = tvd->vdev_mg; 456 457 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 458 mg->mg_class != mc) { 459 continue; 460 } 461 462 space += tvd->vdev_max_asize - tvd->vdev_asize; 463 } 464 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 465 return (space); 466} 467 468/*
|
326 * ========================================================================== 327 * Metaslab groups 328 * ========================================================================== 329 */ 330static int 331metaslab_compare(const void *x1, const void *x2) 332{ 333 const metaslab_t *m1 = x1; --- 35 unchanged lines hidden (view full) --- 369 ASSERT(vd == vd->vdev_top); 370 371 mutex_enter(&mg->mg_lock); 372 was_allocatable = mg->mg_allocatable; 373 374 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 375 (vs->vs_space + 1); 376 | 469 * ========================================================================== 470 * Metaslab groups 471 * ========================================================================== 472 */ 473static int 474metaslab_compare(const void *x1, const void *x2) 475{ 476 const metaslab_t *m1 = x1; --- 35 unchanged lines hidden (view full) --- 512 ASSERT(vd == vd->vdev_top); 513 514 mutex_enter(&mg->mg_lock); 515 was_allocatable = mg->mg_allocatable; 516 517 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 518 (vs->vs_space + 1); 519 |
377 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold); | 520 /* 521 * A metaslab group is considered allocatable if it has plenty 522 * of free space or is not heavily fragmented. We only take 523 * fragmentation into account if the metaslab group has a valid 524 * fragmentation metric (i.e. a value between 0 and 100). 525 */ 526 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && 527 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 528 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); |
378 379 /* 380 * The mc_alloc_groups maintains a count of the number of 381 * groups in this metaslab class that are still above the 382 * zfs_mg_noalloc_threshold. This is used by the allocating 383 * threads to determine if they should avoid allocations to 384 * a given group. The allocator will avoid allocations to a group 385 * if that group has reached or is below the zfs_mg_noalloc_threshold --- 4 unchanged lines hidden (view full) --- 390 * groups have reached the zfs_mg_noalloc_threshold making all groups 391 * eligible for allocations. This effectively means that all devices 392 * are balanced again. 393 */ 394 if (was_allocatable && !mg->mg_allocatable) 395 mc->mc_alloc_groups--; 396 else if (!was_allocatable && mg->mg_allocatable) 397 mc->mc_alloc_groups++; | 529 530 /* 531 * The mc_alloc_groups maintains a count of the number of 532 * groups in this metaslab class that are still above the 533 * zfs_mg_noalloc_threshold. This is used by the allocating 534 * threads to determine if they should avoid allocations to 535 * a given group. The allocator will avoid allocations to a group 536 * if that group has reached or is below the zfs_mg_noalloc_threshold --- 4 unchanged lines hidden (view full) --- 541 * groups have reached the zfs_mg_noalloc_threshold making all groups 542 * eligible for allocations. This effectively means that all devices 543 * are balanced again. 544 */ 545 if (was_allocatable && !mg->mg_allocatable) 546 mc->mc_alloc_groups--; 547 else if (!was_allocatable && mg->mg_allocatable) 548 mc->mc_alloc_groups++; |
549 |
|
398 mutex_exit(&mg->mg_lock); 399} 400 401metaslab_group_t * 402metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 403{ 404 metaslab_group_t *mg; 405 --- 74 unchanged lines hidden (view full) --- 480 ASSERT(mc->mc_rotor != mg); 481 ASSERT(mg->mg_prev == NULL); 482 ASSERT(mg->mg_next == NULL); 483 ASSERT(mg->mg_activation_count < 0); 484 return; 485 } 486 487 taskq_wait(mg->mg_taskq); | 550 mutex_exit(&mg->mg_lock); 551} 552 553metaslab_group_t * 554metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 555{ 556 metaslab_group_t *mg; 557 --- 74 unchanged lines hidden (view full) --- 632 ASSERT(mc->mc_rotor != mg); 633 ASSERT(mg->mg_prev == NULL); 634 ASSERT(mg->mg_next == NULL); 635 ASSERT(mg->mg_activation_count < 0); 636 return; 637 } 638 639 taskq_wait(mg->mg_taskq); |
640 metaslab_group_alloc_update(mg); |
|
488 489 mgprev = mg->mg_prev; 490 mgnext = mg->mg_next; 491 492 if (mg == mgnext) { 493 mc->mc_rotor = NULL; 494 } else { 495 mc->mc_rotor = mgnext; 496 mgprev->mg_next = mgnext; 497 mgnext->mg_prev = mgprev; 498 } 499 500 mg->mg_prev = NULL; 501 mg->mg_next = NULL; 502 metaslab_class_minblocksize_update(mc); 503} 504 | 641 642 mgprev = mg->mg_prev; 643 mgnext = mg->mg_next; 644 645 if (mg == mgnext) { 646 mc->mc_rotor = NULL; 647 } else { 648 mc->mc_rotor = mgnext; 649 mgprev->mg_next = mgnext; 650 mgnext->mg_prev = mgprev; 651 } 652 653 mg->mg_prev = NULL; 654 mg->mg_next = NULL; 655 metaslab_class_minblocksize_update(mc); 656} 657 |
658uint64_t 659metaslab_group_get_space(metaslab_group_t *mg) 660{ 661 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 662} 663 664void 665metaslab_group_histogram_verify(metaslab_group_t *mg) 666{ 667 uint64_t *mg_hist; 668 vdev_t *vd = mg->mg_vd; 669 uint64_t ashift = vd->vdev_ashift; 670 int i; 671 672 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 673 return; 674 675 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 676 KM_SLEEP); 677 678 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 679 SPACE_MAP_HISTOGRAM_SIZE + ashift); 680 681 for (int m = 0; m < vd->vdev_ms_count; m++) { 682 metaslab_t *msp = vd->vdev_ms[m]; 683 684 if (msp->ms_sm == NULL) 685 continue; 686 687 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 688 mg_hist[i + ashift] += 689 msp->ms_sm->sm_phys->smp_histogram[i]; 690 } 691 692 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 693 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 694 695 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 696} 697 |
|
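The histogram verify/add/remove routines here all rely on one index mapping: space map bucket i counts free segments of about 2^(sm_shift + i) bytes and lands in the group and class histograms at index i + ashift. A standalone sketch of that mapping, assuming a typical ashift of 9:

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ashift = 9;	/* 512-byte sectors, hypothetical */

	for (int i = 0; i < 4; i++) {
		printf("sm bucket %d -> mg_histogram[%d], "
		    "segments of ~%llu bytes\n",
		    i, (int)(i + ashift),
		    (unsigned long long)(1ULL << (i + ashift)));
	}
	return (0);
}
```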
505static void | 698static void |
506metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) | 699metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) |
507{ | 700{ |
701 metaslab_class_t *mc = mg->mg_class; 702 uint64_t ashift = mg->mg_vd->vdev_ashift; 703 704 ASSERT(MUTEX_HELD(&msp->ms_lock)); 705 if (msp->ms_sm == NULL) 706 return; 707 |
|
508 mutex_enter(&mg->mg_lock); | 708 mutex_enter(&mg->mg_lock); |
709 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 710 mg->mg_histogram[i + ashift] += 711 msp->ms_sm->sm_phys->smp_histogram[i]; 712 mc->mc_histogram[i + ashift] += 713 msp->ms_sm->sm_phys->smp_histogram[i]; 714 } 715 mutex_exit(&mg->mg_lock); 716} 717 718void 719metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 720{ 721 metaslab_class_t *mc = mg->mg_class; 722 uint64_t ashift = mg->mg_vd->vdev_ashift; 723 724 ASSERT(MUTEX_HELD(&msp->ms_lock)); 725 if (msp->ms_sm == NULL) 726 return; 727 728 mutex_enter(&mg->mg_lock); 729 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 730 ASSERT3U(mg->mg_histogram[i + ashift], >=, 731 msp->ms_sm->sm_phys->smp_histogram[i]); 732 ASSERT3U(mc->mc_histogram[i + ashift], >=, 733 msp->ms_sm->sm_phys->smp_histogram[i]); 734 735 mg->mg_histogram[i + ashift] -= 736 msp->ms_sm->sm_phys->smp_histogram[i]; 737 mc->mc_histogram[i + ashift] -= 738 msp->ms_sm->sm_phys->smp_histogram[i]; 739 } 740 mutex_exit(&mg->mg_lock); 741} 742 743static void 744metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 745{ |
|
509 ASSERT(msp->ms_group == NULL); | 746 ASSERT(msp->ms_group == NULL); |
747 mutex_enter(&mg->mg_lock); |
|
510 msp->ms_group = mg; 511 msp->ms_weight = 0; 512 avl_add(&mg->mg_metaslab_tree, msp); 513 mutex_exit(&mg->mg_lock); | 748 msp->ms_group = mg; 749 msp->ms_weight = 0; 750 avl_add(&mg->mg_metaslab_tree, msp); 751 mutex_exit(&mg->mg_lock); |
752 753 mutex_enter(&msp->ms_lock); 754 metaslab_group_histogram_add(mg, msp); 755 mutex_exit(&msp->ms_lock); |
|
514} 515 516static void 517metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 518{ | 756} 757 758static void 759metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 760{ |
761 mutex_enter(&msp->ms_lock); 762 metaslab_group_histogram_remove(mg, msp); 763 mutex_exit(&msp->ms_lock); 764 |
|
519 mutex_enter(&mg->mg_lock); 520 ASSERT(msp->ms_group == mg); 521 avl_remove(&mg->mg_metaslab_tree, msp); 522 msp->ms_group = NULL; 523 mutex_exit(&mg->mg_lock); 524} 525 526static void 527metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 528{ 529 /* 530 * Although in principle the weight can be any value, in | 765 mutex_enter(&mg->mg_lock); 766 ASSERT(msp->ms_group == mg); 767 avl_remove(&mg->mg_metaslab_tree, msp); 768 msp->ms_group = NULL; 769 mutex_exit(&mg->mg_lock); 770} 771 772static void 773metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 774{ 775 /* 776 * Although in principle the weight can be any value, in |
531 * practice we do not use values in the range [1, 510]. | 777 * practice we do not use values in the range [1, 511]. |
532 */ | 778 */ |
533 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); | 779 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); |
534 ASSERT(MUTEX_HELD(&msp->ms_lock)); 535 536 mutex_enter(&mg->mg_lock); 537 ASSERT(msp->ms_group == mg); 538 avl_remove(&mg->mg_metaslab_tree, msp); 539 msp->ms_weight = weight; 540 avl_add(&mg->mg_metaslab_tree, msp); 541 mutex_exit(&mg->mg_lock); 542} 543 544/* | 780 ASSERT(MUTEX_HELD(&msp->ms_lock)); 781 782 mutex_enter(&mg->mg_lock); 783 ASSERT(msp->ms_group == mg); 784 avl_remove(&mg->mg_metaslab_tree, msp); 785 msp->ms_weight = weight; 786 avl_add(&mg->mg_metaslab_tree, msp); 787 mutex_exit(&mg->mg_lock); 788} 789 790/* |
791 * Calculate the fragmentation for a given metaslab group. We can use 792 * a simple average here since all metaslabs within the group must have 793 * the same size. The return value will be a value between 0 and 100 794 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this 795 * group have a fragmentation metric. 796 */ 797uint64_t 798metaslab_group_fragmentation(metaslab_group_t *mg) 799{ 800 vdev_t *vd = mg->mg_vd; 801 uint64_t fragmentation = 0; 802 uint64_t valid_ms = 0; 803 804 for (int m = 0; m < vd->vdev_ms_count; m++) { 805 metaslab_t *msp = vd->vdev_ms[m]; 806 807 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 808 continue; 809 810 valid_ms++; 811 fragmentation += msp->ms_fragmentation; 812 } 813 814 if (valid_ms <= vd->vdev_ms_count / 2) 815 return (ZFS_FRAG_INVALID); 816 817 fragmentation /= valid_ms; 818 ASSERT3U(fragmentation, <=, 100); 819 return (fragmentation); 820} 821 822/*
|
545 * Determine if a given metaslab group should skip allocations. A metaslab | 823 * Determine if a given metaslab group should skip allocations. A metaslab |
546 * group should avoid allocations if its used capacity has crossed the 547 * zfs_mg_noalloc_threshold and there is at least one metaslab group | 824 * group should avoid allocations if its free capacity is less than the 825 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 826 * zfs_mg_fragmentation_threshold and there is at least one metaslab group |
548 * that can still handle allocations. 549 */ 550static boolean_t 551metaslab_group_allocatable(metaslab_group_t *mg) 552{ 553 vdev_t *vd = mg->mg_vd; 554 spa_t *spa = vd->vdev_spa; 555 metaslab_class_t *mc = mg->mg_class; 556 557 /* | 827 * that can still handle allocations. 828 */ 829static boolean_t 830metaslab_group_allocatable(metaslab_group_t *mg) 831{ 832 vdev_t *vd = mg->mg_vd; 833 spa_t *spa = vd->vdev_spa; 834 metaslab_class_t *mc = mg->mg_class; 835 836 /* |
558 * A metaslab group is considered allocatable if its free capacity 559 * is greater than the set value of zfs_mg_noalloc_threshold, it's 560 * associated with a slog, or there are no other metaslab groups 561 * with free capacity greater than zfs_mg_noalloc_threshold. | 837 * We use two key metrics to determine if a metaslab group is 838 * considered allocatable -- free space and fragmentation. If 839 * the free space is greater than the free space threshold and 840 * the fragmentation is less than the fragmentation threshold then 841 * consider the group allocatable. There are two cases when we will 842 * not consider these key metrics. The first is if the group is 843 * associated with a slog device and the second is if all groups 844 * in this metaslab class have already been considered ineligible 845 * for allocations.
562 */ | 846 */ |
563 return (mg->mg_free_capacity > zfs_mg_noalloc_threshold || | 847 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && 848 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 849 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || |
564 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 565} 566 567/* 568 * ========================================================================== 569 * Range tree callbacks 570 * ========================================================================== 571 */ --- 207 unchanged lines hidden (view full) --- 779 */ 780 uint64_t align = size & -size; 781 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 782 avl_tree_t *t = &msp->ms_tree->rt_root; 783 784 return (metaslab_block_picker(t, cursor, size, align)); 785} 786 | 850 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 851} 852 853/* 854 * ========================================================================== 855 * Range tree callbacks 856 * ========================================================================== 857 */ --- 207 unchanged lines hidden (view full) --- 1065 */ 1066 uint64_t align = size & -size; 1067 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1068 avl_tree_t *t = &msp->ms_tree->rt_root; 1069 1070 return (metaslab_block_picker(t, cursor, size, align)); 1071} 1072 |
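The first-fit allocator above selects its cursor with size & -size, which isolates the lowest set bit of the size, i.e. the largest power-of-two alignment that evenly divides it. A standalone sketch (sizes are arbitrary examples):

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t sizes[] = { 512, 4096, 12288, 131072 };

	for (int i = 0; i < 4; i++) {
		/* lowest set bit == largest power-of-2 divisor */
		uint64_t align = sizes[i] & -sizes[i];

		printf("size %7llu -> align %7llu\n",
		    (unsigned long long)sizes[i],
		    (unsigned long long)align);
	}
	return (0);
}
```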
787/* ARGSUSED */ 788static boolean_t 789metaslab_ff_fragmented(metaslab_t *msp) 790{ 791 return (B_TRUE); 792} 793 | |
794static metaslab_ops_t metaslab_ff_ops = { | 1073static metaslab_ops_t metaslab_ff_ops = { |
795 metaslab_ff_alloc, 796 metaslab_ff_fragmented | 1074 metaslab_ff_alloc |
797}; 798 799/* 800 * ========================================================================== 801 * Dynamic block allocator - 802 * Uses the first fit allocation scheme until space get low and then 803 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 804 * and metaslab_df_free_pct to determine when to switch the allocation scheme. --- 30 unchanged lines hidden (view full) --- 835 free_pct < metaslab_df_free_pct) { 836 t = &msp->ms_size_tree; 837 *cursor = 0; 838 } 839 840 return (metaslab_block_picker(t, cursor, size, 1ULL)); 841} 842 | 1075}; 1076 1077/* 1078 * ========================================================================== 1079 * Dynamic block allocator - 1080 * Uses the first fit allocation scheme until space get low and then 1081 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1082 * and metaslab_df_free_pct to determine when to switch the allocation scheme. --- 30 unchanged lines hidden (view full) --- 1113 free_pct < metaslab_df_free_pct) { 1114 t = &msp->ms_size_tree; 1115 *cursor = 0; 1116 } 1117 1118 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1119} 1120 |
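The dynamic-fit allocator above stays on the cheap first-fit cursor until the metaslab runs low, then falls back to best-fit through the size tree. A standalone sketch of that switch (threshold values are placeholders, not the shipped defaults):

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ms_size = 1ULL << 34;		/* 16G metaslab */
	uint64_t free_space = 1ULL << 29;	/* 512M still free */
	uint64_t max_seg = 64 * 1024;		/* largest free segment */
	uint64_t alloc_threshold = 128 * 1024;	/* hypothetical tunable */
	int min_free_pct = 4;			/* hypothetical tunable */

	int free_pct = (int)(free_space * 100 / ms_size);
	int best_fit = (max_seg < alloc_threshold ||
	    free_pct < min_free_pct);

	printf("free %d%%, max segment %llu -> use %s\n", free_pct,
	    (unsigned long long)max_seg,
	    best_fit ? "best-fit (size tree)" : "first-fit (cursor)");
	return (0);
}
```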
843static boolean_t 844metaslab_df_fragmented(metaslab_t *msp) 845{ 846 range_tree_t *rt = msp->ms_tree; 847 uint64_t max_size = metaslab_block_maxsize(msp); 848 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 849 850 if (max_size >= metaslab_df_alloc_threshold && 851 free_pct >= metaslab_df_free_pct) 852 return (B_FALSE); 853 854 return (B_TRUE); 855} 856 | |
857static metaslab_ops_t metaslab_df_ops = { | 1121static metaslab_ops_t metaslab_df_ops = { |
858 metaslab_df_alloc, 859 metaslab_df_fragmented | 1122 metaslab_df_alloc |
860}; 861 862/* 863 * ========================================================================== 864 * Cursor fit block allocator - 865 * Select the largest region in the metaslab, set the cursor to the beginning 866 * of the range and the cursor_end to the end of the range. As allocations 867 * are made advance the cursor. Continue allocating from the cursor until --- 26 unchanged lines hidden (view full) --- 894 } 895 896 offset = *cursor; 897 *cursor += size; 898 899 return (offset); 900} 901 | 1123}; 1124 1125/* 1126 * ========================================================================== 1127 * Cursor fit block allocator - 1128 * Select the largest region in the metaslab, set the cursor to the beginning 1129 * of the range and the cursor_end to the end of the range. As allocations 1130 * are made advance the cursor. Continue allocating from the cursor until --- 26 unchanged lines hidden (view full) --- 1157 } 1158 1159 offset = *cursor; 1160 *cursor += size; 1161 1162 return (offset); 1163} 1164 |
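The cursor-fit allocator above claims the largest free region once and then just advances a cursor through it; a minimal standalone illustration (offsets made up):

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t cursor = 0x100000;	/* start of claimed region */
	uint64_t cursor_end = 0x180000;	/* end of the 512K region */
	uint64_t size = 128 * 1024;

	/* carve the region into consecutive allocations */
	while (cursor + size <= cursor_end) {
		printf("alloc %llu bytes at 0x%llx\n",
		    (unsigned long long)size,
		    (unsigned long long)cursor);
		cursor += size;
	}
	/* a further request would re-seed from the size tree */
	return (0);
}
```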
902static boolean_t 903metaslab_cf_fragmented(metaslab_t *msp) 904{ 905 return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size); 906} 907 | |
908static metaslab_ops_t metaslab_cf_ops = { | 1165static metaslab_ops_t metaslab_cf_ops = { |
909 metaslab_cf_alloc, 910 metaslab_cf_fragmented | 1166 metaslab_cf_alloc |
911}; 912 913/* 914 * ========================================================================== 915 * New dynamic fit allocator - 916 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 917 * contiguous blocks. If no region is found then just use the largest segment 918 * that remains. --- 40 unchanged lines hidden (view full) --- 959 960 if ((rs->rs_end - rs->rs_start) >= size) { 961 *cursor = rs->rs_start + size; 962 return (rs->rs_start); 963 } 964 return (-1ULL); 965} 966 | 1167}; 1168 1169/* 1170 * ========================================================================== 1171 * New dynamic fit allocator - 1172 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1173 * contiguous blocks. If no region is found then just use the largest segment 1174 * that remains. --- 40 unchanged lines hidden (view full) --- 1215 1216 if ((rs->rs_end - rs->rs_start) >= size) { 1217 *cursor = rs->rs_start + size; 1218 return (rs->rs_start); 1219 } 1220 return (-1ULL); 1221} 1222 |
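The new-dynamic-fit allocator above searches for a clump big enough for 2^metaslab_ndf_clump_shift blocks of the requested size; a standalone sketch of the sizing arithmetic (the clump shift value is hypothetical, and highbit64_sketch() is a simplified stand-in for the kernel's highbit64()):

```c
#include <stdint.h>
#include <stdio.h>

/* simplified stand-in for highbit64(): position of the MSB, 1-based */
static int
highbit64_sketch(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t size = 8192;		/* requested block */
	int clump_shift = 4;		/* hypothetical tunable value */
	int hbit = highbit64_sketch(size);	/* 14 for 8K */

	printf("8K request -> search for a %llu-byte region\n",
	    (unsigned long long)(1ULL << (hbit + clump_shift)));
	return (0);
}
```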
967static boolean_t 968metaslab_ndf_fragmented(metaslab_t *msp) 969{ 970 return (metaslab_block_maxsize(msp) <= 971 (metaslab_min_alloc_size << metaslab_ndf_clump_shift)); 972} 973 | |
974static metaslab_ops_t metaslab_ndf_ops = { | 1223static metaslab_ops_t metaslab_ndf_ops = { |
975 metaslab_ndf_alloc, 976 metaslab_ndf_fragmented | 1224 metaslab_ndf_alloc |
977}; 978 979metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 980 981/* 982 * ========================================================================== 983 * Metaslabs 984 * ========================================================================== --- 85 unchanged lines hidden (view full) --- 1070 * alloctree and freetree until metaslab_sync_done(). This serves 1071 * two purposes: it allows metaslab_sync_done() to detect the 1072 * addition of new space; and for debugging, it ensures that we'd 1073 * data fault on any attempt to use this metaslab before it's ready. 1074 */ 1075 msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock); 1076 metaslab_group_add(mg, msp); 1077 | 1225}; 1226 1227metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1228 1229/* 1230 * ========================================================================== 1231 * Metaslabs 1232 * ========================================================================== --- 85 unchanged lines hidden (view full) --- 1318 * alloctree and freetree until metaslab_sync_done(). This serves 1319 * two purposes: it allows metaslab_sync_done() to detect the 1320 * addition of new space; and for debugging, it ensures that we'd 1321 * data fault on any attempt to use this metaslab before it's ready. 1322 */ 1323 msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock); 1324 metaslab_group_add(mg, msp); 1325 |
1326 msp->ms_fragmentation = metaslab_fragmentation(msp); |
|
1078 msp->ms_ops = mg->mg_class->mc_ops; 1079 1080 /* 1081 * If we're opening an existing pool (txg == 0) or creating 1082 * a new one (txg == TXG_INITIAL), all space is available now. 1083 * If we're adding space to an existing pool, the new space 1084 * does not become available until after this txg has synced. 1085 */ --- 49 unchanged lines hidden (view full) --- 1135 1136 mutex_exit(&msp->ms_lock); 1137 cv_destroy(&msp->ms_load_cv); 1138 mutex_destroy(&msp->ms_lock); 1139 1140 kmem_free(msp, sizeof (metaslab_t)); 1141} 1142 | 1327 msp->ms_ops = mg->mg_class->mc_ops; 1328 1329 /* 1330 * If we're opening an existing pool (txg == 0) or creating 1331 * a new one (txg == TXG_INITIAL), all space is available now. 1332 * If we're adding space to an existing pool, the new space 1333 * does not become available until after this txg has synced. 1334 */ --- 49 unchanged lines hidden (view full) --- 1384 1385 mutex_exit(&msp->ms_lock); 1386 cv_destroy(&msp->ms_load_cv); 1387 mutex_destroy(&msp->ms_lock); 1388 1389 kmem_free(msp, sizeof (metaslab_t)); 1390} 1391 |
1392#define FRAGMENTATION_TABLE_SIZE 17 1393 |
|
1143/* | 1394/* |
1144 * Apply a weighting factor based on the histogram information for this 1145 * metaslab. The current weighting factor is somewhat arbitrary and requires 1146 * additional investigation. The implementation provides a measure of 1147 * "weighted" free space and gives a higher weighting for larger contiguous 1148 * regions. The weighting factor is determined by counting the number of 1149 * sm_shift sectors that exist in each region represented by the histogram. 1150 * That value is then multiplied by the power of 2 exponent and the sm_shift 1151 * value. | 1395 * This table defines a segment size based fragmentation metric that will 1396 * allow each metaslab to derive its own fragmentation value. This is done 1397 * by calculating the space in each bucket of the spacemap histogram and 1398 * multiplying that by the fragmentation metric in this table. Doing 1399 * this for all buckets and dividing it by the total amount of free 1400 * space in this metaslab (i.e. the total free space in all buckets) gives 1401 * us the fragmentation metric. This means that a high fragmentation metric 1402 * equates to most of the free space being comprised of small segments. 1403 * Conversely, if the metric is low, then most of the free space is in 1404 * large segments. A 10% change in fragmentation equates to approximately 1405 * double the number of segments.
1152 * | 1406 * |
1153 * For example, assume the 2^21 histogram bucket has 4 2MB regions and the 1154 * metaslab has an sm_shift value of 9 (512B): 1155 * 1156 * 1) calculate the number of sm_shift sectors in the region: 1157 * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384 1158 * 2) multiply by the power of 2 exponent and the sm_shift value: 1159 * 16384 * 21 * 9 = 3096576 1160 * This value will be added to the weighting of the metaslab. | 1407 * This table defines 0% fragmented space using 16MB segments. Testing has 1408 * shown that segments that are greater than or equal to 16MB do not suffer 1409 * from drastic performance problems. Using this value, we derive the rest 1410 * of the table. Since the fragmentation value is never stored on disk, it 1411 * is possible to change these calculations in the future. |
1161 */ | 1412 */ |
1413int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1414 100, /* 512B */ 1415 100, /* 1K */ 1416 98, /* 2K */ 1417 95, /* 4K */ 1418 90, /* 8K */ 1419 80, /* 16K */ 1420 70, /* 32K */ 1421 60, /* 64K */ 1422 50, /* 128K */ 1423 40, /* 256K */ 1424 30, /* 512K */ 1425 20, /* 1M */ 1426 15, /* 2M */ 1427 10, /* 4M */ 1428 5, /* 8M */ 1429 0 /* 16M */ 1430}; 1431 1432/* 1433 * Calculate the metaslab's fragmentation metric. A return value 1434 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does 1435 * not support this metric. Otherwise, the return value should be in the 1436 * range [0, 100]. 1437 */
|
1162static uint64_t | 1438static uint64_t |
1163metaslab_weight_factor(metaslab_t *msp) | 1439metaslab_fragmentation(metaslab_t *msp) |
1164{ | 1440{ |
1165 uint64_t factor = 0; 1166 uint64_t sectors; 1167 int i; | 1441 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1442 uint64_t fragmentation = 0; 1443 uint64_t total = 0; 1444 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1445 SPA_FEATURE_SPACEMAP_HISTOGRAM); |
1168 | 1446 |
1447 if (!feature_enabled) 1448 return (ZFS_FRAG_INVALID); 1449 |
|
1169 /* | 1450 /* |
1170 * A null space map means that the entire metaslab is free, 1171 * calculate a weight factor that spans the entire size of the 1172 * metaslab. | 1451 * A null space map means that the entire metaslab is free 1452 * and thus is not fragmented. |
1173 */ | 1453 */ |
1174 if (msp->ms_sm == NULL) { | 1454 if (msp->ms_sm == NULL) 1455 return (0); 1456 1457 /* 1458 * If this metaslab's space_map has not been upgraded, flag it 1459 * so that we upgrade next time we encounter it. 1460 */ 1461 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1462 uint64_t txg = spa_syncing_txg(spa); |
1175 vdev_t *vd = msp->ms_group->mg_vd; 1176 | 1463 vdev_t *vd = msp->ms_group->mg_vd; 1464 |
1177 i = highbit64(msp->ms_size) - 1; 1178 sectors = msp->ms_size >> vd->vdev_ashift; 1179 return (sectors * i * vd->vdev_ashift); | 1465 msp->ms_condense_wanted = B_TRUE; 1466 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1467 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1468 "msp %p, vd %p", txg, msp, vd); 1469 return (ZFS_FRAG_INVALID); |
1180 } 1181 | 1470 } 1471 |
1182 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) 1183 return (0); | 1472 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1473 uint64_t space = 0; 1474 uint8_t shift = msp->ms_sm->sm_shift; 1475 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1476 FRAGMENTATION_TABLE_SIZE - 1); |
1184 | 1477 |
1185 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) { | |
1186 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1187 continue; 1188 | 1478 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1479 continue; 1480 |
1189 /* 1190 * Determine the number of sm_shift sectors in the region 1191 * indicated by the histogram. For example, given an 1192 * sm_shift value of 9 (512 bytes) and i = 4 then we know 1193 * that we're looking at an 8K region in the histogram 1194 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the 1195 * number of sm_shift sectors (512 bytes in this example), 1196 * we would take 8192 / 512 = 16. Since the histogram 1197 * is offset by sm_shift we can simply use the value of 1198 * of i to calculate this (i.e. 2^i = 16 where i = 4). 1199 */ 1200 sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i; 1201 factor += (i + msp->ms_sm->sm_shift) * sectors; | 1481 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1482 total += space; 1483 1484 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1485 fragmentation += space * zfs_frag_table[idx]; |
1202 } | 1486 } |
1203 return (factor * msp->ms_sm->sm_shift); | 1487 1488 if (total > 0) 1489 fragmentation /= total; 1490 ASSERT3U(fragmentation, <=, 100); 1491 return (fragmentation); |
1204} 1205 | 1492} 1493 |
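A worked standalone sketch of the metric computed by metaslab_fragmentation() above: weight each histogram bucket's free space by its zfs_frag_table entry and divide by the total free space (bucket contents are hypothetical):

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* zfs_frag_table values for 4K, 128K and >=16M segments */
	uint64_t frag_4k = 95, frag_128k = 50, frag_16m = 0;
	/* hypothetical free space held in each bucket */
	uint64_t space_4k = 1ULL << 30;		/* 1G of 4K segments */
	uint64_t space_128k = 1ULL << 30;	/* 1G of 128K segments */
	uint64_t space_16m = 2ULL << 30;	/* 2G of 16M+ segments */

	uint64_t total = space_4k + space_128k + space_16m;
	uint64_t frag = (space_4k * frag_4k + space_128k * frag_128k +
	    space_16m * frag_16m) / total;

	/* (1*95 + 1*50 + 2*0) / 4 = 36% fragmented */
	printf("metaslab fragmentation = %llu%%\n",
	    (unsigned long long)frag);
	return (0);
}
```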
1494/* 1495 * Compute a weight -- a selection preference value -- for the given metaslab. 1496 * This is based on the amount of free space, the level of fragmentation, 1497 * the LBA range, and whether the metaslab is loaded. 1498 */ |
|
1206static uint64_t 1207metaslab_weight(metaslab_t *msp) 1208{ 1209 metaslab_group_t *mg = msp->ms_group; 1210 vdev_t *vd = mg->mg_vd; 1211 uint64_t weight, space; 1212 1213 ASSERT(MUTEX_HELD(&msp->ms_lock)); --- 7 unchanged lines hidden (view full) --- 1221 ASSERT0(vd->vdev_ms_shift); 1222 return (0); 1223 } 1224 1225 /* 1226 * The baseline weight is the metaslab's free space. 1227 */ 1228 space = msp->ms_size - space_map_allocated(msp->ms_sm); | 1499static uint64_t 1500metaslab_weight(metaslab_t *msp) 1501{ 1502 metaslab_group_t *mg = msp->ms_group; 1503 vdev_t *vd = mg->mg_vd; 1504 uint64_t weight, space; 1505 1506 ASSERT(MUTEX_HELD(&msp->ms_lock)); --- 7 unchanged lines hidden (view full) --- 1514 ASSERT0(vd->vdev_ms_shift); 1515 return (0); 1516 } 1517 1518 /* 1519 * The baseline weight is the metaslab's free space. 1520 */ 1521 space = msp->ms_size - space_map_allocated(msp->ms_sm); |
1522 1523 msp->ms_fragmentation = metaslab_fragmentation(msp); 1524 if (metaslab_fragmentation_factor_enabled && 1525 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1526 /* 1527 * Use the fragmentation information to inversely scale 1528 * down the baseline weight. We need to ensure that we 1529 * don't exclude this metaslab completely when it's 100% 1530 * fragmented. To avoid this we reduce the fragmented value 1531 * by 1. 1532 */ 1533 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1534 1535 /* 1536 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1537 * this metaslab again. The fragmentation metric may have 1538 * decreased the space to something smaller than 1539 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1540 * so that we can consume any remaining space. 1541 */ 1542 if (space > 0 && space < SPA_MINBLOCKSIZE) 1543 space = SPA_MINBLOCKSIZE; 1544 } |
|
1229 weight = space; 1230 1231 /* 1232 * Modern disks have uniform bit density and constant angular velocity. 1233 * Therefore, the outer recording zones are faster (higher bandwidth) 1234 * than the inner zones by the ratio of outer to inner track diameter, 1235 * which is typically around 2:1. We account for this by assigning 1236 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1237 * In effect, this means that we'll select the metaslab with the most 1238 * free bandwidth rather than simply the one with the most free space. 1239 */ | 1545 weight = space; 1546 1547 /* 1548 * Modern disks have uniform bit density and constant angular velocity. 1549 * Therefore, the outer recording zones are faster (higher bandwidth) 1550 * than the inner zones by the ratio of outer to inner track diameter, 1551 * which is typically around 2:1. We account for this by assigning 1552 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1553 * In effect, this means that we'll select the metaslab with the most 1554 * free bandwidth rather than simply the one with the most free space. 1555 */ |
1240 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1241 ASSERT(weight >= space && weight <= 2 * space); | 1556 if (metaslab_lba_weighting_enabled) { 1557 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1558 ASSERT(weight >= space && weight <= 2 * space); 1559 } |
1242 | 1560 |
1243 msp->ms_factor = metaslab_weight_factor(msp); 1244 if (metaslab_weight_factor_enable) 1245 weight += msp->ms_factor; 1246 1247 if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) { 1248 /* 1249 * If this metaslab is one we're actively using, adjust its 1250 * weight to make it preferable to any inactive metaslab so 1251 * we'll polish it off. 1252 */ | 1561 /* 1562 * If this metaslab is one we're actively using, adjust its 1563 * weight to make it preferable to any inactive metaslab so 1564 * we'll polish it off. If the fragmentation on this metaslab 1565 * has exceed our threshold, then don't mark it active. 1566 */ 1567 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1568 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { |
1253 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1254 } 1255 1256 return (weight); 1257} 1258 1259static int 1260metaslab_activate(metaslab_t *msp, uint64_t activation_weight) --- 68 unchanged lines hidden (view full) --- 1329 mutex_enter(&mg->mg_lock); 1330 /* 1331 * Load the next potential metaslabs 1332 */ 1333 msp = avl_first(t); 1334 while (msp != NULL) { 1335 metaslab_t *msp_next = AVL_NEXT(t, msp); 1336 | 1569 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1570 } 1571 1572 return (weight); 1573} 1574 1575static int 1576metaslab_activate(metaslab_t *msp, uint64_t activation_weight) --- 68 unchanged lines hidden (view full) --- 1645 mutex_enter(&mg->mg_lock); 1646 /* 1647 * Load the next potential metaslabs 1648 */ 1649 msp = avl_first(t); 1650 while (msp != NULL) { 1651 metaslab_t *msp_next = AVL_NEXT(t, msp); 1652 |
1337 /* If we have reached our preload limit then we're done */ 1338 if (++m > metaslab_preload_limit) 1339 break; | 1653 /* 1654 * We preload only the maximum number of metaslabs specified 1655 * by metaslab_preload_limit. If a metaslab is being forced 1656 * to condense then we preload it too. This will ensure 1657 * that force condensing happens in the next txg. 1658 */ 1659 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1660 msp = msp_next; 1661 continue; 1662 } |
1340 1341 /* 1342 * We must drop the metaslab group lock here to preserve 1343 * lock ordering with the ms_lock (when grabbing both 1344 * the mg_lock and the ms_lock, the ms_lock must be taken 1345 * first). As a result, it is possible that the ordering 1346 * of the metaslabs within the avl tree may change before 1347 * we reacquire the lock. The metaslab cannot be removed from --- 51 unchanged lines hidden (view full) --- 1399 dmu_object_info_t doi; 1400 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1401 1402 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1403 ASSERT(msp->ms_loaded); 1404 1405 /* 1406 * Use the ms_size_tree range tree, which is ordered by size, to | 1663 1664 /* 1665 * We must drop the metaslab group lock here to preserve 1666 * lock ordering with the ms_lock (when grabbing both 1667 * the mg_lock and the ms_lock, the ms_lock must be taken 1668 * first). As a result, it is possible that the ordering 1669 * of the metaslabs within the avl tree may change before 1670 * we reacquire the lock. The metaslab cannot be removed from --- 51 unchanged lines hidden (view full) --- 1722 dmu_object_info_t doi; 1723 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1724 1725 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1726 ASSERT(msp->ms_loaded); 1727 1728 /* 1729 * Use the ms_size_tree range tree, which is ordered by size, to |
1407 * obtain the largest segment in the free tree. If the tree is empty 1408 * then we should condense the map. | 1730 * obtain the largest segment in the free tree. We always condense 1731 * metaslabs that are empty and metaslabs for which a condense 1732 * request has been made. |
1409 */ 1410 rs = avl_last(&msp->ms_size_tree); | 1733 */ 1734 rs = avl_last(&msp->ms_size_tree); |
1411 if (rs == NULL) | 1735 if (rs == NULL || msp->ms_condense_wanted) |
1412 return (B_TRUE); 1413 1414 /* 1415 * Calculate the number of 64-bit entries this segment would 1416 * require when written to disk. If this single segment would be 1417 * larger on-disk than the entire current on-disk structure, then 1418 * clearly condensing will increase the on-disk structure size. 1419 */ --- 24 unchanged lines hidden (view full) --- 1444 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK]; 1445 range_tree_t *condense_tree; 1446 space_map_t *sm = msp->ms_sm; 1447 1448 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1449 ASSERT3U(spa_sync_pass(spa), ==, 1); 1450 ASSERT(msp->ms_loaded); 1451 | 1736 return (B_TRUE); 1737 1738 /* 1739 * Calculate the number of 64-bit entries this segment would 1740 * require when written to disk. If this single segment would be 1741 * larger on-disk than the entire current on-disk structure, then 1742 * clearly condensing will increase the on-disk structure size. 1743 */ --- 24 unchanged lines hidden (view full) --- 1768 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK]; 1769 range_tree_t *condense_tree; 1770 space_map_t *sm = msp->ms_sm; 1771 1772 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1773 ASSERT3U(spa_sync_pass(spa), ==, 1); 1774 ASSERT(msp->ms_loaded); 1775 |
1776 |
|
1452 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " | 1777 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " |
1453 "smp size %llu, segments %lu", txg, msp->ms_id, msp, 1454 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root)); | 1778 "smp size %llu, segments %lu, forcing condense=%s", txg, 1779 msp->ms_id, msp, space_map_length(msp->ms_sm), 1780 avl_numnodes(&msp->ms_tree->rt_root), 1781 msp->ms_condense_wanted ? "TRUE" : "FALSE"); |
1455 | 1782 |
1783 msp->ms_condense_wanted = B_FALSE; 1784 |
|
1456 /* 1457 * Create a range tree that is 100% allocated. We remove segments 1458 * that have been freed in this txg, any deferred frees that exist, 1459 * and any allocation in the future. Removing segments should be 1460 * a relatively inexpensive operation since we expect these trees to 1461 * have a small number of nodes. 1462 */ 1463 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); --- 75 unchanged lines hidden (view full) --- 1539 ASSERT3P(alloctree, ==, NULL); 1540 return; 1541 } 1542 1543 ASSERT3P(alloctree, !=, NULL); 1544 ASSERT3P(*freetree, !=, NULL); 1545 ASSERT3P(*freed_tree, !=, NULL); 1546 | 1785 /* 1786 * Create a range tree that is 100% allocated. We remove segments 1787 * that have been freed in this txg, any deferred frees that exist, 1788 * and any allocation in the future. Removing segments should be 1789 * a relatively inexpensive operation since we expect these trees to 1790 * have a small number of nodes. 1791 */ 1792 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); --- 75 unchanged lines hidden (view full) --- 1868 ASSERT3P(alloctree, ==, NULL); 1869 return; 1870 } 1871 1872 ASSERT3P(alloctree, !=, NULL); 1873 ASSERT3P(*freetree, !=, NULL); 1874 ASSERT3P(*freed_tree, !=, NULL); 1875
1876 /* 1877 * Normally, we don't want to process a metaslab if there 1878 * are no allocations or frees to perform. However, if the metaslab 1879 * is being forced to condense we need to let it through. 1880 */ |
|
1547 if (range_tree_space(alloctree) == 0 && | 1881 if (range_tree_space(alloctree) == 0 && |
1548 range_tree_space(*freetree) == 0) | 1882 range_tree_space(*freetree) == 0 && 1883 !msp->ms_condense_wanted) |
1549 return; 1550 1551 /* 1552 * The only state that can actually be changing concurrently with 1553 * metaslab_sync() is the metaslab's ms_tree. No other thread can 1554 * be modifying this txg's alloctree, freetree, freed_tree, or 1555 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy 1556 * space_map ASSERTs. We drop it whenever we call into the DMU, --- 20 unchanged lines hidden (view full) --- 1577 if (msp->ms_loaded && spa_sync_pass(spa) == 1 && 1578 metaslab_should_condense(msp)) { 1579 metaslab_condense(msp, txg, tx); 1580 } else { 1581 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 1582 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); 1583 } 1584 | 1884 return; 1885 1886 /* 1887 * The only state that can actually be changing concurrently with 1888 * metaslab_sync() is the metaslab's ms_tree. No other thread can 1889 * be modifying this txg's alloctree, freetree, freed_tree, or 1890 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy 1891 * space_map ASSERTs. We drop it whenever we call into the DMU, --- 20 unchanged lines hidden (view full) --- 1912 if (msp->ms_loaded && spa_sync_pass(spa) == 1 && 1913 metaslab_should_condense(msp)) { 1914 metaslab_condense(msp, txg, tx); 1915 } else { 1916 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 1917 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); 1918 } 1919
1585 range_tree_vacate(alloctree, NULL, NULL); 1586 | 1920 metaslab_group_histogram_verify(mg); 1921 metaslab_class_histogram_verify(mg->mg_class); 1922 metaslab_group_histogram_remove(mg, msp); |
1587 if (msp->ms_loaded) { 1588 /* 1589 * When the space map is loaded, we have an accurate 1590 * histogram in the range tree. This gives us an opportunity 1591 * to bring the space map's histogram up-to-date so we clear 1592 * it first before updating it. 1593 */ 1594 space_map_histogram_clear(msp->ms_sm); 1595 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); 1596 } else { 1597 /* 1598 * Since the space map is not loaded we simply update the 1599 * existing histogram with what was freed in this txg. This 1600 * means that the on-disk histogram may not have an accurate 1601 * view of the free space but it's close enough to allow 1602 * us to make allocation decisions. 1603 */ 1604 space_map_histogram_add(msp->ms_sm, *freetree, tx); 1605 } | 1923 if (msp->ms_loaded) { 1924 /* 1925 * When the space map is loaded, we have an accurate 1926 * histogram in the range tree. This gives us an opportunity 1927 * to bring the space map's histogram up-to-date so we clear 1928 * it first before updating it. 1929 */ 1930 space_map_histogram_clear(msp->ms_sm); 1931 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); 1932 } else { 1933 /* 1934 * Since the space map is not loaded we simply update the 1935 * existing histogram with what was freed in this txg. This 1936 * means that the on-disk histogram may not have an accurate 1937 * view of the free space but it's close enough to allow 1938 * us to make allocation decisions. 1939 */ 1940 space_map_histogram_add(msp->ms_sm, *freetree, tx); 1941 }
1942 metaslab_group_histogram_add(mg, msp); 1943 metaslab_group_histogram_verify(mg); 1944 metaslab_class_histogram_verify(mg->mg_class); |
|
1606 1607 /* 1608 * For sync pass 1, we avoid traversing this txg's free range tree 1609 * and instead will just swap the pointers for freetree and 1610 * freed_tree. We can safely do this since the freed_tree is 1611 * guaranteed to be empty on the initial pass. 1612 */ 1613 if (spa_sync_pass(spa) == 1) { 1614 range_tree_swap(freetree, freed_tree); 1615 } else { 1616 range_tree_vacate(*freetree, range_tree_add, *freed_tree); 1617 } | 1945 1946 /* 1947 * For sync pass 1, we avoid traversing this txg's free range tree 1948 * and instead will just swap the pointers for freetree and 1949 * freed_tree. We can safely do this since the freed_tree is 1950 * guaranteed to be empty on the initial pass. 1951 */ 1952 if (spa_sync_pass(spa) == 1) { 1953 range_tree_swap(freetree, freed_tree); 1954 } else { 1955 range_tree_vacate(*freetree, range_tree_add, *freed_tree); 1956 } |
1957 range_tree_vacate(alloctree, NULL, NULL); |
|
1618 1619 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1620 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1621 1622 mutex_exit(&msp->ms_lock); 1623 1624 if (object != space_map_object(msp->ms_sm)) { 1625 object = space_map_object(msp->ms_sm); --- 94 unchanged lines hidden (view full) --- 1720 } 1721 1722 if (!metaslab_debug_unload) 1723 metaslab_unload(msp); 1724 } 1725 1726 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1727 mutex_exit(&msp->ms_lock); | 1958 1959 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1960 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1961 1962 mutex_exit(&msp->ms_lock); 1963 1964 if (object != space_map_object(msp->ms_sm)) { 1965 object = space_map_object(msp->ms_sm); --- 94 unchanged lines hidden (view full) --- 2060 } 2061 2062 if (!metaslab_debug_unload) 2063 metaslab_unload(msp); 2064 } 2065 2066 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2067 mutex_exit(&msp->ms_lock); |
1728 | |
1729} 1730 1731void 1732metaslab_sync_reassess(metaslab_group_t *mg) 1733{ 1734 metaslab_group_alloc_update(mg); | 2068} 2069 2070void 2071metaslab_sync_reassess(metaslab_group_t *mg) 2072{ 2073 metaslab_group_alloc_update(mg); |
2074 mg->mg_fragmentation = metaslab_group_fragmentation(mg); |
|
1735 1736 /* 1737 * Preload the next potential metaslabs 1738 */ 1739 metaslab_group_preload(mg); 1740} 1741 1742static uint64_t --- 245 unchanged lines hidden (view full) --- 1988 goto next; 1989 1990 /* 1991 * Avoid writing single-copy data to a failing vdev 1992 * unless the user instructs us that it is okay. 1993 */ 1994 if ((vd->vdev_stat.vs_write_errors > 0 || 1995 vd->vdev_state < VDEV_STATE_HEALTHY) && | 2075 2076 /* 2077 * Preload the next potential metaslabs 2078 */ 2079 metaslab_group_preload(mg); 2080} 2081 2082static uint64_t --- 245 unchanged lines hidden (view full) --- 2328 goto next; 2329 2330 /* 2331 * Avoid writing single-copy data to a failing vdev 2332 * unless the user instructs us that it is okay. 2333 */ 2334 if ((vd->vdev_stat.vs_write_errors > 0 || 2335 vd->vdev_state < VDEV_STATE_HEALTHY) && |
1996 d == 0 && dshift == 3 && 1997 !(zfs_write_to_degraded && vd->vdev_state == 1998 VDEV_STATE_DEGRADED)) { | 2336 d == 0 && dshift == 3 && vd->vdev_children == 0) { |
1999 all_zero = B_FALSE; 2000 goto next; 2001 } 2002 2003 ASSERT(mg->mg_class == mc); 2004 2005 distance = vd->vdev_asize >> dshift; 2006 if (distance <= (1ULL << vd->vdev_ms_shift)) --- 8 unchanged lines hidden (view full) --- 2015 dva, d); 2016 if (offset != -1ULL) { 2017 /* 2018 * If we've just selected this metaslab group, 2019 * figure out whether the corresponding vdev is 2020 * over- or under-used relative to the pool, 2021 * and set an allocation bias to even it out. 2022 */ | 2337 all_zero = B_FALSE; 2338 goto next; 2339 } 2340 2341 ASSERT(mg->mg_class == mc); 2342 2343 distance = vd->vdev_asize >> dshift; 2344 if (distance <= (1ULL << vd->vdev_ms_shift)) --- 8 unchanged lines hidden (view full) --- 2353 dva, d); 2354 if (offset != -1ULL) { 2355 /* 2356 * If we've just selected this metaslab group, 2357 * figure out whether the corresponding vdev is 2358 * over- or under-used relative to the pool, 2359 * and set an allocation bias to even it out. 2360 */ |
2023 if (mc->mc_aliquot == 0) { | 2361 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { |
2024 vdev_stat_t *vs = &vd->vdev_stat; 2025 int64_t vu, cu; 2026 2027 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2028 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2029 2030 /* 2031 * Calculate how much more or less we should --- 5 unchanged lines hidden (view full) --- 2037 * 2038 * mg_bias = (20 - 80) * 512K / 100 = -307K 2039 * 2040 * This reduces allocations by 307K for this 2041 * iteration. 2042 */ 2043 mg->mg_bias = ((cu - vu) * 2044 (int64_t)mg->mg_aliquot) / 100; | 2362 vdev_stat_t *vs = &vd->vdev_stat; 2363 int64_t vu, cu; 2364 2365 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2366 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2367 2368 /* 2369 * Calculate how much more or less we should --- 5 unchanged lines hidden (view full) --- 2375 * 2376 * mg_bias = (20 - 80) * 512K / 100 = -307K 2377 * 2378 * This reduces allocations by 307K for this 2379 * iteration. 2380 */ 2381 mg->mg_bias = ((cu - vu) * 2382 (int64_t)mg->mg_aliquot) / 100; |
2383 } else if (!metaslab_bias_enabled) { 2384 mg->mg_bias = 0; |
|
2045 } 2046 2047 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2048 mg->mg_aliquot + mg->mg_bias) { 2049 mc->mc_rotor = mg->mg_next; 2050 mc->mc_aliquot = 0; 2051 } 2052 --- 257 unchanged lines hidden --- | 2385 } 2386 2387 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2388 mg->mg_aliquot + mg->mg_bias) { 2389 mc->mc_rotor = mg->mg_next; 2390 mc->mc_aliquot = 0; 2391 } 2392 --- 257 unchanged lines hidden --- |
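The mg_bias comment in the rotor code above works its example with vu = 80, cu = 20 and a 512K aliquot; a standalone check of that arithmetic:

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t vu = 80;		/* vdev percent utilized */
	int64_t cu = 20;		/* class (pool) percent utilized */
	int64_t aliquot = 512 * 1024;
	int64_t bias = ((cu - vu) * aliquot) / 100;

	/* -314572 bytes, i.e. the comment's "-307K" (KiB) */
	printf("mg_bias = %lld bytes\n", (long long)bias);
	return (0);
}
```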