/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");

#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
    &metaslab_gang_bang, 0,
    "Force gang block allocation for blocks larger than or equal to this value");

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space_map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;
TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct);
SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
    &zfs_condense_pct, 0,
    "Condense on-disk spacemap when it is more than this many percents"
    " of in-memory counterpart");

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;
TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
    &zfs_mg_noalloc_threshold, 0,
    "Percentage of metaslab group size that should be free"
    " to make it eligible for allocation");

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;
TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_mg_fragmentation_threshold, 0,
    "Percentage of metaslab group size that should be considered "
    "eligible for allocations unless all metaslab groups within the metaslab class "
    "have also crossed this threshold");

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status, allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;
TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold",
    &zfs_metaslab_fragmentation_threshold);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_metaslab_fragmentation_threshold, 0,
    "Maximum percentage of metaslab fragmentation level to keep their active state");

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;
TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
    &metaslab_debug_load, 0,
    "Load all metaslabs when pool is first opened");

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;
TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
    &metaslab_debug_unload, 0,
    "Prevent metaslabs from being unloaded");

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold",
    &metaslab_df_alloc_threshold);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
    &metaslab_df_alloc_threshold, 0,
    "Minimum size which forces the dynamic allocator to change its allocation strategy");

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;
TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
    &metaslab_df_free_pct, 0,
    "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion");

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size",
    &metaslab_min_alloc_size);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
    &metaslab_min_alloc_size, 0,
    "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size");

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;
TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
    &metaslab_load_pct, 0,
    "Percentage of cpus that can be used by the metaslab taskq");

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;
TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
    &metaslab_unload_delay, 0,
    "Number of TXGs that an unused metaslab can be kept in memory");

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;
TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
    &metaslab_preload_limit, 0,
    "Max number of metaslabs per group to preload");

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
    &metaslab_preload_enabled, 0,
    "Enable/disable preloading of metaslabs");

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled",
    &metaslab_fragmentation_factor_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
    &metaslab_fragmentation_factor_enabled, 0,
    "Enable fragmentation weighting on metaslabs");

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled",
    &metaslab_lba_weighting_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
    &metaslab_lba_weighting_enabled, 0,
    "Enable LBA weighting (i.e. outer tracks are given preference)");

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.bias_enabled",
    &metaslab_bias_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
    &metaslab_bias_enabled, 0,
    "Enable metaslab group biasing");

static uint64_t metaslab_fragmentation(metaslab_t *);

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	refcount_create_tracked(&mc->mc_alloc_slots);

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	refcount_destroy(&mc->mc_alloc_slots);
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

void
metaslab_class_minblocksize_update(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;
	uint64_t minashift = UINT64_MAX;

	if ((mg = mc->mc_rotor) == NULL) {
		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
		return;
	}

	do {
		vd = mg->mg_vd;
		if (vd->vdev_ashift < minashift)
			minashift = vd->vdev_ashift;
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	mc->mc_minblocksize = 1ULL << minashift;
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

uint64_t
metaslab_class_get_minblocksize(metaslab_class_t *mc)
{
	return (mc->mc_minblocksize);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}

/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 */
		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
		    1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_start < m2->ms_start)
		return (-1);
	if (m1->ms_start > m2->ms_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the group's free capacity
 * is greater than the zfs_mg_noalloc_threshold and its fragmentation
 * metric (when valid) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group transitions
 * from allocatable to non-allocatable or vice versa then the
 * metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	refcount_create_tracked(&mg->mg_alloc_queue_depth);

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	refcount_destroy(&mg->mg_alloc_queue_depth);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	metaslab_class_minblocksize_update(mc);
}

void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	taskq_wait(mg->mg_taskq);
	metaslab_group_alloc_update(mg);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	metaslab_class_minblocksize_update(mc);
}

boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_sm == NULL)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
 * group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}

/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_max_alloc_queue_depth;

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		qdepth = refcount_count(&mg->mg_alloc_queue_depth);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_max_alloc_queue_depth;

			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;
	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	if (rs_size1 < rs_size2)
		return (-1);
	if (rs_size1 > rs_size2)
		return (1);

	if (r1->rs_start < r2->rs_start)
		return (-1);

	if (r1->rs_start > r2->rs_start)
		return (1);

	return (0);
}

/*
 * Create any block allocator specific components. The current allocators
 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 */
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT(msp->ms_tree == NULL);

	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

/*
 * Destroy the block allocator specific components.
 */
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	ASSERT0(avl_numnodes(&msp->ms_size_tree));

	avl_destroy(&msp->ms_size_tree);
}

static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_add(&msp->ms_size_tree, rs);
}

static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_remove(&msp->ms_size_tree, rs);
}

static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);

	/*
	 * Normally one would walk the tree freeing nodes along the way.
	 * Since the nodes are shared with the range trees we can avoid
	 * walking all nodes and just reinitialize the avl tree. The nodes
	 * will be freed by the range tree, so we don't want to free them here.
	 */
	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

static range_tree_ops_t metaslab_rt_ops = {
	metaslab_rt_create,
	metaslab_rt_destroy,
	metaslab_rt_add,
	metaslab_rt_remove,
	metaslab_rt_vacate
};

/*
 * ==========================================================================
 * Metaslab block operations
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
	avl_tree_t *t = &msp->ms_size_tree;
	range_seg_t *rs;

	if (t == NULL || (rs = avl_last(t)) == NULL)
		return (0ULL);

	return (rs->rs_end - rs->rs_start);
}

uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size)
{
	uint64_t start;
	range_tree_t *rt = msp->ms_tree;

	VERIFY(!msp->ms_condensing);

	start = msp->ms_ops->msop_alloc(msp, size);
	if (start != -1ULL) {
		vdev_t *vd = msp->ms_group->mg_vd;

		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
		range_tree_remove(rt, start, size);
	}
	return (start);
}

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	range_seg_t *rs, rsearch;
	avl_index_t where;

	rsearch.rs_start = *cursor;
	rsearch.rs_end = *cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL)
		rs = avl_nearest(t, where, AVL_AFTER);

	while (rs != NULL) {
		uint64_t offset = P2ROUNDUP(rs->rs_start, align);

		if (offset + size <= rs->rs_end) {
			*cursor = offset + size;
			return (offset);
		}
		rs = AVL_NEXT(t, rs);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocation sizes
	 * may exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	avl_tree_t *t = &msp->ms_tree->rt_root;

	return (metaslab_block_picker(t, cursor, size, align));
}

static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocation sizes
	 * may exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_tree;
	avl_tree_t *t = &rt->rt_root;
	uint64_t max_size = metaslab_block_maxsize(msp);
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = &msp->ms_size_tree;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static metaslab_ops_t metaslab_df_ops = {
	metaslab_df_alloc
};

/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 * ==========================================================================
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
	range_tree_t *rt = msp->ms_tree;
	avl_tree_t *t = &msp->ms_size_tree;
	uint64_t *cursor = &msp->ms_lbas[0];
	uint64_t *cursor_end = &msp->ms_lbas[1];
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));

	ASSERT3U(*cursor_end, >=, *cursor);

	if ((*cursor + size) > *cursor_end) {
		range_seg_t *rs;

		rs = avl_last(&msp->ms_size_tree);
		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
			return (-1ULL);

		*cursor = rs->rs_start;
		*cursor_end = rs->rs_end;
	}

	offset = *cursor;
	*cursor += size;

	return (offset);
}

static metaslab_ops_t metaslab_cf_ops = {
	metaslab_cf_alloc
};

/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
1317262093Savg */ 1318211931Smmuint64_t metaslab_ndf_clump_shift = 4; 1319211931Smm 1320211931Smmstatic uint64_t 1321262093Savgmetaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1322211931Smm{ 1323262093Savg avl_tree_t *t = &msp->ms_tree->rt_root; 1324211931Smm avl_index_t where; 1325262093Savg range_seg_t *rs, rsearch; 1326265740Sdelphij uint64_t hbit = highbit64(size); 1327262093Savg uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1328262093Savg uint64_t max_size = metaslab_block_maxsize(msp); 1329211931Smm 1330262093Savg ASSERT(MUTEX_HELD(&msp->ms_lock)); 1331262093Savg ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1332211931Smm 1333211931Smm if (max_size < size) 1334211931Smm return (-1ULL); 1335211931Smm 1336262093Savg rsearch.rs_start = *cursor; 1337262093Savg rsearch.rs_end = *cursor + size; 1338211931Smm 1339262093Savg rs = avl_find(t, &rsearch, &where); 1340262093Savg if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1341262093Savg t = &msp->ms_size_tree; 1342211931Smm 1343262093Savg rsearch.rs_start = 0; 1344262093Savg rsearch.rs_end = MIN(max_size, 1345211931Smm 1ULL << (hbit + metaslab_ndf_clump_shift)); 1346262093Savg rs = avl_find(t, &rsearch, &where); 1347262093Savg if (rs == NULL) 1348262093Savg rs = avl_nearest(t, where, AVL_AFTER); 1349262093Savg ASSERT(rs != NULL); 1350211931Smm } 1351211931Smm 1352262093Savg if ((rs->rs_end - rs->rs_start) >= size) { 1353262093Savg *cursor = rs->rs_start + size; 1354262093Savg return (rs->rs_start); 1355211931Smm } 1356211931Smm return (-1ULL); 1357211931Smm} 1358211931Smm 1359262093Savgstatic metaslab_ops_t metaslab_ndf_ops = { 1360269773Sdelphij metaslab_ndf_alloc 1361211931Smm}; 1362211931Smm 1363262093Savgmetaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1364211931Smm 1365209962Smm/* 1366168404Spjd * ========================================================================== 1367168404Spjd * Metaslabs 1368168404Spjd * ========================================================================== 1369168404Spjd */ 1370262093Savg 1371262093Savg/* 1372262093Savg * Wait for any in-progress metaslab loads to complete. 1373262093Savg */ 1374262093Savgvoid 1375262093Savgmetaslab_load_wait(metaslab_t *msp) 1376262093Savg{ 1377262093Savg ASSERT(MUTEX_HELD(&msp->ms_lock)); 1378262093Savg 1379262093Savg while (msp->ms_loading) { 1380262093Savg ASSERT(!msp->ms_loaded); 1381262093Savg cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1382262093Savg } 1383262093Savg} 1384262093Savg 1385262093Savgint 1386262093Savgmetaslab_load(metaslab_t *msp) 1387262093Savg{ 1388262093Savg int error = 0; 1389262093Savg 1390262093Savg ASSERT(MUTEX_HELD(&msp->ms_lock)); 1391262093Savg ASSERT(!msp->ms_loaded); 1392262093Savg ASSERT(!msp->ms_loading); 1393262093Savg 1394262093Savg msp->ms_loading = B_TRUE; 1395262093Savg 1396262093Savg /* 1397262093Savg * If the space map has not been allocated yet, then treat 1398262093Savg * all the space in the metaslab as free and add it to the 1399262093Savg * ms_tree. 
1400262093Savg */ 1401262093Savg if (msp->ms_sm != NULL) 1402262093Savg error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); 1403262093Savg else 1404262093Savg range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); 1405262093Savg 1406262093Savg msp->ms_loaded = (error == 0); 1407262093Savg msp->ms_loading = B_FALSE; 1408262093Savg 1409262093Savg if (msp->ms_loaded) { 1410262093Savg for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1411262093Savg range_tree_walk(msp->ms_defertree[t], 1412262093Savg range_tree_remove, msp->ms_tree); 1413262093Savg } 1414262093Savg } 1415262093Savg cv_broadcast(&msp->ms_load_cv); 1416262093Savg return (error); 1417262093Savg} 1418262093Savg 1419262093Savgvoid 1420262093Savgmetaslab_unload(metaslab_t *msp) 1421262093Savg{ 1422262093Savg ASSERT(MUTEX_HELD(&msp->ms_lock)); 1423262093Savg range_tree_vacate(msp->ms_tree, NULL, NULL); 1424262093Savg msp->ms_loaded = B_FALSE; 1425262093Savg msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1426262093Savg} 1427262093Savg 1428277553Sdelphijint 1429277553Sdelphijmetaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1430277553Sdelphij metaslab_t **msp) 1431168404Spjd{ 1432168404Spjd vdev_t *vd = mg->mg_vd; 1433262093Savg objset_t *mos = vd->vdev_spa->spa_meta_objset; 1434277553Sdelphij metaslab_t *ms; 1435277553Sdelphij int error; 1436168404Spjd 1437277553Sdelphij ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1438277553Sdelphij mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1439277553Sdelphij cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1440277553Sdelphij ms->ms_id = id; 1441277553Sdelphij ms->ms_start = id << vd->vdev_ms_shift; 1442277553Sdelphij ms->ms_size = 1ULL << vd->vdev_ms_shift; 1443168404Spjd 1444262093Savg /* 1445262093Savg * We only open space map objects that already exist. All others 1446262093Savg * will be opened when we finally allocate an object for it. 1447262093Savg */ 1448262093Savg if (object != 0) { 1449277553Sdelphij error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1450277553Sdelphij ms->ms_size, vd->vdev_ashift, &ms->ms_lock); 1451277553Sdelphij 1452277553Sdelphij if (error != 0) { 1453277553Sdelphij kmem_free(ms, sizeof (metaslab_t)); 1454277553Sdelphij return (error); 1455277553Sdelphij } 1456277553Sdelphij 1457277553Sdelphij ASSERT(ms->ms_sm != NULL); 1458262093Savg } 1459168404Spjd 1460168404Spjd /* 1461262093Savg * We create the main range tree here, but we don't create the 1462262093Savg * alloctree and freetree until metaslab_sync_done(). This serves 1463168404Spjd * two purposes: it allows metaslab_sync_done() to detect the 1464168404Spjd * addition of new space; and for debugging, it ensures that we'd 1465168404Spjd * data fault on any attempt to use this metaslab before it's ready. 1466168404Spjd */ 1467277553Sdelphij ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); 1468277553Sdelphij metaslab_group_add(mg, ms); 1469168404Spjd 1470277553Sdelphij ms->ms_fragmentation = metaslab_fragmentation(ms); 1471277553Sdelphij ms->ms_ops = mg->mg_class->mc_ops; 1472219089Spjd 1473168404Spjd /* 1474168404Spjd * If we're opening an existing pool (txg == 0) or creating 1475168404Spjd * a new one (txg == TXG_INITIAL), all space is available now. 1476168404Spjd * If we're adding space to an existing pool, the new space 1477168404Spjd * does not become available until after this txg has synced. 
1478168404Spjd */
1479168404Spjd if (txg <= TXG_INITIAL)
1480277553Sdelphij metaslab_sync_done(ms, 0);
1481168404Spjd 
1482262093Savg /*
1483262093Savg * If metaslab_debug_load is set and we're initializing a metaslab
1484262093Savg * that has an allocated space_map object then load its space
1485262093Savg * map so that we can verify frees.
1486262093Savg */
1487277553Sdelphij if (metaslab_debug_load && ms->ms_sm != NULL) {
1488277553Sdelphij mutex_enter(&ms->ms_lock);
1489277553Sdelphij VERIFY0(metaslab_load(ms));
1490277553Sdelphij mutex_exit(&ms->ms_lock);
1491262093Savg }
1492262093Savg 
1493168404Spjd if (txg != 0) {
1494168404Spjd vdev_dirty(vd, 0, NULL, txg);
1495277553Sdelphij vdev_dirty(vd, VDD_METASLAB, ms, txg);
1496168404Spjd }
1497168404Spjd 
1498277553Sdelphij *msp = ms;
1499277553Sdelphij 
1500277553Sdelphij return (0);
1501168404Spjd}
1502168404Spjd 
1503168404Spjdvoid
1504168404Spjdmetaslab_fini(metaslab_t *msp)
1505168404Spjd{
1506168404Spjd metaslab_group_t *mg = msp->ms_group;
1507168404Spjd 
1508168404Spjd metaslab_group_remove(mg, msp);
1509168404Spjd 
1510168404Spjd mutex_enter(&msp->ms_lock);
1511168404Spjd 
1512262093Savg VERIFY(msp->ms_group == NULL);
1513262093Savg vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1514262093Savg 0, -msp->ms_size);
1515262093Savg space_map_close(msp->ms_sm);
1516168404Spjd 
1517262093Savg metaslab_unload(msp);
1518262093Savg range_tree_destroy(msp->ms_tree);
1519262093Savg 
1520219089Spjd for (int t = 0; t < TXG_SIZE; t++) {
1521262093Savg range_tree_destroy(msp->ms_alloctree[t]);
1522262093Savg range_tree_destroy(msp->ms_freetree[t]);
1523168404Spjd }
1524168404Spjd 
1525247398Smm for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1526262093Savg range_tree_destroy(msp->ms_defertree[t]);
1527247398Smm }
1528219089Spjd 
1529240415Smm ASSERT0(msp->ms_deferspace);
1530219089Spjd 
1531168404Spjd mutex_exit(&msp->ms_lock);
1532262093Savg cv_destroy(&msp->ms_load_cv);
1533168404Spjd mutex_destroy(&msp->ms_lock);
1534168404Spjd 
1535168404Spjd kmem_free(msp, sizeof (metaslab_t));
1536168404Spjd}
1537168404Spjd 
1538269773Sdelphij#define FRAGMENTATION_TABLE_SIZE 17
1539269773Sdelphij 
1540262093Savg/*
1541269773Sdelphij * This table defines a segment size based fragmentation metric that will
1542269773Sdelphij * allow each metaslab to derive its own fragmentation value. This is done
1543269773Sdelphij * by calculating the space in each bucket of the spacemap histogram and
1544269773Sdelphij * multiplying that by the fragmentation metric in this table. Doing
1545269773Sdelphij * this for all buckets and dividing it by the total amount of free
1546269773Sdelphij * space in this metaslab (i.e. the total free space in all buckets) gives
1547269773Sdelphij * us the fragmentation metric. This means that a high fragmentation metric
1548269773Sdelphij * equates to most of the free space being comprised of small segments.
1549269773Sdelphij * Conversely, if the metric is low, then most of the free space is in
1550269773Sdelphij * large segments. A 10% change in fragmentation equates to approximately
1551269773Sdelphij * double the number of segments.
1552262093Savg *
1553269773Sdelphij * This table defines 0% fragmented space using 16MB segments. Testing has
1554269773Sdelphij * shown that segments that are greater than or equal to 16MB do not suffer
1555269773Sdelphij * from drastic performance problems. Using this value, we derive the rest
1556269773Sdelphij * of the table. 
Since the fragmentation value is never stored on disk, it
1557269773Sdelphij * is possible to change these calculations in the future.
1558262093Savg */
1559269773Sdelphijint zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1560269773Sdelphij 100, /* 512B */
1561269773Sdelphij 100, /* 1K */
1562269773Sdelphij 98, /* 2K */
1563269773Sdelphij 95, /* 4K */
1564269773Sdelphij 90, /* 8K */
1565269773Sdelphij 80, /* 16K */
1566269773Sdelphij 70, /* 32K */
1567269773Sdelphij 60, /* 64K */
1568269773Sdelphij 50, /* 128K */
1569269773Sdelphij 40, /* 256K */
1570269773Sdelphij 30, /* 512K */
1571269773Sdelphij 20, /* 1M */
1572269773Sdelphij 15, /* 2M */
1573269773Sdelphij 10, /* 4M */
1574269773Sdelphij 5, /* 8M */
1575269773Sdelphij 0 /* 16M */
1576269773Sdelphij};
1577269773Sdelphij 
1578269773Sdelphij/*
1579269773Sdelphij * Calculate the metaslab's fragmentation metric. A return value
1580269773Sdelphij * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1581269773Sdelphij * not support this metric. Otherwise, the return value should be in the
1582269773Sdelphij * range [0, 100].
1583269773Sdelphij */
1584262093Savgstatic uint64_t
1585269773Sdelphijmetaslab_fragmentation(metaslab_t *msp)
1586262093Savg{
1587269773Sdelphij spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1588269773Sdelphij uint64_t fragmentation = 0;
1589269773Sdelphij uint64_t total = 0;
1590269773Sdelphij boolean_t feature_enabled = spa_feature_is_enabled(spa,
1591269773Sdelphij SPA_FEATURE_SPACEMAP_HISTOGRAM);
1592168404Spjd 
1593269773Sdelphij if (!feature_enabled)
1594269773Sdelphij return (ZFS_FRAG_INVALID);
1595269773Sdelphij 
1596262093Savg /*
1597269773Sdelphij * A null space map means that the entire metaslab is free
1598269773Sdelphij * and thus is not fragmented.
1599262093Savg */
1600269773Sdelphij if (msp->ms_sm == NULL)
1601269773Sdelphij return (0);
1602269773Sdelphij 
1603269773Sdelphij /*
1604269773Sdelphij * If this metaslab's space_map has not been upgraded, flag it
1605269773Sdelphij * so that we upgrade next time we encounter it. 
1606269773Sdelphij */ 1607269773Sdelphij if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1608269773Sdelphij uint64_t txg = spa_syncing_txg(spa); 1609262093Savg vdev_t *vd = msp->ms_group->mg_vd; 1610262093Savg 1611273341Sdelphij if (spa_writeable(spa)) { 1612273341Sdelphij msp->ms_condense_wanted = B_TRUE; 1613273341Sdelphij vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1614273341Sdelphij spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1615273341Sdelphij "msp %p, vd %p", txg, msp, vd); 1616273341Sdelphij } 1617269773Sdelphij return (ZFS_FRAG_INVALID); 1618262093Savg } 1619262093Savg 1620269773Sdelphij for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1621269773Sdelphij uint64_t space = 0; 1622269773Sdelphij uint8_t shift = msp->ms_sm->sm_shift; 1623269773Sdelphij int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1624269773Sdelphij FRAGMENTATION_TABLE_SIZE - 1); 1625262093Savg 1626262093Savg if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1627262093Savg continue; 1628262093Savg 1629269773Sdelphij space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1630269773Sdelphij total += space; 1631269773Sdelphij 1632269773Sdelphij ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1633269773Sdelphij fragmentation += space * zfs_frag_table[idx]; 1634262093Savg } 1635269773Sdelphij 1636269773Sdelphij if (total > 0) 1637269773Sdelphij fragmentation /= total; 1638269773Sdelphij ASSERT3U(fragmentation, <=, 100); 1639269773Sdelphij return (fragmentation); 1640262093Savg} 1641262093Savg 1642269773Sdelphij/* 1643269773Sdelphij * Compute a weight -- a selection preference value -- for the given metaslab. 1644269773Sdelphij * This is based on the amount of free space, the level of fragmentation, 1645269773Sdelphij * the LBA range, and whether the metaslab is loaded. 1646269773Sdelphij */ 1647168404Spjdstatic uint64_t 1648168404Spjdmetaslab_weight(metaslab_t *msp) 1649168404Spjd{ 1650168404Spjd metaslab_group_t *mg = msp->ms_group; 1651168404Spjd vdev_t *vd = mg->mg_vd; 1652168404Spjd uint64_t weight, space; 1653168404Spjd 1654168404Spjd ASSERT(MUTEX_HELD(&msp->ms_lock)); 1655168404Spjd 1656168404Spjd /* 1657247398Smm * This vdev is in the process of being removed so there is nothing 1658247398Smm * for us to do here. 1659247398Smm */ 1660247398Smm if (vd->vdev_removing) { 1661262093Savg ASSERT0(space_map_allocated(msp->ms_sm)); 1662247398Smm ASSERT0(vd->vdev_ms_shift); 1663247398Smm return (0); 1664247398Smm } 1665247398Smm 1666247398Smm /* 1667168404Spjd * The baseline weight is the metaslab's free space. 1668168404Spjd */ 1669262093Savg space = msp->ms_size - space_map_allocated(msp->ms_sm); 1670269773Sdelphij 1671269773Sdelphij msp->ms_fragmentation = metaslab_fragmentation(msp); 1672269773Sdelphij if (metaslab_fragmentation_factor_enabled && 1673269773Sdelphij msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1674269773Sdelphij /* 1675269773Sdelphij * Use the fragmentation information to inversely scale 1676269773Sdelphij * down the baseline weight. We need to ensure that we 1677269773Sdelphij * don't exclude this metaslab completely when it's 100% 1678269773Sdelphij * fragmented. To avoid this we reduce the fragmented value 1679269773Sdelphij * by 1. 1680269773Sdelphij */ 1681269773Sdelphij space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1682269773Sdelphij 1683269773Sdelphij /* 1684269773Sdelphij * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1685269773Sdelphij * this metaslab again. 
The fragmentation metric may have
1686269773Sdelphij * decreased the space to something smaller than
1687269773Sdelphij * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1688269773Sdelphij * so that we can consume any remaining space.
1689269773Sdelphij */
1690269773Sdelphij if (space > 0 && space < SPA_MINBLOCKSIZE)
1691269773Sdelphij space = SPA_MINBLOCKSIZE;
1692269773Sdelphij }
1693168404Spjd weight = space;
1694168404Spjd 
1695168404Spjd /*
1696168404Spjd * Modern disks have uniform bit density and constant angular velocity.
1697168404Spjd * Therefore, the outer recording zones are faster (higher bandwidth)
1698168404Spjd * than the inner zones by the ratio of outer to inner track diameter,
1699168404Spjd * which is typically around 2:1. We account for this by assigning
1700168404Spjd * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1701168404Spjd * In effect, this means that we'll select the metaslab with the most
1702168404Spjd * free bandwidth rather than simply the one with the most free space.
1703168404Spjd */
1704269773Sdelphij if (metaslab_lba_weighting_enabled) {
1705269773Sdelphij weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1706269773Sdelphij ASSERT(weight >= space && weight <= 2 * space);
1707269773Sdelphij }
1708168404Spjd 
1709269773Sdelphij /*
1710269773Sdelphij * If this metaslab is one we're actively using, adjust its
1711269773Sdelphij * weight to make it preferable to any inactive metaslab so
1712269773Sdelphij * we'll polish it off. If the fragmentation on this metaslab
1713269773Sdelphij * has exceeded our threshold, then don't mark it active.
1714269773Sdelphij */
1715269773Sdelphij if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1716269773Sdelphij msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1717211931Smm weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1718211931Smm }
1719262093Savg 
1720211931Smm return (weight);
1721211931Smm}
1722211931Smm 
1723168404Spjdstatic int
1724224177Smmmetaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1725168404Spjd{
1726168404Spjd ASSERT(MUTEX_HELD(&msp->ms_lock));
1727168404Spjd 
1728168404Spjd if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1729262093Savg metaslab_load_wait(msp);
1730262093Savg if (!msp->ms_loaded) {
1731262093Savg int error = metaslab_load(msp);
1732262093Savg if (error) {
1733219089Spjd metaslab_group_sort(msp->ms_group, msp, 0);
1734219089Spjd return (error);
1735219089Spjd }
1736168404Spjd }
1737209962Smm 
1738168404Spjd metaslab_group_sort(msp->ms_group, msp,
1739168404Spjd msp->ms_weight | activation_weight);
1740168404Spjd }
1741262093Savg ASSERT(msp->ms_loaded);
1742168404Spjd ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1743168404Spjd 
1744168404Spjd return (0);
1745168404Spjd}
1746168404Spjd 
1747168404Spjdstatic void
1748168404Spjdmetaslab_passivate(metaslab_t *msp, uint64_t size)
1749168404Spjd{
1750168404Spjd /*
1751168404Spjd * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1752168404Spjd * this metaslab again. In that case, it had better be empty,
1753168404Spjd * or we would be leaving space on the table. 
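 *
 * Two representative callers, both in metaslab_group_alloc() below,
 * illustrate the intended use:
 *
 *	metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
 *	metaslab_passivate(msp, metaslab_block_maxsize(msp));
 *
 * The first merely drops the active bits from the weight; the second,
 * used after a failed block allocation, also lowers the weight to the
 * size of the largest remaining free segment.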
1754168404Spjd */ 1755262093Savg ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1756168404Spjd metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1757168404Spjd ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1758168404Spjd} 1759168404Spjd 1760262093Savgstatic void 1761262093Savgmetaslab_preload(void *arg) 1762262093Savg{ 1763262093Savg metaslab_t *msp = arg; 1764262093Savg spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1765262093Savg 1766268656Sdelphij ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 1767268656Sdelphij 1768262093Savg mutex_enter(&msp->ms_lock); 1769262093Savg metaslab_load_wait(msp); 1770262093Savg if (!msp->ms_loaded) 1771262093Savg (void) metaslab_load(msp); 1772262093Savg 1773262093Savg /* 1774262093Savg * Set the ms_access_txg value so that we don't unload it right away. 1775262093Savg */ 1776262093Savg msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; 1777262093Savg mutex_exit(&msp->ms_lock); 1778262093Savg} 1779262093Savg 1780262093Savgstatic void 1781262093Savgmetaslab_group_preload(metaslab_group_t *mg) 1782262093Savg{ 1783262093Savg spa_t *spa = mg->mg_vd->vdev_spa; 1784262093Savg metaslab_t *msp; 1785262093Savg avl_tree_t *t = &mg->mg_metaslab_tree; 1786262093Savg int m = 0; 1787262093Savg 1788262093Savg if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 1789262093Savg taskq_wait(mg->mg_taskq); 1790262093Savg return; 1791262093Savg } 1792268656Sdelphij 1793262093Savg mutex_enter(&mg->mg_lock); 1794262093Savg /* 1795268656Sdelphij * Load the next potential metaslabs 1796262093Savg */ 1797268656Sdelphij msp = avl_first(t); 1798268656Sdelphij while (msp != NULL) { 1799268656Sdelphij metaslab_t *msp_next = AVL_NEXT(t, msp); 1800262093Savg 1801269773Sdelphij /* 1802269773Sdelphij * We preload only the maximum number of metaslabs specified 1803269773Sdelphij * by metaslab_preload_limit. If a metaslab is being forced 1804269773Sdelphij * to condense then we preload it too. This will ensure 1805269773Sdelphij * that force condensing happens in the next txg. 1806269773Sdelphij */ 1807269773Sdelphij if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1808269773Sdelphij msp = msp_next; 1809269773Sdelphij continue; 1810269773Sdelphij } 1811262093Savg 1812268656Sdelphij /* 1813268656Sdelphij * We must drop the metaslab group lock here to preserve 1814268656Sdelphij * lock ordering with the ms_lock (when grabbing both 1815268656Sdelphij * the mg_lock and the ms_lock, the ms_lock must be taken 1816268656Sdelphij * first). As a result, it is possible that the ordering 1817268656Sdelphij * of the metaslabs within the avl tree may change before 1818268656Sdelphij * we reacquire the lock. The metaslab cannot be removed from 1819268656Sdelphij * the tree while we're in syncing context so it is safe to 1820268656Sdelphij * drop the mg_lock here. If the metaslabs are reordered 1821268656Sdelphij * nothing will break -- we just may end up loading a 1822268656Sdelphij * less than optimal one. 1823268656Sdelphij */ 1824268656Sdelphij mutex_exit(&mg->mg_lock); 1825262093Savg VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 1826262093Savg msp, TQ_SLEEP) != 0); 1827268656Sdelphij mutex_enter(&mg->mg_lock); 1828268656Sdelphij msp = msp_next; 1829262093Savg } 1830262093Savg mutex_exit(&mg->mg_lock); 1831262093Savg} 1832262093Savg 1833168404Spjd/* 1834262093Savg * Determine if the space map's on-disk footprint is past our tolerance 1835262093Savg * for inefficiency. 
We would like to use the following criteria to make
1836262093Savg * our decision:
1837247398Smm *
1838247398Smm * 1. The size of the space map object should not dramatically increase as a
1839262093Savg * result of writing out the free space range tree.
1840247398Smm *
1841247398Smm * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1842262093Savg * times the size of the free space range tree representation
1843262093Savg * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
1844247398Smm *
1845269416Sdelphij * 3. The on-disk size of the space map should actually decrease.
1846269416Sdelphij *
1847247398Smm * Checking the first condition is tricky since we don't want to walk
1848247398Smm * the entire AVL tree calculating the estimated on-disk size. Instead we
1849262093Savg * use the size-ordered range tree in the metaslab and calculate the
1850262093Savg * size required to write out the largest segment in our free tree. If the
1851247398Smm * size required to represent that segment on disk is larger than the space
1852247398Smm * map object then we avoid condensing this map.
1853247398Smm *
1854247398Smm * To determine the second criterion we use a best-case estimate and assume
1855247398Smm * each segment can be represented on-disk as a single 64-bit entry. We refer
1856247398Smm * to this best-case estimate as the space map's minimal form.
1857269416Sdelphij *
1858269416Sdelphij * Unfortunately, we cannot compute the on-disk size of the space map in this
1859269416Sdelphij * context because we cannot accurately compute the effects of compression, etc.
1860269416Sdelphij * Instead, we apply the heuristic described in the block comment for
1861269416Sdelphij * zfs_metaslab_condense_block_threshold - we only condense if the space used
1862269416Sdelphij * is greater than a threshold number of blocks.
1863247398Smm */
1864247398Smmstatic boolean_t
1865247398Smmmetaslab_should_condense(metaslab_t *msp)
1866247398Smm{
1867262093Savg space_map_t *sm = msp->ms_sm;
1868262093Savg range_seg_t *rs;
1869269416Sdelphij uint64_t size, entries, segsz, object_size, optimal_size, record_size;
1870269416Sdelphij dmu_object_info_t doi;
1871269416Sdelphij uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1872247398Smm 
1873247398Smm ASSERT(MUTEX_HELD(&msp->ms_lock));
1874262093Savg ASSERT(msp->ms_loaded);
1875247398Smm 
1876247398Smm /*
1877262093Savg * Use the ms_size_tree range tree, which is ordered by size, to
1878269773Sdelphij * obtain the largest segment in the free tree. We always condense
1879269773Sdelphij * metaslabs that are empty and metaslabs for which a condense
1880269773Sdelphij * request has been made.
1881247398Smm */
1882262093Savg rs = avl_last(&msp->ms_size_tree);
1883269773Sdelphij if (rs == NULL || msp->ms_condense_wanted)
1884247398Smm return (B_TRUE);
1885247398Smm 
1886247398Smm /*
1887247398Smm * Calculate the number of 64-bit entries this segment would
1888247398Smm * require when written to disk. If this single segment would be
1889247398Smm * larger on-disk than the entire current on-disk structure, then
1890247398Smm * clearly condensing will increase the on-disk structure size. 
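 *
 * Purely illustrative numbers (not measurements): if the largest free
 * segment would take 16 bytes to write out (segsz), the space map
 * object currently occupies 64K on disk (object_size), the minimal
 * form would take 24K (optimal_size) and the record size is 4K, then
 * with, say, zfs_condense_pct = 200 and a block threshold of 4 we
 * condense, since 16 <= 64K, 64K >= 2 * 24K, and 64K > 4 * 4K.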
1891247398Smm */
1892262093Savg size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1893247398Smm entries = size / (MIN(size, SM_RUN_MAX));
1894247398Smm segsz = entries * sizeof (uint64_t);
1895247398Smm 
1896269416Sdelphij optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
1897269416Sdelphij object_size = space_map_length(msp->ms_sm);
1898269416Sdelphij 
1899269416Sdelphij dmu_object_info_from_db(sm->sm_dbuf, &doi);
1900269416Sdelphij record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
1901269416Sdelphij 
1902269416Sdelphij return (segsz <= object_size &&
1903269416Sdelphij object_size >= (optimal_size * zfs_condense_pct / 100) &&
1904269416Sdelphij object_size > zfs_metaslab_condense_block_threshold * record_size);
1905247398Smm}
1906247398Smm 
1907247398Smm/*
1908247398Smm * Condense the on-disk space map representation to its minimized form.
1909247398Smm * The minimized form consists of a small number of allocations followed by
1910262093Savg * the entries of the free range tree.
1911247398Smm */
1912247398Smmstatic void
1913247398Smmmetaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1914247398Smm{
1915247398Smm spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1916262093Savg range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1917262093Savg range_tree_t *condense_tree;
1918262093Savg space_map_t *sm = msp->ms_sm;
1919247398Smm 
1920247398Smm ASSERT(MUTEX_HELD(&msp->ms_lock));
1921247398Smm ASSERT3U(spa_sync_pass(spa), ==, 1);
1922262093Savg ASSERT(msp->ms_loaded);
1923247398Smm 
1924269773Sdelphij 
1925290753Smav spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
1926290753Smav "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
1927290753Smav msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
1928290753Smav msp->ms_group->mg_vd->vdev_spa->spa_name,
1929290753Smav space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
1930269773Sdelphij msp->ms_condense_wanted ? "TRUE" : "FALSE");
1931247398Smm 
1932269773Sdelphij msp->ms_condense_wanted = B_FALSE;
1933269773Sdelphij 
1934247398Smm /*
1935262093Savg * Create a range tree that is 100% allocated. We remove segments
1936247398Smm * that have been freed in this txg, any deferred frees that exist,
1937247398Smm * and any allocation in the future. Removing segments should be
1938262093Savg * a relatively inexpensive operation since we expect these trees to
1939262093Savg * have a small number of nodes.
1940247398Smm */
1941262093Savg condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1942262093Savg range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1943247398Smm 
1944247398Smm /*
1945262093Savg * Remove what's been freed in this txg from the condense_tree.
1946247398Smm * Since we're in sync_pass 1, we know that all the frees from
1947262093Savg * this txg are in the freetree.
1948247398Smm */
1949262093Savg range_tree_walk(freetree, range_tree_remove, condense_tree);
1950247398Smm 
1951262093Savg for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1952262093Savg range_tree_walk(msp->ms_defertree[t],
1953262093Savg range_tree_remove, condense_tree);
1954262093Savg }
1955247398Smm 
1956262093Savg for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1957262093Savg range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1958262093Savg range_tree_remove, condense_tree);
1959262093Savg }
1960247398Smm 
1961247398Smm /*
1962247398Smm * We're about to drop the metaslab's lock, thus allowing
1963247398Smm * other consumers to change its content. 
Set the 1964262093Savg * metaslab's ms_condensing flag to ensure that 1965247398Smm * allocations on this metaslab do not occur while we're 1966247398Smm * in the middle of committing it to disk. This is only critical 1967262093Savg * for the ms_tree as all other range trees use per txg 1968247398Smm * views of their content. 1969247398Smm */ 1970262093Savg msp->ms_condensing = B_TRUE; 1971247398Smm 1972247398Smm mutex_exit(&msp->ms_lock); 1973262093Savg space_map_truncate(sm, tx); 1974247398Smm mutex_enter(&msp->ms_lock); 1975247398Smm 1976247398Smm /* 1977247398Smm * While we would ideally like to create a space_map representation 1978247398Smm * that consists only of allocation records, doing so can be 1979262093Savg * prohibitively expensive because the in-core free tree can be 1980247398Smm * large, and therefore computationally expensive to subtract 1981262093Savg * from the condense_tree. Instead we sync out two trees, a cheap 1982262093Savg * allocation only tree followed by the in-core free tree. While not 1983247398Smm * optimal, this is typically close to optimal, and much cheaper to 1984247398Smm * compute. 1985247398Smm */ 1986262093Savg space_map_write(sm, condense_tree, SM_ALLOC, tx); 1987262093Savg range_tree_vacate(condense_tree, NULL, NULL); 1988262093Savg range_tree_destroy(condense_tree); 1989247398Smm 1990262093Savg space_map_write(sm, msp->ms_tree, SM_FREE, tx); 1991262093Savg msp->ms_condensing = B_FALSE; 1992247398Smm} 1993247398Smm 1994247398Smm/* 1995168404Spjd * Write a metaslab to disk in the context of the specified transaction group. 1996168404Spjd */ 1997168404Spjdvoid 1998168404Spjdmetaslab_sync(metaslab_t *msp, uint64_t txg) 1999168404Spjd{ 2000262093Savg metaslab_group_t *mg = msp->ms_group; 2001262093Savg vdev_t *vd = mg->mg_vd; 2002168404Spjd spa_t *spa = vd->vdev_spa; 2003219089Spjd objset_t *mos = spa_meta_objset(spa); 2004262093Savg range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; 2005262093Savg range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK]; 2006262093Savg range_tree_t **freed_tree = 2007262093Savg &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 2008168404Spjd dmu_tx_t *tx; 2009262093Savg uint64_t object = space_map_object(msp->ms_sm); 2010168404Spjd 2011219089Spjd ASSERT(!vd->vdev_ishole); 2012168404Spjd 2013247398Smm /* 2014247398Smm * This metaslab has just been added so there's no work to do now. 2015247398Smm */ 2016262093Savg if (*freetree == NULL) { 2017262093Savg ASSERT3P(alloctree, ==, NULL); 2018219089Spjd return; 2019247398Smm } 2020219089Spjd 2021262093Savg ASSERT3P(alloctree, !=, NULL); 2022262093Savg ASSERT3P(*freetree, !=, NULL); 2023262093Savg ASSERT3P(*freed_tree, !=, NULL); 2024247398Smm 2025269773Sdelphij /* 2026269773Sdelphij * Normally, we don't want to process a metaslab if there 2027269773Sdelphij * are no allocations or frees to perform. However, if the metaslab 2028269773Sdelphij * is being forced to condense we need to let it through. 2029269773Sdelphij */ 2030262093Savg if (range_tree_space(alloctree) == 0 && 2031269773Sdelphij range_tree_space(*freetree) == 0 && 2032269773Sdelphij !msp->ms_condense_wanted) 2033247398Smm return; 2034247398Smm 2035168404Spjd /* 2036168404Spjd * The only state that can actually be changing concurrently with 2037262093Savg * metaslab_sync() is the metaslab's ms_tree. No other thread can 2038262093Savg * be modifying this txg's alloctree, freetree, freed_tree, or 2039262093Savg * space_map_phys_t. 
Therefore, we only hold ms_lock to satisfy
2040262093Savg * space_map ASSERTs. We drop it whenever we call into the DMU,
2041262093Savg * because the DMU can call down to us (e.g. via zio_free()) at
2042262093Savg * any time.
2043168404Spjd */
2044168404Spjd 
2045219089Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2046219089Spjd 
2047262093Savg if (msp->ms_sm == NULL) {
2048262093Savg uint64_t new_object;
2049262093Savg 
2050262093Savg new_object = space_map_alloc(mos, tx);
2051262093Savg VERIFY3U(new_object, !=, 0);
2052262093Savg 
2053262093Savg VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2054262093Savg msp->ms_start, msp->ms_size, vd->vdev_ashift,
2055262093Savg &msp->ms_lock));
2056262093Savg ASSERT(msp->ms_sm != NULL);
2057168404Spjd }
2058168404Spjd 
2059219089Spjd mutex_enter(&msp->ms_lock);
2060219089Spjd 
2061273341Sdelphij /*
2062273341Sdelphij * Note: metaslab_condense() clears the space_map's histogram.
2063273341Sdelphij * Therefore we must verify and remove this histogram before
2064273341Sdelphij * condensing.
2065273341Sdelphij */
2066273341Sdelphij metaslab_group_histogram_verify(mg);
2067273341Sdelphij metaslab_class_histogram_verify(mg->mg_class);
2068273341Sdelphij metaslab_group_histogram_remove(mg, msp);
2069273341Sdelphij 
2070262093Savg if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2071247398Smm metaslab_should_condense(msp)) {
2072247398Smm metaslab_condense(msp, txg, tx);
2073247398Smm } else {
2074262093Savg space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2075262093Savg space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
2076247398Smm }
2077168404Spjd 
2078262093Savg if (msp->ms_loaded) {
2079262093Savg /*
2080262093Savg * When the space map is loaded, we have an accurate
2081262093Savg * histogram in the range tree. This gives us an opportunity
2082262093Savg * to bring the space map's histogram up-to-date so we clear
2083262093Savg * it first before updating it.
2084262093Savg */
2085262093Savg space_map_histogram_clear(msp->ms_sm);
2086262093Savg space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2087262093Savg } else {
2088262093Savg /*
2089262093Savg * Since the space map is not loaded we simply update the
2090262093Savg * existing histogram with what was freed in this txg. This
2091262093Savg * means that the on-disk histogram may not have an accurate
2092262093Savg * view of the free space but it's close enough to allow
2093262093Savg * us to make allocation decisions.
2094262093Savg */
2095262093Savg space_map_histogram_add(msp->ms_sm, *freetree, tx);
2096262093Savg }
2097269773Sdelphij metaslab_group_histogram_add(mg, msp);
2098269773Sdelphij metaslab_group_histogram_verify(mg);
2099269773Sdelphij metaslab_class_histogram_verify(mg->mg_class);
2100262093Savg 
2101247398Smm /*
2102262093Savg * For sync pass 1, we avoid traversing this txg's free range tree
2103262093Savg * and instead will just swap the pointers for freetree and
2104262093Savg * freed_tree. We can safely do this since the freed_tree is
2105247398Smm * guaranteed to be empty on the initial pass. 
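 * On later sync passes the freed_tree may already hold frees from an
 * earlier pass of this txg, so the freetree is instead emptied into it
 * with range_tree_vacate().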
2106247398Smm */ 2107247398Smm if (spa_sync_pass(spa) == 1) { 2108262093Savg range_tree_swap(freetree, freed_tree); 2109247398Smm } else { 2110262093Savg range_tree_vacate(*freetree, range_tree_add, *freed_tree); 2111168404Spjd } 2112269773Sdelphij range_tree_vacate(alloctree, NULL, NULL); 2113168404Spjd 2114262093Savg ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2115262093Savg ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2116168404Spjd 2117168404Spjd mutex_exit(&msp->ms_lock); 2118168404Spjd 2119262093Savg if (object != space_map_object(msp->ms_sm)) { 2120262093Savg object = space_map_object(msp->ms_sm); 2121262093Savg dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2122262093Savg msp->ms_id, sizeof (uint64_t), &object, tx); 2123262093Savg } 2124168404Spjd dmu_tx_commit(tx); 2125168404Spjd} 2126168404Spjd 2127168404Spjd/* 2128168404Spjd * Called after a transaction group has completely synced to mark 2129168404Spjd * all of the metaslab's free space as usable. 2130168404Spjd */ 2131168404Spjdvoid 2132168404Spjdmetaslab_sync_done(metaslab_t *msp, uint64_t txg) 2133168404Spjd{ 2134168404Spjd metaslab_group_t *mg = msp->ms_group; 2135168404Spjd vdev_t *vd = mg->mg_vd; 2136262093Savg range_tree_t **freed_tree; 2137262093Savg range_tree_t **defer_tree; 2138219089Spjd int64_t alloc_delta, defer_delta; 2139168404Spjd 2140219089Spjd ASSERT(!vd->vdev_ishole); 2141219089Spjd 2142168404Spjd mutex_enter(&msp->ms_lock); 2143168404Spjd 2144168404Spjd /* 2145168404Spjd * If this metaslab is just becoming available, initialize its 2146262093Savg * alloctrees, freetrees, and defertree and add its capacity to 2147262093Savg * the vdev. 2148168404Spjd */ 2149262093Savg if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 2150219089Spjd for (int t = 0; t < TXG_SIZE; t++) { 2151262093Savg ASSERT(msp->ms_alloctree[t] == NULL); 2152262093Savg ASSERT(msp->ms_freetree[t] == NULL); 2153262093Savg 2154262093Savg msp->ms_alloctree[t] = range_tree_create(NULL, msp, 2155262093Savg &msp->ms_lock); 2156262093Savg msp->ms_freetree[t] = range_tree_create(NULL, msp, 2157262093Savg &msp->ms_lock); 2158168404Spjd } 2159219089Spjd 2160247398Smm for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2161262093Savg ASSERT(msp->ms_defertree[t] == NULL); 2162262093Savg 2163262093Savg msp->ms_defertree[t] = range_tree_create(NULL, msp, 2164262093Savg &msp->ms_lock); 2165247398Smm } 2166219089Spjd 2167262093Savg vdev_space_update(vd, 0, 0, msp->ms_size); 2168168404Spjd } 2169168404Spjd 2170262093Savg freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 2171262093Savg defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 2172168404Spjd 2173262093Savg alloc_delta = space_map_alloc_delta(msp->ms_sm); 2174262093Savg defer_delta = range_tree_space(*freed_tree) - 2175262093Savg range_tree_space(*defer_tree); 2176262093Savg 2177219089Spjd vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2178219089Spjd 2179262093Savg ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2180262093Savg ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2181168404Spjd 2182168404Spjd /* 2183262093Savg * If there's a metaslab_load() in progress, wait for it to complete 2184168404Spjd * so that we have a consistent view of the in-core space map. 2185168404Spjd */ 2186262093Savg metaslab_load_wait(msp); 2187168404Spjd 2188247398Smm /* 2189262093Savg * Move the frees from the defer_tree back to the free 2190262093Savg * range tree (if it's loaded). 
Swap the freed_tree and the 2191262093Savg * defer_tree -- this is safe to do because we've just emptied out 2192262093Savg * the defer_tree. 2193247398Smm */ 2194262093Savg range_tree_vacate(*defer_tree, 2195262093Savg msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 2196262093Savg range_tree_swap(freed_tree, defer_tree); 2197247398Smm 2198262093Savg space_map_update(msp->ms_sm); 2199168404Spjd 2200219089Spjd msp->ms_deferspace += defer_delta; 2201219089Spjd ASSERT3S(msp->ms_deferspace, >=, 0); 2202262093Savg ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2203219089Spjd if (msp->ms_deferspace != 0) { 2204219089Spjd /* 2205219089Spjd * Keep syncing this metaslab until all deferred frees 2206219089Spjd * are back in circulation. 2207219089Spjd */ 2208219089Spjd vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2209219089Spjd } 2210219089Spjd 2211262093Savg if (msp->ms_loaded && msp->ms_access_txg < txg) { 2212262093Savg for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2213262093Savg VERIFY0(range_tree_space( 2214262093Savg msp->ms_alloctree[(txg + t) & TXG_MASK])); 2215262093Savg } 2216168404Spjd 2217262093Savg if (!metaslab_debug_unload) 2218262093Savg metaslab_unload(msp); 2219168404Spjd } 2220168404Spjd 2221168404Spjd metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2222262093Savg mutex_exit(&msp->ms_lock); 2223168404Spjd} 2224168404Spjd 2225211931Smmvoid 2226211931Smmmetaslab_sync_reassess(metaslab_group_t *mg) 2227211931Smm{ 2228260768Savg metaslab_group_alloc_update(mg); 2229269773Sdelphij mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2230224177Smm 2231211931Smm /* 2232262093Savg * Preload the next potential metaslabs 2233211931Smm */ 2234262093Savg metaslab_group_preload(mg); 2235211931Smm} 2236211931Smm 2237168404Spjdstatic uint64_t 2238168404Spjdmetaslab_distance(metaslab_t *msp, dva_t *dva) 2239168404Spjd{ 2240168404Spjd uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2241168404Spjd uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2242262093Savg uint64_t start = msp->ms_id; 2243168404Spjd 2244168404Spjd if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2245168404Spjd return (1ULL << 63); 2246168404Spjd 2247168404Spjd if (offset < start) 2248168404Spjd return ((start - offset) << ms_shift); 2249168404Spjd if (offset > start) 2250168404Spjd return ((offset - start) << ms_shift); 2251168404Spjd return (0); 2252168404Spjd} 2253168404Spjd 2254307279Smav/* 2255307279Smav * ========================================================================== 2256307279Smav * Metaslab block operations 2257307279Smav * ========================================================================== 2258307279Smav */ 2259307279Smav 2260307279Smavstatic void 2261307279Smavmetaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) 2262307279Smav{ 2263307279Smav if (!(flags & METASLAB_ASYNC_ALLOC) || 2264307279Smav flags & METASLAB_DONT_THROTTLE) 2265307279Smav return; 2266307279Smav 2267307279Smav metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2268307279Smav if (!mg->mg_class->mc_alloc_throttle_enabled) 2269307279Smav return; 2270307279Smav 2271307279Smav (void) refcount_add(&mg->mg_alloc_queue_depth, tag); 2272307279Smav} 2273307279Smav 2274307279Smavvoid 2275307279Smavmetaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) 2276307279Smav{ 2277307279Smav if (!(flags & METASLAB_ASYNC_ALLOC) || 2278307279Smav flags & METASLAB_DONT_THROTTLE) 2279307279Smav return; 2280307279Smav 2281307279Smav metaslab_group_t *mg = 
vdev_lookup_top(spa, vdev)->vdev_mg; 2282307279Smav if (!mg->mg_class->mc_alloc_throttle_enabled) 2283307279Smav return; 2284307279Smav 2285307279Smav (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); 2286307279Smav} 2287307279Smav 2288307279Smavvoid 2289307279Smavmetaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) 2290307279Smav{ 2291307279Smav#ifdef ZFS_DEBUG 2292307279Smav const dva_t *dva = bp->blk_dva; 2293307279Smav int ndvas = BP_GET_NDVAS(bp); 2294307279Smav 2295307279Smav for (int d = 0; d < ndvas; d++) { 2296307279Smav uint64_t vdev = DVA_GET_VDEV(&dva[d]); 2297307279Smav metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2298307279Smav VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); 2299307279Smav } 2300307279Smav#endif 2301307279Smav} 2302307279Smav 2303168404Spjdstatic uint64_t 2304307279Smavmetaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, 2305265741Sdelphij uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2306168404Spjd{ 2307224177Smm spa_t *spa = mg->mg_vd->vdev_spa; 2308168404Spjd metaslab_t *msp = NULL; 2309168404Spjd uint64_t offset = -1ULL; 2310168404Spjd avl_tree_t *t = &mg->mg_metaslab_tree; 2311168404Spjd uint64_t activation_weight; 2312168404Spjd uint64_t target_distance; 2313168404Spjd int i; 2314168404Spjd 2315168404Spjd activation_weight = METASLAB_WEIGHT_PRIMARY; 2316209962Smm for (i = 0; i < d; i++) { 2317209962Smm if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2318168404Spjd activation_weight = METASLAB_WEIGHT_SECONDARY; 2319209962Smm break; 2320209962Smm } 2321209962Smm } 2322168404Spjd 2323168404Spjd for (;;) { 2324209962Smm boolean_t was_active; 2325209962Smm 2326168404Spjd mutex_enter(&mg->mg_lock); 2327168404Spjd for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2328224177Smm if (msp->ms_weight < asize) { 2329224177Smm spa_dbgmsg(spa, "%s: failed to meet weight " 2330224177Smm "requirement: vdev %llu, txg %llu, mg %p, " 2331307279Smav "msp %p, asize %llu, " 2332265741Sdelphij "weight %llu", spa_name(spa), 2333265741Sdelphij mg->mg_vd->vdev_id, txg, 2334307279Smav mg, msp, asize, msp->ms_weight); 2335168404Spjd mutex_exit(&mg->mg_lock); 2336168404Spjd return (-1ULL); 2337168404Spjd } 2338247398Smm 2339247398Smm /* 2340247398Smm * If the selected metaslab is condensing, skip it. 2341247398Smm */ 2342262093Savg if (msp->ms_condensing) 2343247398Smm continue; 2344247398Smm 2345209962Smm was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2346168404Spjd if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2347168404Spjd break; 2348168404Spjd 2349168404Spjd target_distance = min_distance + 2350262093Savg (space_map_allocated(msp->ms_sm) != 0 ? 0 : 2351262093Savg min_distance >> 1); 2352168404Spjd 2353168404Spjd for (i = 0; i < d; i++) 2354168404Spjd if (metaslab_distance(msp, &dva[i]) < 2355168404Spjd target_distance) 2356168404Spjd break; 2357168404Spjd if (i == d) 2358168404Spjd break; 2359168404Spjd } 2360168404Spjd mutex_exit(&mg->mg_lock); 2361168404Spjd if (msp == NULL) 2362168404Spjd return (-1ULL); 2363168404Spjd 2364260768Savg mutex_enter(&msp->ms_lock); 2365260768Savg 2366224177Smm /* 2367168404Spjd * Ensure that the metaslab we have selected is still 2368168404Spjd * capable of handling our request. It's possible that 2369168404Spjd * another thread may have changed the weight while we 2370168404Spjd * were blocked on the metaslab lock. 
2371168404Spjd */ 2372224177Smm if (msp->ms_weight < asize || (was_active && 2373209962Smm !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2374209962Smm activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2375168404Spjd mutex_exit(&msp->ms_lock); 2376168404Spjd continue; 2377168404Spjd } 2378168404Spjd 2379168404Spjd if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2380168404Spjd activation_weight == METASLAB_WEIGHT_PRIMARY) { 2381168404Spjd metaslab_passivate(msp, 2382168404Spjd msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2383168404Spjd mutex_exit(&msp->ms_lock); 2384168404Spjd continue; 2385168404Spjd } 2386168404Spjd 2387224177Smm if (metaslab_activate(msp, activation_weight) != 0) { 2388168404Spjd mutex_exit(&msp->ms_lock); 2389168404Spjd continue; 2390168404Spjd } 2391168404Spjd 2392247398Smm /* 2393247398Smm * If this metaslab is currently condensing then pick again as 2394247398Smm * we can't manipulate this metaslab until it's committed 2395247398Smm * to disk. 2396247398Smm */ 2397262093Savg if (msp->ms_condensing) { 2398247398Smm mutex_exit(&msp->ms_lock); 2399247398Smm continue; 2400247398Smm } 2401247398Smm 2402262093Savg if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2403168404Spjd break; 2404168404Spjd 2405262093Savg metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2406168404Spjd mutex_exit(&msp->ms_lock); 2407168404Spjd } 2408168404Spjd 2409262093Savg if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2410168404Spjd vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2411168404Spjd 2412262093Savg range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2413262093Savg msp->ms_access_txg = txg + metaslab_unload_delay; 2414168404Spjd 2415168404Spjd mutex_exit(&msp->ms_lock); 2416168404Spjd return (offset); 2417168404Spjd} 2418168404Spjd 2419168404Spjd/* 2420168404Spjd * Allocate a block for the specified i/o. 2421168404Spjd */ 2422168404Spjdstatic int 2423185029Spjdmetaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2424185029Spjd dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2425168404Spjd{ 2426168404Spjd metaslab_group_t *mg, *rotor; 2427168404Spjd vdev_t *vd; 2428168404Spjd int dshift = 3; 2429168404Spjd int all_zero; 2430209962Smm int zio_lock = B_FALSE; 2431209962Smm boolean_t allocatable; 2432168404Spjd uint64_t asize; 2433168404Spjd uint64_t distance; 2434168404Spjd 2435168404Spjd ASSERT(!DVA_IS_VALID(&dva[d])); 2436168404Spjd 2437185029Spjd /* 2438185029Spjd * For testing, make some blocks above a certain size be gang blocks. 2439185029Spjd */ 2440219089Spjd if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2441249195Smm return (SET_ERROR(ENOSPC)); 2442168404Spjd 2443168404Spjd /* 2444168404Spjd * Start at the rotor and loop through all mgs until we find something. 2445219089Spjd * Note that there's no locking on mc_rotor or mc_aliquot because 2446168404Spjd * nothing actually breaks if we miss a few updates -- we just won't 2447168404Spjd * allocate quite as evenly. It all balances out over time. 2448168404Spjd * 2449168404Spjd * If we are doing ditto or log blocks, try to spread them across 2450168404Spjd * consecutive vdevs. If we're forced to reuse a vdev before we've 2451168404Spjd * allocated all of our ditto blocks, then try and spread them out on 2452168404Spjd * that vdev as much as possible. If it turns out to not be possible, 2453168404Spjd * gradually lower our standards until anything becomes acceptable. 
2454168404Spjd * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2455168404Spjd * gives us hope of containing our fault domains to something we're 2456168404Spjd * able to reason about. Otherwise, any two top-level vdev failures 2457168404Spjd * will guarantee the loss of data. With consecutive allocation, 2458168404Spjd * only two adjacent top-level vdev failures will result in data loss. 2459168404Spjd * 2460168404Spjd * If we are doing gang blocks (hintdva is non-NULL), try to keep 2461168404Spjd * ourselves on the same vdev as our gang block header. That 2462168404Spjd * way, we can hope for locality in vdev_cache, plus it makes our 2463168404Spjd * fault domains something tractable. 2464168404Spjd */ 2465168404Spjd if (hintdva) { 2466168404Spjd vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 2467219089Spjd 2468219089Spjd /* 2469219089Spjd * It's possible the vdev we're using as the hint no 2470219089Spjd * longer exists (i.e. removed). Consult the rotor when 2471219089Spjd * all else fails. 2472219089Spjd */ 2473219089Spjd if (vd != NULL) { 2474168404Spjd mg = vd->vdev_mg; 2475219089Spjd 2476219089Spjd if (flags & METASLAB_HINTBP_AVOID && 2477219089Spjd mg->mg_next != NULL) 2478219089Spjd mg = mg->mg_next; 2479219089Spjd } else { 2480219089Spjd mg = mc->mc_rotor; 2481219089Spjd } 2482168404Spjd } else if (d != 0) { 2483168404Spjd vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 2484168404Spjd mg = vd->vdev_mg->mg_next; 2485168404Spjd } else { 2486168404Spjd mg = mc->mc_rotor; 2487168404Spjd } 2488185029Spjd 2489185029Spjd /* 2490219089Spjd * If the hint put us into the wrong metaslab class, or into a 2491219089Spjd * metaslab group that has been passivated, just follow the rotor. 2492185029Spjd */ 2493219089Spjd if (mg->mg_class != mc || mg->mg_activation_count <= 0) 2494185029Spjd mg = mc->mc_rotor; 2495185029Spjd 2496168404Spjd rotor = mg; 2497168404Spjdtop: 2498168404Spjd all_zero = B_TRUE; 2499168404Spjd do { 2500219089Spjd ASSERT(mg->mg_activation_count == 1); 2501168404Spjd vd = mg->mg_vd; 2502209962Smm 2503185029Spjd /* 2504185029Spjd * Don't allocate from faulted devices. 2505185029Spjd */ 2506209962Smm if (zio_lock) { 2507209962Smm spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 2508209962Smm allocatable = vdev_allocatable(vd); 2509209962Smm spa_config_exit(spa, SCL_ZIO, FTAG); 2510209962Smm } else { 2511209962Smm allocatable = vdev_allocatable(vd); 2512209962Smm } 2513260768Savg 2514260768Savg /* 2515260768Savg * Determine if the selected metaslab group is eligible 2516307279Smav * for allocations. If we're ganging then don't allow 2517307279Smav * this metaslab group to skip allocations since that would 2518307279Smav * inadvertently return ENOSPC and suspend the pool 2519260768Savg * even though space is still available. 2520260768Savg */ 2521307279Smav if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) { 2522307279Smav allocatable = metaslab_group_allocatable(mg, rotor, 2523307279Smav psize); 2524307279Smav } 2525260768Savg 2526209962Smm if (!allocatable) 2527185029Spjd goto next; 2528209962Smm 2529307279Smav ASSERT(mg->mg_initialized); 2530307279Smav 2531185029Spjd /* 2532307279Smav * Avoid writing single-copy data to a failing vdev. 
2533185029Spjd */ 2534185029Spjd if ((vd->vdev_stat.vs_write_errors > 0 || 2535185029Spjd vd->vdev_state < VDEV_STATE_HEALTHY) && 2536269773Sdelphij d == 0 && dshift == 3 && vd->vdev_children == 0) { 2537185029Spjd all_zero = B_FALSE; 2538185029Spjd goto next; 2539185029Spjd } 2540168404Spjd 2541185029Spjd ASSERT(mg->mg_class == mc); 2542185029Spjd 2543168404Spjd distance = vd->vdev_asize >> dshift; 2544168404Spjd if (distance <= (1ULL << vd->vdev_ms_shift)) 2545168404Spjd distance = 0; 2546168404Spjd else 2547168404Spjd all_zero = B_FALSE; 2548168404Spjd 2549168404Spjd asize = vdev_psize_to_asize(vd, psize); 2550168404Spjd ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 2551168404Spjd 2552307279Smav uint64_t offset = metaslab_group_alloc(mg, asize, txg, 2553307279Smav distance, dva, d); 2554307279Smav 2555307279Smav mutex_enter(&mg->mg_lock); 2556307279Smav if (offset == -1ULL) { 2557307279Smav mg->mg_failed_allocations++; 2558307279Smav if (asize == SPA_GANGBLOCKSIZE) { 2559307279Smav /* 2560307279Smav * This metaslab group was unable to allocate 2561307279Smav * the minimum gang block size so it must be 2562307279Smav * out of space. We must notify the allocation 2563307279Smav * throttle to start skipping allocation 2564307279Smav * attempts to this metaslab group until more 2565307279Smav * space becomes available. 2566307279Smav * 2567307279Smav * Note: this failure cannot be caused by the 2568307279Smav * allocation throttle since the allocation 2569307279Smav * throttle is only responsible for skipping 2570307279Smav * devices and not failing block allocations. 2571307279Smav */ 2572307279Smav mg->mg_no_free_space = B_TRUE; 2573307279Smav } 2574307279Smav } 2575307279Smav mg->mg_allocations++; 2576307279Smav mutex_exit(&mg->mg_lock); 2577307279Smav 2578168404Spjd if (offset != -1ULL) { 2579168404Spjd /* 2580168404Spjd * If we've just selected this metaslab group, 2581168404Spjd * figure out whether the corresponding vdev is 2582168404Spjd * over- or under-used relative to the pool, 2583168404Spjd * and set an allocation bias to even it out. 2584168404Spjd */ 2585269773Sdelphij if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2586168404Spjd vdev_stat_t *vs = &vd->vdev_stat; 2587219089Spjd int64_t vu, cu; 2588168404Spjd 2589224177Smm vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2590224177Smm cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2591168404Spjd 2592168404Spjd /* 2593224177Smm * Calculate how much more or less we should 2594224177Smm * try to allocate from this device during 2595224177Smm * this iteration around the rotor. 2596224177Smm * For example, if a device is 80% full 2597224177Smm * and the pool is 20% full then we should 2598224177Smm * reduce allocations by 60% on this device. 2599224177Smm * 2600224177Smm * mg_bias = (20 - 80) * 512K / 100 = -307K 2601224177Smm * 2602224177Smm * This reduces allocations by 307K for this 2603224177Smm * iteration. 
2604168404Spjd */ 2605219089Spjd mg->mg_bias = ((cu - vu) * 2606224177Smm (int64_t)mg->mg_aliquot) / 100; 2607269773Sdelphij } else if (!metaslab_bias_enabled) { 2608269773Sdelphij mg->mg_bias = 0; 2609168404Spjd } 2610168404Spjd 2611219089Spjd if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2612168404Spjd mg->mg_aliquot + mg->mg_bias) { 2613168404Spjd mc->mc_rotor = mg->mg_next; 2614219089Spjd mc->mc_aliquot = 0; 2615168404Spjd } 2616168404Spjd 2617168404Spjd DVA_SET_VDEV(&dva[d], vd->vdev_id); 2618168404Spjd DVA_SET_OFFSET(&dva[d], offset); 2619185029Spjd DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2620168404Spjd DVA_SET_ASIZE(&dva[d], asize); 2621168404Spjd 2622168404Spjd return (0); 2623168404Spjd } 2624185029Spjdnext: 2625168404Spjd mc->mc_rotor = mg->mg_next; 2626219089Spjd mc->mc_aliquot = 0; 2627168404Spjd } while ((mg = mg->mg_next) != rotor); 2628168404Spjd 2629168404Spjd if (!all_zero) { 2630168404Spjd dshift++; 2631168404Spjd ASSERT(dshift < 64); 2632168404Spjd goto top; 2633168404Spjd } 2634168404Spjd 2635209962Smm if (!allocatable && !zio_lock) { 2636209962Smm dshift = 3; 2637209962Smm zio_lock = B_TRUE; 2638209962Smm goto top; 2639209962Smm } 2640209962Smm 2641168404Spjd bzero(&dva[d], sizeof (dva_t)); 2642168404Spjd 2643249195Smm return (SET_ERROR(ENOSPC)); 2644168404Spjd} 2645168404Spjd 2646168404Spjd/* 2647168404Spjd * Free the block represented by DVA in the context of the specified 2648168404Spjd * transaction group. 2649168404Spjd */ 2650168404Spjdstatic void 2651168404Spjdmetaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2652168404Spjd{ 2653168404Spjd uint64_t vdev = DVA_GET_VDEV(dva); 2654168404Spjd uint64_t offset = DVA_GET_OFFSET(dva); 2655168404Spjd uint64_t size = DVA_GET_ASIZE(dva); 2656168404Spjd vdev_t *vd; 2657168404Spjd metaslab_t *msp; 2658168404Spjd 2659168404Spjd ASSERT(DVA_IS_VALID(dva)); 2660168404Spjd 2661168404Spjd if (txg > spa_freeze_txg(spa)) 2662168404Spjd return; 2663168404Spjd 2664168404Spjd if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2665168404Spjd (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2666168404Spjd cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2667168404Spjd (u_longlong_t)vdev, (u_longlong_t)offset); 2668168404Spjd ASSERT(0); 2669168404Spjd return; 2670168404Spjd } 2671168404Spjd 2672168404Spjd msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2673168404Spjd 2674168404Spjd if (DVA_GET_GANG(dva)) 2675168404Spjd size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2676168404Spjd 2677168404Spjd mutex_enter(&msp->ms_lock); 2678168404Spjd 2679168404Spjd if (now) { 2680262093Savg range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2681168404Spjd offset, size); 2682262093Savg 2683262093Savg VERIFY(!msp->ms_condensing); 2684262093Savg VERIFY3U(offset, >=, msp->ms_start); 2685262093Savg VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2686262093Savg VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2687262093Savg msp->ms_size); 2688262093Savg VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2689262093Savg VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2690262093Savg range_tree_add(msp->ms_tree, offset, size); 2691168404Spjd } else { 2692262093Savg if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2693168404Spjd vdev_dirty(vd, VDD_METASLAB, msp, txg); 2694262093Savg range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2695262093Savg offset, size); 2696168404Spjd } 2697168404Spjd 2698168404Spjd mutex_exit(&msp->ms_lock); 2699168404Spjd} 2700168404Spjd 
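/*
 * The life cycle of a freed segment (as implemented by
 * metaslab_free_dva() above together with metaslab_sync() and
 * metaslab_sync_done()): a free issued in open context lands in
 * ms_freetree[txg & TXG_MASK]; at sync time it is written to the space
 * map and moved to the freed tree for that txg; at sync-done time it is
 * swapped into a defer tree; and only TXG_DEFER_SIZE txgs later is it
 * added back to ms_tree (when the metaslab is loaded), where it becomes
 * allocatable again. A free issued with 'now' set (e.g. when
 * metaslab_alloc() unwinds a partial allocation) skips the deferral and
 * goes straight back to ms_tree.
 */
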
2701168404Spjd/* 2702168404Spjd * Intent log support: upon opening the pool after a crash, notify the SPA 2703168404Spjd * of blocks that the intent log has allocated for immediate write, but 2704168404Spjd * which are still considered free by the SPA because the last transaction 2705168404Spjd * group didn't commit yet. 2706168404Spjd */ 2707168404Spjdstatic int 2708168404Spjdmetaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2709168404Spjd{ 2710168404Spjd uint64_t vdev = DVA_GET_VDEV(dva); 2711168404Spjd uint64_t offset = DVA_GET_OFFSET(dva); 2712168404Spjd uint64_t size = DVA_GET_ASIZE(dva); 2713168404Spjd vdev_t *vd; 2714168404Spjd metaslab_t *msp; 2715219089Spjd int error = 0; 2716168404Spjd 2717168404Spjd ASSERT(DVA_IS_VALID(dva)); 2718168404Spjd 2719168404Spjd if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2720168404Spjd (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2721249195Smm return (SET_ERROR(ENXIO)); 2722168404Spjd 2723168404Spjd msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2724168404Spjd 2725168404Spjd if (DVA_GET_GANG(dva)) 2726168404Spjd size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2727168404Spjd 2728168404Spjd mutex_enter(&msp->ms_lock); 2729168404Spjd 2730262093Savg if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2731224177Smm error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2732219089Spjd 2733262093Savg if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2734249195Smm error = SET_ERROR(ENOENT); 2735219089Spjd 2736185029Spjd if (error || txg == 0) { /* txg == 0 indicates dry run */ 2737168404Spjd mutex_exit(&msp->ms_lock); 2738168404Spjd return (error); 2739168404Spjd } 2740168404Spjd 2741262093Savg VERIFY(!msp->ms_condensing); 2742262093Savg VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2743262093Savg VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2744262093Savg VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2745262093Savg range_tree_remove(msp->ms_tree, offset, size); 2746168404Spjd 2747209962Smm if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2748262093Savg if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2749185029Spjd vdev_dirty(vd, VDD_METASLAB, msp, txg); 2750262093Savg range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2751185029Spjd } 2752185029Spjd 2753168404Spjd mutex_exit(&msp->ms_lock); 2754168404Spjd 2755168404Spjd return (0); 2756168404Spjd} 2757168404Spjd 2758307279Smav/* 2759307279Smav * Reserve some allocation slots. The reservation system must be called 2760307279Smav * before we call into the allocator. If there aren't any available slots 2761307279Smav * then the I/O will be throttled until an I/O completes and its slots are 2762307279Smav * freed up. The function returns true if it was successful in placing 2763307279Smav * the reservation. 
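 *
 * Illustrative usage pattern (the caller sketched below is
 * hypothetical; the signatures are the ones defined in this file):
 *
 *	if (metaslab_class_throttle_reserve(mc, ndvas, zio, flags)) {
 *		error = metaslab_alloc(spa, mc, psize, bp, ndvas,
 *		    txg, hintbp, flags, zio);
 *	}
 *	... and once the allocating zio completes ...
 *	metaslab_class_throttle_unreserve(mc, ndvas, zio);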
2764307279Smav */ 2765307279Smavboolean_t 2766307279Smavmetaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, 2767307279Smav int flags) 2768307279Smav{ 2769307279Smav uint64_t available_slots = 0; 2770307279Smav boolean_t slot_reserved = B_FALSE; 2771307279Smav 2772307279Smav ASSERT(mc->mc_alloc_throttle_enabled); 2773307279Smav mutex_enter(&mc->mc_lock); 2774307279Smav 2775307279Smav uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); 2776307279Smav if (reserved_slots < mc->mc_alloc_max_slots) 2777307279Smav available_slots = mc->mc_alloc_max_slots - reserved_slots; 2778307279Smav 2779307279Smav if (slots <= available_slots || GANG_ALLOCATION(flags)) { 2780307279Smav /* 2781307279Smav * We reserve the slots individually so that we can unreserve 2782307279Smav * them individually when an I/O completes. 2783307279Smav */ 2784307279Smav for (int d = 0; d < slots; d++) { 2785307279Smav reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); 2786307279Smav } 2787307279Smav zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 2788307279Smav slot_reserved = B_TRUE; 2789307279Smav } 2790307279Smav 2791307279Smav mutex_exit(&mc->mc_lock); 2792307279Smav return (slot_reserved); 2793307279Smav} 2794307279Smav 2795307279Smavvoid 2796307279Smavmetaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) 2797307279Smav{ 2798307279Smav ASSERT(mc->mc_alloc_throttle_enabled); 2799307279Smav mutex_enter(&mc->mc_lock); 2800307279Smav for (int d = 0; d < slots; d++) { 2801307279Smav (void) refcount_remove(&mc->mc_alloc_slots, zio); 2802307279Smav } 2803307279Smav mutex_exit(&mc->mc_lock); 2804307279Smav} 2805307279Smav 2806168404Spjdint 2807185029Spjdmetaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2808307279Smav int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio) 2809168404Spjd{ 2810168404Spjd dva_t *dva = bp->blk_dva; 2811168404Spjd dva_t *hintdva = hintbp->blk_dva; 2812168404Spjd int error = 0; 2813168404Spjd 2814185029Spjd ASSERT(bp->blk_birth == 0); 2815219089Spjd ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2816185029Spjd 2817185029Spjd spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2818185029Spjd 2819185029Spjd if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2820185029Spjd spa_config_exit(spa, SCL_ALLOC, FTAG); 2821249195Smm return (SET_ERROR(ENOSPC)); 2822185029Spjd } 2823185029Spjd 2824168404Spjd ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2825168404Spjd ASSERT(BP_GET_NDVAS(bp) == 0); 2826168404Spjd ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2827168404Spjd 2828185029Spjd for (int d = 0; d < ndvas; d++) { 2829185029Spjd error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2830185029Spjd txg, flags); 2831262093Savg if (error != 0) { 2832168404Spjd for (d--; d >= 0; d--) { 2833168404Spjd metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2834307279Smav metaslab_group_alloc_decrement(spa, 2835307279Smav DVA_GET_VDEV(&dva[d]), zio, flags); 2836168404Spjd bzero(&dva[d], sizeof (dva_t)); 2837168404Spjd } 2838185029Spjd spa_config_exit(spa, SCL_ALLOC, FTAG); 2839168404Spjd return (error); 2840307279Smav } else { 2841307279Smav /* 2842307279Smav * Update the metaslab group's queue depth 2843307279Smav * based on the newly allocated dva. 
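			 * Each successful DVA increments its group's count
			 * of in-flight allocations; the error path above
			 * calls metaslab_group_alloc_decrement() for every
			 * DVA it unwinds, so the accounting stays balanced
			 * when a later copy of the same block fails to
			 * allocate.  That count is what the allocation
			 * throttle uses to spread new allocations across
			 * metaslab groups.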
2844307279Smav */ 2845307279Smav metaslab_group_alloc_increment(spa, 2846307279Smav DVA_GET_VDEV(&dva[d]), zio, flags); 2847168404Spjd } 2848307279Smav 2849168404Spjd } 2850168404Spjd ASSERT(error == 0); 2851168404Spjd ASSERT(BP_GET_NDVAS(bp) == ndvas); 2852168404Spjd 2853185029Spjd spa_config_exit(spa, SCL_ALLOC, FTAG); 2854185029Spjd 2855219089Spjd BP_SET_BIRTH(bp, txg, txg); 2856185029Spjd 2857168404Spjd return (0); 2858168404Spjd} 2859168404Spjd 2860168404Spjdvoid 2861168404Spjdmetaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2862168404Spjd{ 2863168404Spjd const dva_t *dva = bp->blk_dva; 2864168404Spjd int ndvas = BP_GET_NDVAS(bp); 2865168404Spjd 2866168404Spjd ASSERT(!BP_IS_HOLE(bp)); 2867219089Spjd ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2868168404Spjd 2869185029Spjd spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2870185029Spjd 2871185029Spjd for (int d = 0; d < ndvas; d++) 2872168404Spjd metaslab_free_dva(spa, &dva[d], txg, now); 2873185029Spjd 2874185029Spjd spa_config_exit(spa, SCL_FREE, FTAG); 2875168404Spjd} 2876168404Spjd 2877168404Spjdint 2878168404Spjdmetaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2879168404Spjd{ 2880168404Spjd const dva_t *dva = bp->blk_dva; 2881168404Spjd int ndvas = BP_GET_NDVAS(bp); 2882185029Spjd int error = 0; 2883168404Spjd 2884168404Spjd ASSERT(!BP_IS_HOLE(bp)); 2885168404Spjd 2886185029Spjd if (txg != 0) { 2887185029Spjd /* 2888185029Spjd * First do a dry run to make sure all DVAs are claimable, 2889185029Spjd * so we don't have to unwind from partial failures below. 2890185029Spjd */ 2891185029Spjd if ((error = metaslab_claim(spa, bp, 0)) != 0) 2892185029Spjd return (error); 2893185029Spjd } 2894185029Spjd 2895185029Spjd spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2896185029Spjd 2897185029Spjd for (int d = 0; d < ndvas; d++) 2898168404Spjd if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2899185029Spjd break; 2900168404Spjd 2901185029Spjd spa_config_exit(spa, SCL_ALLOC, FTAG); 2902185029Spjd 2903185029Spjd ASSERT(error == 0 || txg == 0); 2904185029Spjd 2905185029Spjd return (error); 2906168404Spjd} 2907248571Smm 2908248571Smmvoid 2909248571Smmmetaslab_check_free(spa_t *spa, const blkptr_t *bp) 2910248571Smm{ 2911248571Smm if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2912248571Smm return; 2913248571Smm 2914248571Smm spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2915248571Smm for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2916262093Savg uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2917262093Savg vdev_t *vd = vdev_lookup_top(spa, vdev); 2918262093Savg uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2919248571Smm uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2920262093Savg metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2921248571Smm 2922262093Savg if (msp->ms_loaded) 2923262093Savg range_tree_verify(msp->ms_tree, offset, size); 2924248571Smm 2925248571Smm for (int j = 0; j < TXG_SIZE; j++) 2926262093Savg range_tree_verify(msp->ms_freetree[j], offset, size); 2927248571Smm for (int j = 0; j < TXG_DEFER_SIZE; j++) 2928262093Savg range_tree_verify(msp->ms_defertree[j], offset, size); 2929248571Smm } 2930248571Smm spa_config_exit(spa, SCL_VDEV, FTAG); 2931248571Smm} 2932
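/*
 * Illustrative lifecycle of the entry points above (a sketch with
 * hypothetical caller state; `spa', `bp', `psize', `ndvas', `txg' and
 * `zio' are assumed to come from the caller, and spa_normal_class()
 * simply names a metaslab class):
 *
 *	error = metaslab_alloc(spa, spa_normal_class(spa), psize, bp,
 *	    ndvas, txg, NULL, 0, zio);	(fills in the DVAs, sets birth txg)
 *	...
 *	metaslab_free(spa, bp, txg, B_FALSE);	(queues every DVA to be
 *						freed when txg syncs)
 *
 * Claiming a ZIL block after a crash follows the dry-run convention
 * noted earlier: metaslab_claim(spa, bp, 0) only verifies that every
 * DVA is still free, while a nonzero txg actually removes the claimed
 * ranges from their metaslabs.
 */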