/* metaslab.c, revision 307267 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");

/*
 * Allow allocations to switch to gang blocks quickly. We do this to
 * avoid having to load lots of space_maps in a given txg. There are,
 * however, some cases where we want to avoid "fast" ganging and instead
 * we want to do an exhaustive search of all metaslabs on this device.
 * Currently we don't allow any gang, slog, or dump device related allocations
 * to "fast" gang.
 */
#define CAN_FASTGANG(flags) \
        (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
        METASLAB_GANG_AVOID)))

#define METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
#define METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
#define METASLAB_ACTIVE_MASK            \
        (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;     /* force gang blocks */
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
    &metaslab_gang_bang, 0,
    "Force gang block allocation for blocks larger than or equal to this value");

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;
SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
    &zfs_condense_pct, 0,
    "Condense on-disk spacemap when it is more than this many percents"
    " of in-memory counterpart");
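/*
 * Worked example (hypothetical sizes): with the default zfs_condense_pct
 * of 200, an in-core range tree that would serialize to 1 MB makes the
 * metaslab a condensing candidate only once its current on-disk space
 * map exceeds 1 MB * 200 / 100 = 2 MB, i.e. the on-disk form must be at
 * least twice the size of its optimal representation.
 */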
/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
    &zfs_mg_noalloc_threshold, 0,
    "Percentage of metaslab group size that should be free"
    " to make it eligible for allocation");

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_mg_fragmentation_threshold, 0,
    "Percentage of metaslab group size that should be considered "
    "eligible for allocations unless all metaslab groups within the metaslab class "
    "have also crossed this threshold");
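/*
 * Illustration (hypothetical pool): with zfs_mg_noalloc_threshold = 30
 * and the default zfs_mg_fragmentation_threshold = 85, a group whose
 * free capacity has fallen to 25% or whose fragmentation metric has
 * risen to 90 is skipped by the allocator, unless every other group in
 * the class is in the same state, in which case all groups accept
 * allocations again.
 */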
/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_metaslab_fragmentation_threshold, 0,
    "Maximum percentage of metaslab fragmentation level to keep their active state");

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
    &metaslab_debug_load, 0,
    "Load all metaslabs when pool is first opened");

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
    &metaslab_debug_unload, 0,
    "Prevent metaslabs from being unloaded");

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size then it switches to using more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
    &metaslab_df_alloc_threshold, 0,
    "Minimum size which forces the dynamic allocator to change its allocation strategy");

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
    &metaslab_df_free_pct, 0,
    "The minimum free space, in percent, which must be available in a "
    "space map to continue allocations in a first-fit fashion");
/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
    &metaslab_min_alloc_size, 0,
    "A metaslab is considered \"free\" if it contains a contiguous "
    "segment which is greater than vfs.zfs.metaslab.min_alloc_size");

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
    &metaslab_load_pct, 0,
    "Percentage of cpus that can be used by the metaslab taskq");

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
    &metaslab_unload_delay, 0,
    "Number of TXGs that an unused metaslab can be kept in memory");

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
    &metaslab_preload_limit, 0,
    "Max number of metaslabs per group to preload");

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
    &metaslab_preload_enabled, 0,
    "Enable preloading of metaslabs");

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
    &metaslab_fragmentation_factor_enabled, 0,
    "Enable fragmentation weighting on metaslabs");

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
    &metaslab_lba_weighting_enabled, 0,
    "Enable LBA weighting (i.e. outer tracks are given preference)");

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
    &metaslab_bias_enabled, 0,
    "Enable metaslab group biasing");

static uint64_t metaslab_fragmentation(metaslab_t *);
/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
        metaslab_class_t *mc;

        mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

        mc->mc_spa = spa;
        mc->mc_rotor = NULL;
        mc->mc_ops = ops;

        return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
        ASSERT(mc->mc_rotor == NULL);
        ASSERT(mc->mc_alloc == 0);
        ASSERT(mc->mc_deferred == 0);
        ASSERT(mc->mc_space == 0);
        ASSERT(mc->mc_dspace == 0);

        kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
        metaslab_group_t *mg;
        vdev_t *vd;

        /*
         * Must hold one of the spa_config locks.
         */
        ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
            spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

        if ((mg = mc->mc_rotor) == NULL)
                return (0);

        do {
                vd = mg->mg_vd;
                ASSERT(vd->vdev_mg != NULL);
                ASSERT3P(vd->vdev_top, ==, vd);
                ASSERT3P(mg->mg_class, ==, mc);
                ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
        } while ((mg = mg->mg_next) != mc->mc_rotor);

        return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
        atomic_add_64(&mc->mc_alloc, alloc_delta);
        atomic_add_64(&mc->mc_deferred, defer_delta);
        atomic_add_64(&mc->mc_space, space_delta);
        atomic_add_64(&mc->mc_dspace, dspace_delta);
}

void
metaslab_class_minblocksize_update(metaslab_class_t *mc)
{
        metaslab_group_t *mg;
        vdev_t *vd;
        uint64_t minashift = UINT64_MAX;

        if ((mg = mc->mc_rotor) == NULL) {
                mc->mc_minblocksize = SPA_MINBLOCKSIZE;
                return;
        }

        do {
                vd = mg->mg_vd;
                if (vd->vdev_ashift < minashift)
                        minashift = vd->vdev_ashift;
        } while ((mg = mg->mg_next) != mc->mc_rotor);

        mc->mc_minblocksize = 1ULL << minashift;
}
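/*
 * For illustration: in a class whose rotor holds two top-level vdevs
 * with ashifts of 9 and 12, the loop above finds minashift = 9, so
 * mc_minblocksize becomes 1ULL << 9 = 512 bytes, the smallest block
 * any member vdev can address.
 */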
uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
        return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
        return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
        return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
        return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

uint64_t
metaslab_class_get_minblocksize(metaslab_class_t *mc)
{
        return (mc->mc_minblocksize);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
        vdev_t *rvd = mc->mc_spa->spa_root_vdev;
        uint64_t *mc_hist;
        int i;

        if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
                return;

        mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
            KM_SLEEP);

        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *tvd = rvd->vdev_child[c];
                metaslab_group_t *mg = tvd->vdev_mg;

                /*
                 * Skip any holes, uninitialized top-levels, or
                 * vdevs that are not in this metaslab class.
                 */
                if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
                    mg->mg_class != mc) {
                        continue;
                }

                for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
                        mc_hist[i] += mg->mg_histogram[i];
        }

        for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
                VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

        kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
        vdev_t *rvd = mc->mc_spa->spa_root_vdev;
        uint64_t fragmentation = 0;

        spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *tvd = rvd->vdev_child[c];
                metaslab_group_t *mg = tvd->vdev_mg;

                /*
                 * Skip any holes, uninitialized top-levels, or
                 * vdevs that are not in this metaslab class.
                 */
                if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
                    mg->mg_class != mc) {
                        continue;
                }

                /*
                 * If a metaslab group does not contain a fragmentation
                 * metric then just bail out.
                 */
                if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
                        spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
                        return (ZFS_FRAG_INVALID);
                }

                /*
                 * Determine how much this metaslab_group is contributing
                 * to the overall pool fragmentation metric.
                 */
                fragmentation += mg->mg_fragmentation *
                    metaslab_group_get_space(mg);
        }
        fragmentation /= metaslab_class_get_space(mc);

        ASSERT3U(fragmentation, <=, 100);
        spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
        return (fragmentation);
}
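/*
 * Worked example (hypothetical groups): a class with two groups, one
 * providing 3 TB of space at fragmentation 60 and one providing 1 TB
 * at fragmentation 20, yields (60 * 3T + 20 * 1T) / 4T = 50, i.e. the
 * larger group dominates the class-wide metric.
 */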
/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
        vdev_t *rvd = mc->mc_spa->spa_root_vdev;
        uint64_t space = 0;

        spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *tvd = rvd->vdev_child[c];
                metaslab_group_t *mg = tvd->vdev_mg;

                if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
                    mg->mg_class != mc) {
                        continue;
                }

                /*
                 * Calculate if we have enough space to add additional
                 * metaslabs. We report the expandable space in terms
                 * of the metaslab size since that's the unit of expansion.
                 */
                space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
                    1ULL << tvd->vdev_ms_shift);
        }
        spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
        return (space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
        const metaslab_t *m1 = x1;
        const metaslab_t *m2 = x2;

        if (m1->ms_weight < m2->ms_weight)
                return (1);
        if (m1->ms_weight > m2->ms_weight)
                return (-1);

        /*
         * If the weights are identical, use the offset to force uniqueness.
         */
        if (m1->ms_start < m2->ms_start)
                return (-1);
        if (m1->ms_start > m2->ms_start)
                return (1);

        ASSERT3P(m1, ==, m2);

        return (0);
}
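/*
 * A consequence of the comparator above: an in-order walk of
 * mg_metaslab_tree visits metaslabs from heaviest to lightest weight,
 * with ties broken by ascending start offset, so avl_first() is always
 * the most desirable allocation candidate in the group.
 */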
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the group's free capacity
 * is above the zfs_mg_noalloc_threshold. If a metaslab group transitions
 * from allocatable to non-allocatable or vice versa then the metaslab
 * group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
        vdev_t *vd = mg->mg_vd;
        metaslab_class_t *mc = mg->mg_class;
        vdev_stat_t *vs = &vd->vdev_stat;
        boolean_t was_allocatable;

        ASSERT(vd == vd->vdev_top);

        mutex_enter(&mg->mg_lock);
        was_allocatable = mg->mg_allocatable;

        mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
            (vs->vs_space + 1);

        /*
         * A metaslab group is considered allocatable if it has plenty
         * of free space or is not heavily fragmented. We only take
         * fragmentation into account if the metaslab group has a valid
         * fragmentation metric (i.e. a value between 0 and 100).
         */
        mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
            (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
            mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

        /*
         * The mc_alloc_groups field maintains a count of the number of
         * groups in this metaslab class that are still above the
         * zfs_mg_noalloc_threshold. This is used by the allocating
         * threads to determine if they should avoid allocations to
         * a given group. The allocator will avoid allocations to a group
         * if that group has reached or is below the zfs_mg_noalloc_threshold
         * and there are still other groups that are above the threshold.
         * When a group transitions from allocatable to non-allocatable or
         * vice versa we update the metaslab class to reflect that change.
         * When the mc_alloc_groups value drops to 0 that means that all
         * groups have reached the zfs_mg_noalloc_threshold making all groups
         * eligible for allocations. This effectively means that all devices
         * are balanced again.
         */
        if (was_allocatable && !mg->mg_allocatable)
                mc->mc_alloc_groups--;
        else if (!was_allocatable && mg->mg_allocatable)
                mc->mc_alloc_groups++;

        mutex_exit(&mg->mg_lock);
}
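/*
 * Worked example (hypothetical stats): with vs_space = 1000 GB and
 * vs_alloc = 920 GB, mg_free_capacity is (80 GB * 100) / (1000 GB + 1),
 * which truncates to 7 percent; the "+ 1" only guards against dividing
 * by zero on an empty vdev. With zfs_mg_noalloc_threshold = 10 this
 * group would become non-allocatable.
 */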
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
        metaslab_group_t *mg;

        mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
        mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
        avl_create(&mg->mg_metaslab_tree, metaslab_compare,
            sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
        mg->mg_vd = vd;
        mg->mg_class = mc;
        mg->mg_activation_count = 0;

        mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
            minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

        return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
        ASSERT(mg->mg_prev == NULL);
        ASSERT(mg->mg_next == NULL);
        /*
         * We may have gone below zero with the activation count
         * either because we never activated in the first place or
         * because we're done, and possibly removing the vdev.
         */
        ASSERT(mg->mg_activation_count <= 0);

        taskq_destroy(mg->mg_taskq);
        avl_destroy(&mg->mg_metaslab_tree);
        mutex_destroy(&mg->mg_lock);
        kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
        metaslab_class_t *mc = mg->mg_class;
        metaslab_group_t *mgprev, *mgnext;

        ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

        ASSERT(mc->mc_rotor != mg);
        ASSERT(mg->mg_prev == NULL);
        ASSERT(mg->mg_next == NULL);
        ASSERT(mg->mg_activation_count <= 0);

        if (++mg->mg_activation_count <= 0)
                return;

        mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
        metaslab_group_alloc_update(mg);

        if ((mgprev = mc->mc_rotor) == NULL) {
                mg->mg_prev = mg;
                mg->mg_next = mg;
        } else {
                mgnext = mgprev->mg_next;
                mg->mg_prev = mgprev;
                mg->mg_next = mgnext;
                mgprev->mg_next = mg;
                mgnext->mg_prev = mg;
        }
        mc->mc_rotor = mg;
        metaslab_class_minblocksize_update(mc);
}
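/*
 * Rotor illustration: activating groups A and then B builds a circular
 * doubly linked list. After A: A.next = A.prev = A and the rotor is A.
 * After B: A.next = B and B.next = A (with the matching prev links),
 * and the rotor moves to B. Allocators walk this ring to spread
 * allocations across the top-level vdevs.
 */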
void
metaslab_group_passivate(metaslab_group_t *mg)
{
        metaslab_class_t *mc = mg->mg_class;
        metaslab_group_t *mgprev, *mgnext;

        ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

        if (--mg->mg_activation_count != 0) {
                ASSERT(mc->mc_rotor != mg);
                ASSERT(mg->mg_prev == NULL);
                ASSERT(mg->mg_next == NULL);
                ASSERT(mg->mg_activation_count < 0);
                return;
        }

        taskq_wait(mg->mg_taskq);
        metaslab_group_alloc_update(mg);

        mgprev = mg->mg_prev;
        mgnext = mg->mg_next;

        if (mg == mgnext) {
                mc->mc_rotor = NULL;
        } else {
                mc->mc_rotor = mgnext;
                mgprev->mg_next = mgnext;
                mgnext->mg_prev = mgprev;
        }

        mg->mg_prev = NULL;
        mg->mg_next = NULL;
        metaslab_class_minblocksize_update(mc);
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
        return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
        uint64_t *mg_hist;
        vdev_t *vd = mg->mg_vd;
        uint64_t ashift = vd->vdev_ashift;
        int i;

        if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
                return;

        mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
            KM_SLEEP);

        ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
            SPACE_MAP_HISTOGRAM_SIZE + ashift);

        for (int m = 0; m < vd->vdev_ms_count; m++) {
                metaslab_t *msp = vd->vdev_ms[m];

                if (msp->ms_sm == NULL)
                        continue;

                for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
                        mg_hist[i + ashift] +=
                            msp->ms_sm->sm_phys->smp_histogram[i];
        }

        for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
                VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

        kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
        metaslab_class_t *mc = mg->mg_class;
        uint64_t ashift = mg->mg_vd->vdev_ashift;

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        if (msp->ms_sm == NULL)
                return;

        mutex_enter(&mg->mg_lock);
        for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
                mg->mg_histogram[i + ashift] +=
                    msp->ms_sm->sm_phys->smp_histogram[i];
                mc->mc_histogram[i + ashift] +=
                    msp->ms_sm->sm_phys->smp_histogram[i];
        }
        mutex_exit(&mg->mg_lock);
}
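/*
 * Note on the "+ ashift" index shift above: space map histogram bucket
 * i counts free segments of size roughly 2^(i + ashift) bytes, while
 * the group and class histograms are indexed by absolute power of two.
 * For example, with ashift = 12 (4 KB sectors), space map bucket 0
 * lands in mg_histogram[12], the 4 KB bucket.
 */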
void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
        metaslab_class_t *mc = mg->mg_class;
        uint64_t ashift = mg->mg_vd->vdev_ashift;

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        if (msp->ms_sm == NULL)
                return;

        mutex_enter(&mg->mg_lock);
        for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
                ASSERT3U(mg->mg_histogram[i + ashift], >=,
                    msp->ms_sm->sm_phys->smp_histogram[i]);
                ASSERT3U(mc->mc_histogram[i + ashift], >=,
                    msp->ms_sm->sm_phys->smp_histogram[i]);

                mg->mg_histogram[i + ashift] -=
                    msp->ms_sm->sm_phys->smp_histogram[i];
                mc->mc_histogram[i + ashift] -=
                    msp->ms_sm->sm_phys->smp_histogram[i];
        }
        mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
        ASSERT(msp->ms_group == NULL);
        mutex_enter(&mg->mg_lock);
        msp->ms_group = mg;
        msp->ms_weight = 0;
        avl_add(&mg->mg_metaslab_tree, msp);
        mutex_exit(&mg->mg_lock);

        mutex_enter(&msp->ms_lock);
        metaslab_group_histogram_add(mg, msp);
        mutex_exit(&msp->ms_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
        mutex_enter(&msp->ms_lock);
        metaslab_group_histogram_remove(mg, msp);
        mutex_exit(&msp->ms_lock);

        mutex_enter(&mg->mg_lock);
        ASSERT(msp->ms_group == mg);
        avl_remove(&mg->mg_metaslab_tree, msp);
        msp->ms_group = NULL;
        mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
        /*
         * Although in principle the weight can be any value, in
         * practice we do not use values in the range [1, 511].
         */
        ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
        ASSERT(MUTEX_HELD(&msp->ms_lock));

        mutex_enter(&mg->mg_lock);
        ASSERT(msp->ms_group == mg);
        avl_remove(&mg->mg_metaslab_tree, msp);
        msp->ms_weight = weight;
        avl_add(&mg->mg_metaslab_tree, msp);
        mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in
 * this group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
        vdev_t *vd = mg->mg_vd;
        uint64_t fragmentation = 0;
        uint64_t valid_ms = 0;

        for (int m = 0; m < vd->vdev_ms_count; m++) {
                metaslab_t *msp = vd->vdev_ms[m];

                if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
                        continue;

                valid_ms++;
                fragmentation += msp->ms_fragmentation;
        }

        if (valid_ms <= vd->vdev_ms_count / 2)
                return (ZFS_FRAG_INVALID);

        fragmentation /= valid_ms;
        ASSERT3U(fragmentation, <=, 100);
        return (fragmentation);
}
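/*
 * For example, a vdev with vdev_ms_count = 200 needs more than 100
 * metaslabs with valid metrics before the group reports a value; with
 * only 80 valid metaslabs the function returns ZFS_FRAG_INVALID rather
 * than extrapolating from a minority of the group.
 */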
/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg)
{
        vdev_t *vd = mg->mg_vd;
        spa_t *spa = vd->vdev_spa;
        metaslab_class_t *mc = mg->mg_class;

        /*
         * We use two key metrics to determine if a metaslab group is
         * considered allocatable -- free space and fragmentation. If
         * the free space is greater than the free space threshold and
         * the fragmentation is less than the fragmentation threshold then
         * consider the group allocatable. There are two cases where we will
         * not consider these key metrics. The first is if the group is
         * associated with a slog device and the second is if all groups
         * in this metaslab class have already been considered ineligible
         * for allocations.
         */
        return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
            (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
            mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) ||
            mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
        const range_seg_t *r1 = x1;
        const range_seg_t *r2 = x2;
        uint64_t rs_size1 = r1->rs_end - r1->rs_start;
        uint64_t rs_size2 = r2->rs_end - r2->rs_start;

        if (rs_size1 < rs_size2)
                return (-1);
        if (rs_size1 > rs_size2)
                return (1);

        if (r1->rs_start < r2->rs_start)
                return (-1);

        if (r1->rs_start > r2->rs_start)
                return (1);

        return (0);
}
/*
 * Create any block allocator specific components. The current allocators
 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 */
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
        metaslab_t *msp = arg;

        ASSERT3P(rt->rt_arg, ==, msp);
        ASSERT(msp->ms_tree == NULL);

        avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
            sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

/*
 * Destroy the block allocator specific components.
 */
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
        metaslab_t *msp = arg;

        ASSERT3P(rt->rt_arg, ==, msp);
        ASSERT3P(msp->ms_tree, ==, rt);
        ASSERT0(avl_numnodes(&msp->ms_size_tree));

        avl_destroy(&msp->ms_size_tree);
}

static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
        metaslab_t *msp = arg;

        ASSERT3P(rt->rt_arg, ==, msp);
        ASSERT3P(msp->ms_tree, ==, rt);
        VERIFY(!msp->ms_condensing);
        avl_add(&msp->ms_size_tree, rs);
}

static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
        metaslab_t *msp = arg;

        ASSERT3P(rt->rt_arg, ==, msp);
        ASSERT3P(msp->ms_tree, ==, rt);
        VERIFY(!msp->ms_condensing);
        avl_remove(&msp->ms_size_tree, rs);
}

static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
        metaslab_t *msp = arg;

        ASSERT3P(rt->rt_arg, ==, msp);
        ASSERT3P(msp->ms_tree, ==, rt);

        /*
         * Normally one would walk the tree freeing nodes along the way.
         * Since the nodes are shared with the range trees we can avoid
         * walking all nodes and just reinitialize the avl tree. The nodes
         * will be freed by the range tree, so we don't want to free them here.
         */
        avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
            sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

static range_tree_ops_t metaslab_rt_ops = {
        metaslab_rt_create,
        metaslab_rt_destroy,
        metaslab_rt_add,
        metaslab_rt_remove,
        metaslab_rt_vacate
};
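/*
 * The ops vector above is a callback pattern: a range tree created with
 * &metaslab_rt_ops (see metaslab_init) mirrors every segment insertion
 * and removal into ms_size_tree, so the metaslab always has both an
 * offset-ordered view (the range tree itself) and a size-ordered view
 * of the same free segments, at the cost of one extra AVL update per
 * operation.
 */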
/*
 * ==========================================================================
 * Metaslab block operations
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
        avl_tree_t *t = &msp->ms_size_tree;
        range_seg_t *rs;

        if (t == NULL || (rs = avl_last(t)) == NULL)
                return (0ULL);

        return (rs->rs_end - rs->rs_start);
}

uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size)
{
        uint64_t start;
        range_tree_t *rt = msp->ms_tree;

        VERIFY(!msp->ms_condensing);

        start = msp->ms_ops->msop_alloc(msp, size);
        if (start != -1ULL) {
                vdev_t *vd = msp->ms_group->mg_vd;

                VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
                VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
                VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
                range_tree_remove(rt, start, size);
        }
        return (start);
}

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
        range_seg_t *rs, rsearch;
        avl_index_t where;

        rsearch.rs_start = *cursor;
        rsearch.rs_end = *cursor + size;

        rs = avl_find(t, &rsearch, &where);
        if (rs == NULL)
                rs = avl_nearest(t, where, AVL_AFTER);

        while (rs != NULL) {
                uint64_t offset = P2ROUNDUP(rs->rs_start, align);

                if (offset + size <= rs->rs_end) {
                        *cursor = offset + size;
                        return (offset);
                }
                rs = AVL_NEXT(t, rs);
        }

        /*
         * If we know we've searched the whole map (*cursor == 0), give up.
         * Otherwise, reset the cursor to the beginning and try again.
         */
        if (*cursor == 0)
                return (-1ULL);

        *cursor = 0;
        return (metaslab_block_picker(t, cursor, size, align));
}
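/*
 * Picker walk-through (hypothetical tree): with *cursor = 0x10000,
 * size = 0x2000 and align = 0x2000, the search starts at the first
 * segment at or after 0x10000. A segment [0x11000, 0x14000) rounds its
 * start up to 0x12000; since 0x12000 + 0x2000 <= 0x14000 the allocation
 * succeeds at 0x12000 and the cursor advances to 0x14000. If no segment
 * past the cursor fits, the cursor resets to 0 for one wrap-around pass.
 */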
/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
        /*
         * Find the largest power of 2 block size that evenly divides the
         * requested size. This is used to try to allocate blocks with similar
         * alignment from the same area of the metaslab (i.e. same cursor
         * bucket) but it does not guarantee that other allocation sizes
         * may exist in the same region.
         */
        uint64_t align = size & -size;
        uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
        avl_tree_t *t = &msp->ms_tree->rt_root;

        return (metaslab_block_picker(t, cursor, size, align));
}

static metaslab_ops_t metaslab_ff_ops = {
        metaslab_ff_alloc
};
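/*
 * Example of the cursor-bucket math: for size = 0x6000 (24 KB),
 * size & -size isolates the lowest set bit, giving align = 0x2000
 * (8 KB), and highbit64(0x2000) - 1 = 13 selects ms_lbas[13] as the
 * cursor shared by all requests with 8 KB natural alignment.
 */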
/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
        /*
         * Find the largest power of 2 block size that evenly divides the
         * requested size. This is used to try to allocate blocks with similar
         * alignment from the same area of the metaslab (i.e. same cursor
         * bucket) but it does not guarantee that other allocation sizes
         * may exist in the same region.
         */
        uint64_t align = size & -size;
        uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
        range_tree_t *rt = msp->ms_tree;
        avl_tree_t *t = &rt->rt_root;
        uint64_t max_size = metaslab_block_maxsize(msp);
        int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

        if (max_size < size)
                return (-1ULL);

        /*
         * If we're running low on space switch to using the size
         * sorted AVL tree (best-fit).
         */
        if (max_size < metaslab_df_alloc_threshold ||
            free_pct < metaslab_df_free_pct) {
                t = &msp->ms_size_tree;
                *cursor = 0;
        }

        return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static metaslab_ops_t metaslab_df_ops = {
        metaslab_df_alloc
};
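/*
 * Switch-over example (hypothetical metaslab): with the default
 * metaslab_df_free_pct = 4, a 1 GB metaslab falls back to best-fit
 * once range_tree_space(rt) drops below roughly 40 MB, or earlier if
 * its largest free segment shrinks under metaslab_df_alloc_threshold.
 * Best-fit then searches the size-ordered tree from a cursor reset to
 * 0, preferring the smallest segment that still fits the request.
 */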
/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 * ==========================================================================
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
        range_tree_t *rt = msp->ms_tree;
        avl_tree_t *t = &msp->ms_size_tree;
        uint64_t *cursor = &msp->ms_lbas[0];
        uint64_t *cursor_end = &msp->ms_lbas[1];
        uint64_t offset = 0;

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));

        ASSERT3U(*cursor_end, >=, *cursor);

        if ((*cursor + size) > *cursor_end) {
                range_seg_t *rs;

                rs = avl_last(&msp->ms_size_tree);
                if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
                        return (-1ULL);

                *cursor = rs->rs_start;
                *cursor_end = rs->rs_end;
        }

        offset = *cursor;
        *cursor += size;

        return (offset);
}

static metaslab_ops_t metaslab_cf_ops = {
        metaslab_cf_alloc
};

/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
 */
uint64_t metaslab_ndf_clump_shift = 4;

static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
        avl_tree_t *t = &msp->ms_tree->rt_root;
        avl_index_t where;
        range_seg_t *rs, rsearch;
        uint64_t hbit = highbit64(size);
        uint64_t *cursor = &msp->ms_lbas[hbit - 1];
        uint64_t max_size = metaslab_block_maxsize(msp);

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

        if (max_size < size)
                return (-1ULL);

        rsearch.rs_start = *cursor;
        rsearch.rs_end = *cursor + size;

        rs = avl_find(t, &rsearch, &where);
        if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
                t = &msp->ms_size_tree;

                rsearch.rs_start = 0;
                rsearch.rs_end = MIN(max_size,
                    1ULL << (hbit + metaslab_ndf_clump_shift));
                rs = avl_find(t, &rsearch, &where);
                if (rs == NULL)
                        rs = avl_nearest(t, where, AVL_AFTER);
                ASSERT(rs != NULL);
        }

        if ((rs->rs_end - rs->rs_start) >= size) {
                *cursor = rs->rs_start + size;
                return (rs->rs_start);
        }
        return (-1ULL);
}

static metaslab_ops_t metaslab_ndf_ops = {
        metaslab_ndf_alloc
};
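/*
 * Clump sizing example: for an 8 KB request, hbit = highbit64(8192) = 14,
 * so with the default metaslab_ndf_clump_shift = 4 the fallback search
 * asks the size-ordered tree for a region of up to
 * MIN(max_size, 1ULL << 18) = 256 KB, room for a clump of like-sized
 * allocations from one contiguous run.
 */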
metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */

/*
 * Wait for any in-progress metaslab loads to complete.
 */
void
metaslab_load_wait(metaslab_t *msp)
{
        ASSERT(MUTEX_HELD(&msp->ms_lock));

        while (msp->ms_loading) {
                ASSERT(!msp->ms_loaded);
                cv_wait(&msp->ms_load_cv, &msp->ms_lock);
        }
}

int
metaslab_load(metaslab_t *msp)
{
        int error = 0;

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT(!msp->ms_loaded);
        ASSERT(!msp->ms_loading);

        msp->ms_loading = B_TRUE;

        /*
         * If the space map has not been allocated yet, then treat
         * all the space in the metaslab as free and add it to the
         * ms_tree.
         */
        if (msp->ms_sm != NULL)
                error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
        else
                range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);

        msp->ms_loaded = (error == 0);
        msp->ms_loading = B_FALSE;

        if (msp->ms_loaded) {
                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
                        range_tree_walk(msp->ms_defertree[t],
                            range_tree_remove, msp->ms_tree);
                }
        }
        cv_broadcast(&msp->ms_load_cv);
        return (error);
}

void
metaslab_unload(metaslab_t *msp)
{
        ASSERT(MUTEX_HELD(&msp->ms_lock));
        range_tree_vacate(msp->ms_tree, NULL, NULL);
        msp->ms_loaded = B_FALSE;
        msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
}
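/*
 * Typical caller pattern, sketched under the locking rules asserted
 * above (callers hold ms_lock and serialize on ms_loading):
 *
 *      mutex_enter(&msp->ms_lock);
 *      metaslab_load_wait(msp);
 *      if (!msp->ms_loaded)
 *              error = metaslab_load(msp);
 *      mutex_exit(&msp->ms_lock);
 */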
1357168404Spjd */ 1358168404Spjd if (txg <= TXG_INITIAL) 1359275594Sdelphij metaslab_sync_done(ms, 0); 1360168404Spjd 1361258717Savg /* 1362258717Savg * If metaslab_debug_load is set and we're initializing a metaslab 1363258717Savg * that has an allocated space_map object, then load its space 1364258717Savg * map so that we can verify frees. 1365258717Savg */ 1366275594Sdelphij if (metaslab_debug_load && ms->ms_sm != NULL) { 1367275594Sdelphij mutex_enter(&ms->ms_lock); 1368275594Sdelphij VERIFY0(metaslab_load(ms)); 1369275594Sdelphij mutex_exit(&ms->ms_lock); 1370258717Savg } 1371258717Savg 1372168404Spjd if (txg != 0) { 1373168404Spjd vdev_dirty(vd, 0, NULL, txg); 1374275594Sdelphij vdev_dirty(vd, VDD_METASLAB, ms, txg); 1375168404Spjd } 1376168404Spjd 1377275594Sdelphij *msp = ms; 1378275594Sdelphij 1379275594Sdelphij return (0); 1380168404Spjd} 1381168404Spjd 1382168404Spjdvoid 1383168404Spjdmetaslab_fini(metaslab_t *msp) 1384168404Spjd{ 1385168404Spjd metaslab_group_t *mg = msp->ms_group; 1386168404Spjd 1387168404Spjd metaslab_group_remove(mg, msp); 1388168404Spjd 1389168404Spjd mutex_enter(&msp->ms_lock); 1390168404Spjd 1391258717Savg VERIFY(msp->ms_group == NULL); 1392258717Savg vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 1393258717Savg 0, -msp->ms_size); 1394258717Savg space_map_close(msp->ms_sm); 1395168404Spjd 1396258717Savg metaslab_unload(msp); 1397258717Savg range_tree_destroy(msp->ms_tree); 1398258717Savg 1399219089Spjd for (int t = 0; t < TXG_SIZE; t++) { 1400258717Savg range_tree_destroy(msp->ms_alloctree[t]); 1401258717Savg range_tree_destroy(msp->ms_freetree[t]); 1402168404Spjd } 1403168404Spjd 1404247398Smm for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1405258717Savg range_tree_destroy(msp->ms_defertree[t]); 1406247398Smm } 1407219089Spjd 1408240415Smm ASSERT0(msp->ms_deferspace); 1409219089Spjd 1410168404Spjd mutex_exit(&msp->ms_lock); 1411258717Savg cv_destroy(&msp->ms_load_cv); 1412168404Spjd mutex_destroy(&msp->ms_lock); 1413168404Spjd 1414168404Spjd kmem_free(msp, sizeof (metaslab_t)); 1415168404Spjd} 1416168404Spjd 1417269118Sdelphij#define FRAGMENTATION_TABLE_SIZE 17 1418269118Sdelphij 1419258717Savg/* 1420269118Sdelphij * This table defines a segment size based fragmentation metric that will 1421269118Sdelphij * allow each metaslab to derive its own fragmentation value. This is done 1422269118Sdelphij * by calculating the space in each bucket of the spacemap histogram and 1423269118Sdelphij * multiplying that by the fragmentation metric in this table. Doing 1424269118Sdelphij * this for all buckets and dividing it by the total amount of free 1425269118Sdelphij * space in this metaslab (i.e. the total free space in all buckets) gives 1426269118Sdelphij * us the fragmentation metric. This means that a high fragmentation metric 1427269118Sdelphij * equates to most of the free space being comprised of small segments. 1428269118Sdelphij * Conversely, if the metric is low, then most of the free space is in 1429269118Sdelphij * large segments. A 10% change in fragmentation equates to approximately 1430269118Sdelphij * double the number of segments. 1431258717Savg * 1432269118Sdelphij * This table defines 0% fragmented space using 16MB segments. Testing has 1433269118Sdelphij * shown that segments that are greater than or equal to 16MB do not suffer 1434269118Sdelphij * from drastic performance problems. Using this value, we derive the rest 1435269118Sdelphij * of the table. 
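 * As a rough worked example (numbers purely illustrative): if half of a
 * metaslab's free space sits in 4K segments (factor 95) and half in 1M
 * segments (factor 20), the metric comes out to
 * (50 * 95 + 50 * 20) / 100 = 57, i.e. 57% fragmented.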
Since the fragmentation value is never stored on disk, it 1436269118Sdelphij * is possible to change these calculations in the future. 1437258717Savg */ 1438269118Sdelphij int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1439269118Sdelphij 100, /* 512B */ 1440269118Sdelphij 100, /* 1K */ 1441269118Sdelphij 98, /* 2K */ 1442269118Sdelphij 95, /* 4K */ 1443269118Sdelphij 90, /* 8K */ 1444269118Sdelphij 80, /* 16K */ 1445269118Sdelphij 70, /* 32K */ 1446269118Sdelphij 60, /* 64K */ 1447269118Sdelphij 50, /* 128K */ 1448269118Sdelphij 40, /* 256K */ 1449269118Sdelphij 30, /* 512K */ 1450269118Sdelphij 20, /* 1M */ 1451269118Sdelphij 15, /* 2M */ 1452269118Sdelphij 10, /* 4M */ 1453269118Sdelphij 5, /* 8M */ 1454269118Sdelphij 0 /* 16M */ 1455269118Sdelphij}; 1456269118Sdelphij 1457269118Sdelphij/* 1458269118Sdelphij * Calculate the metaslab's fragmentation metric. A return value 1459269118Sdelphij * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does 1460269118Sdelphij * not support this metric. Otherwise, the return value should be in the 1461269118Sdelphij * range [0, 100]. 1462269118Sdelphij */ 1463258717Savgstatic uint64_t 1464269118Sdelphijmetaslab_fragmentation(metaslab_t *msp) 1465258717Savg{ 1466269118Sdelphij spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1467269118Sdelphij uint64_t fragmentation = 0; 1468269118Sdelphij uint64_t total = 0; 1469269118Sdelphij boolean_t feature_enabled = spa_feature_is_enabled(spa, 1470269118Sdelphij SPA_FEATURE_SPACEMAP_HISTOGRAM); 1471168404Spjd 1472269118Sdelphij if (!feature_enabled) 1473269118Sdelphij return (ZFS_FRAG_INVALID); 1474269118Sdelphij 1475258717Savg /* 1476269118Sdelphij * A null space map means that the entire metaslab is free 1477269118Sdelphij * and thus is not fragmented. 1478258717Savg */ 1479269118Sdelphij if (msp->ms_sm == NULL) 1480269118Sdelphij return (0); 1481269118Sdelphij 1482269118Sdelphij /* 1483269118Sdelphij * If this metaslab's space_map has not been upgraded, flag it 1484269118Sdelphij * so that we upgrade next time we encounter it. 
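 * (The flag is the ms_condense_wanted/vdev_dirty() pair below; the
 * upgrade itself happens when the metaslab is next condensed and its
 * space map is rewritten in the current space_map_phys_t layout.)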
1485269118Sdelphij */ 1486269118Sdelphij if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1487269118Sdelphij uint64_t txg = spa_syncing_txg(spa); 1488258717Savg vdev_t *vd = msp->ms_group->mg_vd; 1489258717Savg 1490272504Sdelphij if (spa_writeable(spa)) { 1491272504Sdelphij msp->ms_condense_wanted = B_TRUE; 1492272504Sdelphij vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1493272504Sdelphij spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1494272504Sdelphij "msp %p, vd %p", txg, msp, vd); 1495272504Sdelphij } 1496269118Sdelphij return (ZFS_FRAG_INVALID); 1497258717Savg } 1498258717Savg 1499269118Sdelphij for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1500269118Sdelphij uint64_t space = 0; 1501269118Sdelphij uint8_t shift = msp->ms_sm->sm_shift; 1502269118Sdelphij int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1503269118Sdelphij FRAGMENTATION_TABLE_SIZE - 1); 1504258717Savg 1505258717Savg if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1506258717Savg continue; 1507258717Savg 1508269118Sdelphij space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1509269118Sdelphij total += space; 1510269118Sdelphij 1511269118Sdelphij ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1512269118Sdelphij fragmentation += space * zfs_frag_table[idx]; 1513258717Savg } 1514269118Sdelphij 1515269118Sdelphij if (total > 0) 1516269118Sdelphij fragmentation /= total; 1517269118Sdelphij ASSERT3U(fragmentation, <=, 100); 1518269118Sdelphij return (fragmentation); 1519258717Savg} 1520258717Savg 1521269118Sdelphij/* 1522269118Sdelphij * Compute a weight -- a selection preference value -- for the given metaslab. 1523269118Sdelphij * This is based on the amount of free space, the level of fragmentation, 1524269118Sdelphij * the LBA range, and whether the metaslab is loaded. 1525269118Sdelphij */ 1526168404Spjdstatic uint64_t 1527168404Spjdmetaslab_weight(metaslab_t *msp) 1528168404Spjd{ 1529168404Spjd metaslab_group_t *mg = msp->ms_group; 1530168404Spjd vdev_t *vd = mg->mg_vd; 1531168404Spjd uint64_t weight, space; 1532168404Spjd 1533168404Spjd ASSERT(MUTEX_HELD(&msp->ms_lock)); 1534168404Spjd 1535168404Spjd /* 1536247398Smm * This vdev is in the process of being removed so there is nothing 1537247398Smm * for us to do here. 1538247398Smm */ 1539247398Smm if (vd->vdev_removing) { 1540258717Savg ASSERT0(space_map_allocated(msp->ms_sm)); 1541247398Smm ASSERT0(vd->vdev_ms_shift); 1542247398Smm return (0); 1543247398Smm } 1544247398Smm 1545247398Smm /* 1546168404Spjd * The baseline weight is the metaslab's free space. 1547168404Spjd */ 1548258717Savg space = msp->ms_size - space_map_allocated(msp->ms_sm); 1549269118Sdelphij 1550269118Sdelphij msp->ms_fragmentation = metaslab_fragmentation(msp); 1551269118Sdelphij if (metaslab_fragmentation_factor_enabled && 1552269118Sdelphij msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1553269118Sdelphij /* 1554269118Sdelphij * Use the fragmentation information to inversely scale 1555269118Sdelphij * down the baseline weight. We need to ensure that we 1556269118Sdelphij * don't exclude this metaslab completely when it's 100% 1557269118Sdelphij * fragmented. To avoid this we reduce the fragmented value 1558269118Sdelphij * by 1. 1559269118Sdelphij */ 1560269118Sdelphij space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1561269118Sdelphij 1562269118Sdelphij /* 1563269118Sdelphij * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1564269118Sdelphij * this metaslab again. 
The fragmentation metric may have 1565269118Sdelphij * decreased the space to something smaller than 1566269118Sdelphij * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1567269118Sdelphij * so that we can consume any remaining space. 1568269118Sdelphij */ 1569269118Sdelphij if (space > 0 && space < SPA_MINBLOCKSIZE) 1570269118Sdelphij space = SPA_MINBLOCKSIZE; 1571269118Sdelphij } 1572168404Spjd weight = space; 1573168404Spjd 1574168404Spjd /* 1575168404Spjd * Modern disks have uniform bit density and constant angular velocity. 1576168404Spjd * Therefore, the outer recording zones are faster (higher bandwidth) 1577168404Spjd * than the inner zones by the ratio of outer to inner track diameter, 1578168404Spjd * which is typically around 2:1. We account for this by assigning 1579168404Spjd * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1580168404Spjd * In effect, this means that we'll select the metaslab with the most 1581168404Spjd * free bandwidth rather than simply the one with the most free space. 1582168404Spjd */ 1583269118Sdelphij if (metaslab_lba_weighting_enabled) { 1584269118Sdelphij weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1585269118Sdelphij ASSERT(weight >= space && weight <= 2 * space); 1586269118Sdelphij } 1587168404Spjd 1588269118Sdelphij /* 1589269118Sdelphij * If this metaslab is one we're actively using, adjust its 1590269118Sdelphij * weight to make it preferable to any inactive metaslab so 1591269118Sdelphij * we'll polish it off. If the fragmentation on this metaslab 1592269118Sdelphij * has exceeded our threshold, then don't mark it active. 1593269118Sdelphij */ 1594269118Sdelphij if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1595269118Sdelphij msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1596211931Smm weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1597211931Smm } 1598258717Savg 1599211931Smm return (weight); 1600211931Smm} 1601211931Smm 1602168404Spjdstatic int 1603224177Smmmetaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1604168404Spjd{ 1605168404Spjd ASSERT(MUTEX_HELD(&msp->ms_lock)); 1606168404Spjd 1607168404Spjd if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1608258717Savg metaslab_load_wait(msp); 1609258717Savg if (!msp->ms_loaded) { 1610258717Savg int error = metaslab_load(msp); 1611258717Savg if (error) { 1612219089Spjd metaslab_group_sort(msp->ms_group, msp, 0); 1613219089Spjd return (error); 1614219089Spjd } 1615168404Spjd } 1616209962Smm 1617168404Spjd metaslab_group_sort(msp->ms_group, msp, 1618168404Spjd msp->ms_weight | activation_weight); 1619168404Spjd } 1620258717Savg ASSERT(msp->ms_loaded); 1621168404Spjd ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1622168404Spjd 1623168404Spjd return (0); 1624168404Spjd} 1625168404Spjd 1626168404Spjdstatic void 1627168404Spjdmetaslab_passivate(metaslab_t *msp, uint64_t size) 1628168404Spjd{ 1629168404Spjd /* 1630168404Spjd * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1631168404Spjd * this metaslab again. In that case, it had better be empty, 1632168404Spjd * or we would be leaving space on the table. 
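 * (Mechanically, passivation is just the re-sort below with weight
 * MIN(ms_weight, size); a size value never carries the high
 * METASLAB_ACTIVE_MASK bits, so this also clears the active state.)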
1633168404Spjd */ 1634258717Savg ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1635168404Spjd metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1636168404Spjd ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1637168404Spjd} 1638168404Spjd 1639258717Savgstatic void 1640258717Savgmetaslab_preload(void *arg) 1641258717Savg{ 1642258717Savg metaslab_t *msp = arg; 1643258717Savg spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1644258717Savg 1645268086Sdelphij ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 1646268086Sdelphij 1647258717Savg mutex_enter(&msp->ms_lock); 1648258717Savg metaslab_load_wait(msp); 1649258717Savg if (!msp->ms_loaded) 1650258717Savg (void) metaslab_load(msp); 1651258717Savg 1652258717Savg /* 1653258717Savg * Set the ms_access_txg value so that we don't unload it right away. 1654258717Savg */ 1655258717Savg msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; 1656258717Savg mutex_exit(&msp->ms_lock); 1657258717Savg} 1658258717Savg 1659258717Savgstatic void 1660258717Savgmetaslab_group_preload(metaslab_group_t *mg) 1661258717Savg{ 1662258717Savg spa_t *spa = mg->mg_vd->vdev_spa; 1663258717Savg metaslab_t *msp; 1664258717Savg avl_tree_t *t = &mg->mg_metaslab_tree; 1665258717Savg int m = 0; 1666258717Savg 1667258717Savg if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 1668258717Savg taskq_wait(mg->mg_taskq); 1669258717Savg return; 1670258717Savg } 1671268086Sdelphij 1672258717Savg mutex_enter(&mg->mg_lock); 1673258717Savg /* 1674268086Sdelphij * Load the next potential metaslabs 1675258717Savg */ 1676268086Sdelphij msp = avl_first(t); 1677268086Sdelphij while (msp != NULL) { 1678268086Sdelphij metaslab_t *msp_next = AVL_NEXT(t, msp); 1679258717Savg 1680269118Sdelphij /* 1681269118Sdelphij * We preload only the maximum number of metaslabs specified 1682269118Sdelphij * by metaslab_preload_limit. If a metaslab is being forced 1683269118Sdelphij * to condense then we preload it too. This will ensure 1684269118Sdelphij * that force condensing happens in the next txg. 1685269118Sdelphij */ 1686269118Sdelphij if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1687269118Sdelphij msp = msp_next; 1688269118Sdelphij continue; 1689269118Sdelphij } 1690258717Savg 1691268086Sdelphij /* 1692268086Sdelphij * We must drop the metaslab group lock here to preserve 1693268086Sdelphij * lock ordering with the ms_lock (when grabbing both 1694268086Sdelphij * the mg_lock and the ms_lock, the ms_lock must be taken 1695268086Sdelphij * first). As a result, it is possible that the ordering 1696268086Sdelphij * of the metaslabs within the avl tree may change before 1697268086Sdelphij * we reacquire the lock. The metaslab cannot be removed from 1698268086Sdelphij * the tree while we're in syncing context so it is safe to 1699268086Sdelphij * drop the mg_lock here. If the metaslabs are reordered 1700268086Sdelphij * nothing will break -- we just may end up loading a 1701268086Sdelphij * less than optimal one. 1702268086Sdelphij */ 1703268086Sdelphij mutex_exit(&mg->mg_lock); 1704258717Savg VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 1705258717Savg msp, TQ_SLEEP) != 0); 1706268086Sdelphij mutex_enter(&mg->mg_lock); 1707268086Sdelphij msp = msp_next; 1708258717Savg } 1709258717Savg mutex_exit(&mg->mg_lock); 1710258717Savg} 1711258717Savg 1712168404Spjd/* 1713258717Savg * Determine if the space map's on-disk footprint is past our tolerance 1714258717Savg * for inefficiency. 
We would like to use the following criteria to make 1715258717Savg * our decision: 1716247398Smm * 1717247398Smm * 1. The size of the space map object should not dramatically increase as a 1718258717Savg * result of writing out the free space range tree. 1719247398Smm * 1720247398Smm * 2. The minimal on-disk space map representation is zfs_condense_pct/100 1721258717Savg * times the size of the free space range tree representation 1722258717Savg * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 1723247398Smm * 1724268855Sdelphij * 3. The on-disk size of the space map should actually decrease. 1725268855Sdelphij * 1726247398Smm * Checking the first condition is tricky since we don't want to walk 1727247398Smm * the entire AVL tree calculating the estimated on-disk size. Instead we 1728258717Savg * use the size-ordered range tree in the metaslab and calculate the 1729258717Savg * size required to write out the largest segment in our free tree. If the 1730247398Smm * size required to represent that segment on disk is larger than the space 1731247398Smm * map object then we avoid condensing this map. 1732247398Smm * 1733247398Smm * To determine the second criterion we use a best-case estimate and assume 1734247398Smm * each segment can be represented on-disk as a single 64-bit entry. We refer 1735247398Smm * to this best-case estimate as the space map's minimal form. 1736268855Sdelphij * 1737268855Sdelphij * Unfortunately, we cannot compute the on-disk size of the space map in this 1738268855Sdelphij * context because we cannot accurately compute the effects of compression, etc. 1739268855Sdelphij * Instead, we apply the heuristic described in the block comment for 1740268855Sdelphij * zfs_metaslab_condense_block_threshold - we only condense if the space used 1741268855Sdelphij * is greater than a threshold number of blocks. 1742247398Smm */ 1743247398Smmstatic boolean_t 1744247398Smmmetaslab_should_condense(metaslab_t *msp) 1745247398Smm{ 1746258717Savg space_map_t *sm = msp->ms_sm; 1747258717Savg range_seg_t *rs; 1748268855Sdelphij uint64_t size, entries, segsz, object_size, optimal_size, record_size; 1749268855Sdelphij dmu_object_info_t doi; 1750268855Sdelphij uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1751247398Smm 1752247398Smm ASSERT(MUTEX_HELD(&msp->ms_lock)); 1753258717Savg ASSERT(msp->ms_loaded); 1754247398Smm 1755247398Smm /* 1756258717Savg * Use the ms_size_tree range tree, which is ordered by size, to 1757269118Sdelphij * obtain the largest segment in the free tree. We always condense 1758269118Sdelphij * metaslabs that are empty and metaslabs for which a condense 1759269118Sdelphij * request has been made. 1760247398Smm */ 1761258717Savg rs = avl_last(&msp->ms_size_tree); 1762269118Sdelphij if (rs == NULL || msp->ms_condense_wanted) 1763247398Smm return (B_TRUE); 1764247398Smm 1765247398Smm /* 1766247398Smm * Calculate the number of 64-bit entries this segment would 1767247398Smm * require when written to disk. If this single segment would be 1768247398Smm * larger on-disk than the entire current on-disk structure, then 1769247398Smm * clearly condensing will increase the on-disk structure size. 
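 * As a sketch with made-up numbers: if the largest free segment would
 * need 24 bytes of entries to write out, but the current on-disk space
 * map is only 16 bytes long, condensing could not shrink the map, so we
 * leave it as is.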
1770247398Smm */ 1771258717Savg size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; 1772247398Smm entries = size / (MIN(size, SM_RUN_MAX)); 1773247398Smm segsz = entries * sizeof (uint64_t); 1774247398Smm 1775268855Sdelphij optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); 1776268855Sdelphij object_size = space_map_length(msp->ms_sm); 1777268855Sdelphij 1778268855Sdelphij dmu_object_info_from_db(sm->sm_dbuf, &doi); 1779268855Sdelphij record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 1780268855Sdelphij 1781268855Sdelphij return (segsz <= object_size && 1782268855Sdelphij object_size >= (optimal_size * zfs_condense_pct / 100) && 1783268855Sdelphij object_size > zfs_metaslab_condense_block_threshold * record_size); 1784247398Smm} 1785247398Smm 1786247398Smm/* 1787247398Smm * Condense the on-disk space map representation to its minimized form. 1788247398Smm * The minimized form consists of a small number of allocations followed by 1789258717Savg * the entries of the free range tree. 1790247398Smm */ 1791247398Smmstatic void 1792247398Smmmetaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 1793247398Smm{ 1794247398Smm spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1795258717Savg range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK]; 1796258717Savg range_tree_t *condense_tree; 1797258717Savg space_map_t *sm = msp->ms_sm; 1798247398Smm 1799247398Smm ASSERT(MUTEX_HELD(&msp->ms_lock)); 1800247398Smm ASSERT3U(spa_sync_pass(spa), ==, 1); 1801258717Savg ASSERT(msp->ms_loaded); 1802247398Smm 1803269118Sdelphij 1804289307Smav spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 1805289307Smav "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 1806289307Smav msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 1807289307Smav msp->ms_group->mg_vd->vdev_spa->spa_name, 1808289307Smav space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root), 1809269118Sdelphij msp->ms_condense_wanted ? "TRUE" : "FALSE"); 1810247398Smm 1811269118Sdelphij msp->ms_condense_wanted = B_FALSE; 1812269118Sdelphij 1813247398Smm /* 1814258717Savg * Create a range tree that is 100% allocated. We remove segments 1815247398Smm * that have been freed in this txg, any deferred frees that exist, 1816247398Smm * and any allocation in the future. Removing segments should be 1817258717Savg * a relatively inexpensive operation since we expect these trees to 1818258717Savg * have a small number of nodes. 1819247398Smm */ 1820258717Savg condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); 1821258717Savg range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 1822247398Smm 1823247398Smm /* 1824258717Savg * Remove what's been freed in this txg from the condense_tree. 1825247398Smm * Since we're in sync_pass 1, we know that all the frees from 1826258717Savg * this txg are in the freetree. 1827247398Smm */ 1828258717Savg range_tree_walk(freetree, range_tree_remove, condense_tree); 1829247398Smm 1830258717Savg for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1831258717Savg range_tree_walk(msp->ms_defertree[t], 1832258717Savg range_tree_remove, condense_tree); 1833258717Savg } 1834247398Smm 1835258717Savg for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 1836258717Savg range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], 1837258717Savg range_tree_remove, condense_tree); 1838258717Savg } 1839247398Smm 1840247398Smm /* 1841247398Smm * We're about to drop the metaslab's lock thus allowing 1842247398Smm * other consumers to change its content. 
Set the 1843258717Savg * metaslab's ms_condensing flag to ensure that 1844247398Smm * allocations on this metaslab do not occur while we're 1845247398Smm * in the middle of committing it to disk. This is only critical 1846258717Savg * for the ms_tree as all other range trees use per txg 1847247398Smm * views of their content. 1848247398Smm */ 1849258717Savg msp->ms_condensing = B_TRUE; 1850247398Smm 1851247398Smm mutex_exit(&msp->ms_lock); 1852258717Savg space_map_truncate(sm, tx); 1853247398Smm mutex_enter(&msp->ms_lock); 1854247398Smm 1855247398Smm /* 1856247398Smm * While we would ideally like to create a space_map representation 1857247398Smm * that consists only of allocation records, doing so can be 1858258717Savg * prohibitively expensive because the in-core free tree can be 1859247398Smm * large, and therefore computationally expensive to subtract 1860258717Savg * from the condense_tree. Instead we sync out two trees, a cheap 1861258717Savg * allocation only tree followed by the in-core free tree. While not 1862247398Smm * optimal, this is typically close to optimal, and much cheaper to 1863247398Smm * compute. 1864247398Smm */ 1865258717Savg space_map_write(sm, condense_tree, SM_ALLOC, tx); 1866258717Savg range_tree_vacate(condense_tree, NULL, NULL); 1867258717Savg range_tree_destroy(condense_tree); 1868247398Smm 1869258717Savg space_map_write(sm, msp->ms_tree, SM_FREE, tx); 1870258717Savg msp->ms_condensing = B_FALSE; 1871247398Smm} 1872247398Smm 1873247398Smm/* 1874168404Spjd * Write a metaslab to disk in the context of the specified transaction group. 1875168404Spjd */ 1876168404Spjdvoid 1877168404Spjdmetaslab_sync(metaslab_t *msp, uint64_t txg) 1878168404Spjd{ 1879258717Savg metaslab_group_t *mg = msp->ms_group; 1880258717Savg vdev_t *vd = mg->mg_vd; 1881168404Spjd spa_t *spa = vd->vdev_spa; 1882219089Spjd objset_t *mos = spa_meta_objset(spa); 1883258717Savg range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; 1884258717Savg range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK]; 1885258717Savg range_tree_t **freed_tree = 1886258717Savg &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1887168404Spjd dmu_tx_t *tx; 1888258717Savg uint64_t object = space_map_object(msp->ms_sm); 1889168404Spjd 1890219089Spjd ASSERT(!vd->vdev_ishole); 1891168404Spjd 1892247398Smm /* 1893247398Smm * This metaslab has just been added so there's no work to do now. 1894247398Smm */ 1895258717Savg if (*freetree == NULL) { 1896258717Savg ASSERT3P(alloctree, ==, NULL); 1897219089Spjd return; 1898247398Smm } 1899219089Spjd 1900258717Savg ASSERT3P(alloctree, !=, NULL); 1901258717Savg ASSERT3P(*freetree, !=, NULL); 1902258717Savg ASSERT3P(*freed_tree, !=, NULL); 1903247398Smm 1904269118Sdelphij /* 1905269118Sdelphij * Normally, we don't want to process a metaslab if there 1906269118Sdelphij * are no allocations or frees to perform. However, if the metaslab 1907269118Sdelphij * is being forced to condense we need to let it through. 1908269118Sdelphij */ 1909258717Savg if (range_tree_space(alloctree) == 0 && 1910269118Sdelphij range_tree_space(*freetree) == 0 && 1911269118Sdelphij !msp->ms_condense_wanted) 1912247398Smm return; 1913247398Smm 1914168404Spjd /* 1915168404Spjd * The only state that can actually be changing concurrently with 1916258717Savg * metaslab_sync() is the metaslab's ms_tree. No other thread can 1917258717Savg * be modifying this txg's alloctree, freetree, freed_tree, or 1918258717Savg * space_map_phys_t. 
Therefore, we only hold ms_lock to satisfy 1919258717Savg * space_map ASSERTs. We drop it whenever we call into the DMU, 1920258717Savg * because the DMU can call down to us (e.g. via zio_free()) at 1921258717Savg * any time. 1922168404Spjd */ 1923168404Spjd 1924219089Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1925219089Spjd 1926258717Savg if (msp->ms_sm == NULL) { 1927258717Savg uint64_t new_object; 1928258717Savg 1929258717Savg new_object = space_map_alloc(mos, tx); 1930258717Savg VERIFY3U(new_object, !=, 0); 1931258717Savg 1932258717Savg VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 1933258717Savg msp->ms_start, msp->ms_size, vd->vdev_ashift, 1934258717Savg &msp->ms_lock)); 1935258717Savg ASSERT(msp->ms_sm != NULL); 1936168404Spjd } 1937168404Spjd 1938219089Spjd mutex_enter(&msp->ms_lock); 1939219089Spjd 1940272504Sdelphij /* 1941272504Sdelphij * Note: metaslab_condense() clears the space_map's histogram. 1942272504Sdelphij * Therefore we must verify and remove this histogram before 1943272504Sdelphij * condensing. 1944272504Sdelphij */ 1945272504Sdelphij metaslab_group_histogram_verify(mg); 1946272504Sdelphij metaslab_class_histogram_verify(mg->mg_class); 1947272504Sdelphij metaslab_group_histogram_remove(mg, msp); 1948272504Sdelphij 1949258717Savg if (msp->ms_loaded && spa_sync_pass(spa) == 1 && 1950247398Smm metaslab_should_condense(msp)) { 1951247398Smm metaslab_condense(msp, txg, tx); 1952247398Smm } else { 1953258717Savg space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 1954258717Savg space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); 1955247398Smm } 1956168404Spjd 1957258717Savg if (msp->ms_loaded) { 1958258717Savg /* 1959258717Savg * When the space map is loaded, we have an accurate 1960258717Savg * histogram in the range tree. This gives us an opportunity 1961258717Savg * to bring the space map's histogram up-to-date so we clear 1962258717Savg * it first before updating it. 1963258717Savg */ 1964258717Savg space_map_histogram_clear(msp->ms_sm); 1965258717Savg space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); 1966258717Savg } else { 1967258717Savg /* 1968258717Savg * Since the space map is not loaded we simply update the 1969258717Savg * existing histogram with what was freed in this txg. This 1970258717Savg * means that the on-disk histogram may not have an accurate 1971258717Savg * view of the free space but it's close enough to allow 1972258717Savg * us to make allocation decisions. 1973258717Savg */ 1974258717Savg space_map_histogram_add(msp->ms_sm, *freetree, tx); 1975258717Savg } 1976269118Sdelphij metaslab_group_histogram_add(mg, msp); 1977269118Sdelphij metaslab_group_histogram_verify(mg); 1978269118Sdelphij metaslab_class_histogram_verify(mg->mg_class); 1979258717Savg 1980247398Smm /* 1981258717Savg * For sync pass 1, we avoid traversing this txg's free range tree 1982258717Savg * and instead will just swap the pointers for freetree and 1983258717Savg * freed_tree. We can safely do this since the freed_tree is 1984247398Smm * guaranteed to be empty on the initial pass. 
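 * (On later passes the freed_tree already holds entries, so the else
 * branch below vacates the freetree into it segment by segment instead
 * of swapping pointers.)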
1985247398Smm */ 1986247398Smm if (spa_sync_pass(spa) == 1) { 1987258717Savg range_tree_swap(freetree, freed_tree); 1988247398Smm } else { 1989258717Savg range_tree_vacate(*freetree, range_tree_add, *freed_tree); 1990168404Spjd } 1991269118Sdelphij range_tree_vacate(alloctree, NULL, NULL); 1992168404Spjd 1993258717Savg ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1994258717Savg ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1995168404Spjd 1996168404Spjd mutex_exit(&msp->ms_lock); 1997168404Spjd 1998258717Savg if (object != space_map_object(msp->ms_sm)) { 1999258717Savg object = space_map_object(msp->ms_sm); 2000258717Savg dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2001258717Savg msp->ms_id, sizeof (uint64_t), &object, tx); 2002258717Savg } 2003168404Spjd dmu_tx_commit(tx); 2004168404Spjd} 2005168404Spjd 2006168404Spjd/* 2007168404Spjd * Called after a transaction group has completely synced to mark 2008168404Spjd * all of the metaslab's free space as usable. 2009168404Spjd */ 2010168404Spjdvoid 2011168404Spjdmetaslab_sync_done(metaslab_t *msp, uint64_t txg) 2012168404Spjd{ 2013168404Spjd metaslab_group_t *mg = msp->ms_group; 2014168404Spjd vdev_t *vd = mg->mg_vd; 2015258717Savg range_tree_t **freed_tree; 2016258717Savg range_tree_t **defer_tree; 2017219089Spjd int64_t alloc_delta, defer_delta; 2018168404Spjd 2019219089Spjd ASSERT(!vd->vdev_ishole); 2020219089Spjd 2021168404Spjd mutex_enter(&msp->ms_lock); 2022168404Spjd 2023168404Spjd /* 2024168404Spjd * If this metaslab is just becoming available, initialize its 2025258717Savg * alloctrees, freetrees, and defertree and add its capacity to 2026258717Savg * the vdev. 2027168404Spjd */ 2028258717Savg if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 2029219089Spjd for (int t = 0; t < TXG_SIZE; t++) { 2030258717Savg ASSERT(msp->ms_alloctree[t] == NULL); 2031258717Savg ASSERT(msp->ms_freetree[t] == NULL); 2032258717Savg 2033258717Savg msp->ms_alloctree[t] = range_tree_create(NULL, msp, 2034258717Savg &msp->ms_lock); 2035258717Savg msp->ms_freetree[t] = range_tree_create(NULL, msp, 2036258717Savg &msp->ms_lock); 2037168404Spjd } 2038219089Spjd 2039247398Smm for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2040258717Savg ASSERT(msp->ms_defertree[t] == NULL); 2041258717Savg 2042258717Savg msp->ms_defertree[t] = range_tree_create(NULL, msp, 2043258717Savg &msp->ms_lock); 2044247398Smm } 2045219089Spjd 2046258717Savg vdev_space_update(vd, 0, 0, msp->ms_size); 2047168404Spjd } 2048168404Spjd 2049258717Savg freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 2050258717Savg defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 2051168404Spjd 2052258717Savg alloc_delta = space_map_alloc_delta(msp->ms_sm); 2053258717Savg defer_delta = range_tree_space(*freed_tree) - 2054258717Savg range_tree_space(*defer_tree); 2055258717Savg 2056219089Spjd vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2057219089Spjd 2058258717Savg ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2059258717Savg ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2060168404Spjd 2061168404Spjd /* 2062258717Savg * If there's a metaslab_load() in progress, wait for it to complete 2063168404Spjd * so that we have a consistent view of the in-core space map. 2064168404Spjd */ 2065258717Savg metaslab_load_wait(msp); 2066168404Spjd 2067247398Smm /* 2068258717Savg * Move the frees from the defer_tree back to the free 2069258717Savg * range tree (if it's loaded). 
Swap the freed_tree and the 2070258717Savg * defer_tree -- this is safe to do because we've just emptied out 2071258717Savg * the defer_tree. 2072247398Smm */ 2073258717Savg range_tree_vacate(*defer_tree, 2074258717Savg msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 2075258717Savg range_tree_swap(freed_tree, defer_tree); 2076247398Smm 2077258717Savg space_map_update(msp->ms_sm); 2078168404Spjd 2079219089Spjd msp->ms_deferspace += defer_delta; 2080219089Spjd ASSERT3S(msp->ms_deferspace, >=, 0); 2081258717Savg ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2082219089Spjd if (msp->ms_deferspace != 0) { 2083219089Spjd /* 2084219089Spjd * Keep syncing this metaslab until all deferred frees 2085219089Spjd * are back in circulation. 2086219089Spjd */ 2087219089Spjd vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2088219089Spjd } 2089219089Spjd 2090258717Savg if (msp->ms_loaded && msp->ms_access_txg < txg) { 2091258717Savg for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2092258717Savg VERIFY0(range_tree_space( 2093258717Savg msp->ms_alloctree[(txg + t) & TXG_MASK])); 2094258717Savg } 2095168404Spjd 2096258717Savg if (!metaslab_debug_unload) 2097258717Savg metaslab_unload(msp); 2098168404Spjd } 2099168404Spjd 2100168404Spjd metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2101258717Savg mutex_exit(&msp->ms_lock); 2102168404Spjd} 2103168404Spjd 2104211931Smmvoid 2105211931Smmmetaslab_sync_reassess(metaslab_group_t *mg) 2106211931Smm{ 2107258633Savg metaslab_group_alloc_update(mg); 2108269118Sdelphij mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2109224177Smm 2110211931Smm /* 2111258717Savg * Preload the next potential metaslabs 2112211931Smm */ 2113258717Savg metaslab_group_preload(mg); 2114211931Smm} 2115211931Smm 2116168404Spjdstatic uint64_t 2117168404Spjdmetaslab_distance(metaslab_t *msp, dva_t *dva) 2118168404Spjd{ 2119168404Spjd uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2120168404Spjd uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2121258717Savg uint64_t start = msp->ms_id; 2122168404Spjd 2123168404Spjd if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2124168404Spjd return (1ULL << 63); 2125168404Spjd 2126168404Spjd if (offset < start) 2127168404Spjd return ((start - offset) << ms_shift); 2128168404Spjd if (offset > start) 2129168404Spjd return ((offset - start) << ms_shift); 2130168404Spjd return (0); 2131168404Spjd} 2132168404Spjd 2133168404Spjdstatic uint64_t 2134224177Smmmetaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 2135264671Sdelphij uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2136168404Spjd{ 2137224177Smm spa_t *spa = mg->mg_vd->vdev_spa; 2138168404Spjd metaslab_t *msp = NULL; 2139168404Spjd uint64_t offset = -1ULL; 2140168404Spjd avl_tree_t *t = &mg->mg_metaslab_tree; 2141168404Spjd uint64_t activation_weight; 2142168404Spjd uint64_t target_distance; 2143168404Spjd int i; 2144168404Spjd 2145168404Spjd activation_weight = METASLAB_WEIGHT_PRIMARY; 2146209962Smm for (i = 0; i < d; i++) { 2147209962Smm if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2148168404Spjd activation_weight = METASLAB_WEIGHT_SECONDARY; 2149209962Smm break; 2150209962Smm } 2151209962Smm } 2152168404Spjd 2153168404Spjd for (;;) { 2154209962Smm boolean_t was_active; 2155209962Smm 2156168404Spjd mutex_enter(&mg->mg_lock); 2157168404Spjd for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2158224177Smm if (msp->ms_weight < asize) { 2159224177Smm spa_dbgmsg(spa, "%s: failed to meet weight " 2160224177Smm "requirement: vdev %llu, 
txg %llu, mg %p, " 2161224177Smm "msp %p, psize %llu, asize %llu, " 2162264671Sdelphij "weight %llu", spa_name(spa), 2163264671Sdelphij mg->mg_vd->vdev_id, txg, 2164264671Sdelphij mg, msp, psize, asize, msp->ms_weight); 2165168404Spjd mutex_exit(&mg->mg_lock); 2166168404Spjd return (-1ULL); 2167168404Spjd } 2168247398Smm 2169247398Smm /* 2170247398Smm * If the selected metaslab is condensing, skip it. 2171247398Smm */ 2172258717Savg if (msp->ms_condensing) 2173247398Smm continue; 2174247398Smm 2175209962Smm was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2176168404Spjd if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2177168404Spjd break; 2178168404Spjd 2179168404Spjd target_distance = min_distance + 2180258717Savg (space_map_allocated(msp->ms_sm) != 0 ? 0 : 2181258717Savg min_distance >> 1); 2182168404Spjd 2183168404Spjd for (i = 0; i < d; i++) 2184168404Spjd if (metaslab_distance(msp, &dva[i]) < 2185168404Spjd target_distance) 2186168404Spjd break; 2187168404Spjd if (i == d) 2188168404Spjd break; 2189168404Spjd } 2190168404Spjd mutex_exit(&mg->mg_lock); 2191168404Spjd if (msp == NULL) 2192168404Spjd return (-1ULL); 2193168404Spjd 2194258633Savg mutex_enter(&msp->ms_lock); 2195258633Savg 2196224177Smm /* 2197168404Spjd * Ensure that the metaslab we have selected is still 2198168404Spjd * capable of handling our request. It's possible that 2199168404Spjd * another thread may have changed the weight while we 2200168404Spjd * were blocked on the metaslab lock. 2201168404Spjd */ 2202224177Smm if (msp->ms_weight < asize || (was_active && 2203209962Smm !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2204209962Smm activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2205168404Spjd mutex_exit(&msp->ms_lock); 2206168404Spjd continue; 2207168404Spjd } 2208168404Spjd 2209168404Spjd if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2210168404Spjd activation_weight == METASLAB_WEIGHT_PRIMARY) { 2211168404Spjd metaslab_passivate(msp, 2212168404Spjd msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2213168404Spjd mutex_exit(&msp->ms_lock); 2214168404Spjd continue; 2215168404Spjd } 2216168404Spjd 2217224177Smm if (metaslab_activate(msp, activation_weight) != 0) { 2218168404Spjd mutex_exit(&msp->ms_lock); 2219168404Spjd continue; 2220168404Spjd } 2221168404Spjd 2222247398Smm /* 2223247398Smm * If this metaslab is currently condensing then pick again as 2224247398Smm * we can't manipulate this metaslab until it's committed 2225247398Smm * to disk. 2226247398Smm */ 2227258717Savg if (msp->ms_condensing) { 2228247398Smm mutex_exit(&msp->ms_lock); 2229247398Smm continue; 2230247398Smm } 2231247398Smm 2232258717Savg if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2233168404Spjd break; 2234168404Spjd 2235258717Savg metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2236168404Spjd mutex_exit(&msp->ms_lock); 2237168404Spjd } 2238168404Spjd 2239258717Savg if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2240168404Spjd vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2241168404Spjd 2242258717Savg range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2243258717Savg msp->ms_access_txg = txg + metaslab_unload_delay; 2244168404Spjd 2245168404Spjd mutex_exit(&msp->ms_lock); 2246168404Spjd 2247168404Spjd return (offset); 2248168404Spjd} 2249168404Spjd 2250168404Spjd/* 2251168404Spjd * Allocate a block for the specified i/o. 
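 * (metaslab_alloc_dva() below places one DVA; metaslab_alloc() invokes
 * it once per requested copy, so a failure part-way through can be
 * unwound copy by copy.)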
2252168404Spjd */ 2253168404Spjdstatic int 2254185029Spjdmetaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2255185029Spjd dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2256168404Spjd{ 2257168404Spjd metaslab_group_t *mg, *rotor; 2258168404Spjd vdev_t *vd; 2259168404Spjd int dshift = 3; 2260168404Spjd int all_zero; 2261209962Smm int zio_lock = B_FALSE; 2262209962Smm boolean_t allocatable; 2263168404Spjd uint64_t offset = -1ULL; 2264168404Spjd uint64_t asize; 2265168404Spjd uint64_t distance; 2266168404Spjd 2267168404Spjd ASSERT(!DVA_IS_VALID(&dva[d])); 2268168404Spjd 2269185029Spjd /* 2270185029Spjd * For testing, make some blocks above a certain size be gang blocks. 2271185029Spjd */ 2272219089Spjd if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2273249195Smm return (SET_ERROR(ENOSPC)); 2274168404Spjd 2275168404Spjd /* 2276168404Spjd * Start at the rotor and loop through all mgs until we find something. 2277219089Spjd * Note that there's no locking on mc_rotor or mc_aliquot because 2278168404Spjd * nothing actually breaks if we miss a few updates -- we just won't 2279168404Spjd * allocate quite as evenly. It all balances out over time. 2280168404Spjd * 2281168404Spjd * If we are doing ditto or log blocks, try to spread them across 2282168404Spjd * consecutive vdevs. If we're forced to reuse a vdev before we've 2283168404Spjd * allocated all of our ditto blocks, then try and spread them out on 2284168404Spjd * that vdev as much as possible. If it turns out to not be possible, 2285168404Spjd * gradually lower our standards until anything becomes acceptable. 2286168404Spjd * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2287168404Spjd * gives us hope of containing our fault domains to something we're 2288168404Spjd * able to reason about. Otherwise, any two top-level vdev failures 2289168404Spjd * will guarantee the loss of data. With consecutive allocation, 2290168404Spjd * only two adjacent top-level vdev failures will result in data loss. 2291168404Spjd * 2292168404Spjd * If we are doing gang blocks (hintdva is non-NULL), try to keep 2293168404Spjd * ourselves on the same vdev as our gang block header. That 2294168404Spjd * way, we can hope for locality in vdev_cache, plus it makes our 2295168404Spjd * fault domains something tractable. 2296168404Spjd */ 2297168404Spjd if (hintdva) { 2298168404Spjd vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 2299219089Spjd 2300219089Spjd /* 2301219089Spjd * It's possible the vdev we're using as the hint no 2302219089Spjd * longer exists (i.e. removed). Consult the rotor when 2303219089Spjd * all else fails. 2304219089Spjd */ 2305219089Spjd if (vd != NULL) { 2306168404Spjd mg = vd->vdev_mg; 2307219089Spjd 2308219089Spjd if (flags & METASLAB_HINTBP_AVOID && 2309219089Spjd mg->mg_next != NULL) 2310219089Spjd mg = mg->mg_next; 2311219089Spjd } else { 2312219089Spjd mg = mc->mc_rotor; 2313219089Spjd } 2314168404Spjd } else if (d != 0) { 2315168404Spjd vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 2316168404Spjd mg = vd->vdev_mg->mg_next; 2317168404Spjd } else { 2318168404Spjd mg = mc->mc_rotor; 2319168404Spjd } 2320185029Spjd 2321185029Spjd /* 2322219089Spjd * If the hint put us into the wrong metaslab class, or into a 2323219089Spjd * metaslab group that has been passivated, just follow the rotor. 
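 * (A passivated group shows up here with mg_activation_count <= 0;
 * such groups have been unlinked from the rotor ring, so falling back
 * to mc_rotor is always safe.)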
2324185029Spjd */ 2325219089Spjd if (mg->mg_class != mc || mg->mg_activation_count <= 0) 2326185029Spjd mg = mc->mc_rotor; 2327185029Spjd 2328168404Spjd rotor = mg; 2329168404Spjdtop: 2330168404Spjd all_zero = B_TRUE; 2331168404Spjd do { 2332219089Spjd ASSERT(mg->mg_activation_count == 1); 2333219089Spjd 2334168404Spjd vd = mg->mg_vd; 2335209962Smm 2336185029Spjd /* 2337185029Spjd * Don't allocate from faulted devices. 2338185029Spjd */ 2339209962Smm if (zio_lock) { 2340209962Smm spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 2341209962Smm allocatable = vdev_allocatable(vd); 2342209962Smm spa_config_exit(spa, SCL_ZIO, FTAG); 2343209962Smm } else { 2344209962Smm allocatable = vdev_allocatable(vd); 2345209962Smm } 2346258633Savg 2347258633Savg /* 2348258633Savg * Determine if the selected metaslab group is eligible 2349258633Savg * for allocations. If we're ganging or have requested 2350258633Savg * an allocation for the smallest gang block size 2351258633Savg * then we don't want to avoid allocating to this 2352258633Savg * metaslab group. If we're in this condition we should 2353258633Savg * try to allocate from any device possible so that we 2354258633Savg * don't inadvertently return ENOSPC and suspend the pool 2355258633Savg * even though space is still available. 2356258633Savg */ 2357258633Savg if (allocatable && CAN_FASTGANG(flags) && 2358258633Savg psize > SPA_GANGBLOCKSIZE) 2359258633Savg allocatable = metaslab_group_allocatable(mg); 2360258633Savg 2361209962Smm if (!allocatable) 2362185029Spjd goto next; 2363209962Smm 2364185029Spjd /* 2365185029Spjd * Avoid writing single-copy data to a failing vdev 2366246675Smm * unless the user instructs us that it is okay. 2367185029Spjd */ 2368185029Spjd if ((vd->vdev_stat.vs_write_errors > 0 || 2369185029Spjd vd->vdev_state < VDEV_STATE_HEALTHY) && 2370269118Sdelphij d == 0 && dshift == 3 && vd->vdev_children == 0) { 2371185029Spjd all_zero = B_FALSE; 2372185029Spjd goto next; 2373185029Spjd } 2374168404Spjd 2375185029Spjd ASSERT(mg->mg_class == mc); 2376185029Spjd 2377168404Spjd distance = vd->vdev_asize >> dshift; 2378168404Spjd if (distance <= (1ULL << vd->vdev_ms_shift)) 2379168404Spjd distance = 0; 2380168404Spjd else 2381168404Spjd all_zero = B_FALSE; 2382168404Spjd 2383168404Spjd asize = vdev_psize_to_asize(vd, psize); 2384168404Spjd ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 2385168404Spjd 2386224177Smm offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 2387264671Sdelphij dva, d); 2388168404Spjd if (offset != -1ULL) { 2389168404Spjd /* 2390168404Spjd * If we've just selected this metaslab group, 2391168404Spjd * figure out whether the corresponding vdev is 2392168404Spjd * over- or under-used relative to the pool, 2393168404Spjd * and set an allocation bias to even it out. 2394168404Spjd */ 2395269118Sdelphij if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2396168404Spjd vdev_stat_t *vs = &vd->vdev_stat; 2397219089Spjd int64_t vu, cu; 2398168404Spjd 2399224177Smm vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2400224177Smm cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2401168404Spjd 2402168404Spjd /* 2403224177Smm * Calculate how much more or less we should 2404224177Smm * try to allocate from this device during 2405224177Smm * this iteration around the rotor. 2406224177Smm * For example, if a device is 80% full 2407224177Smm * and the pool is 20% full then we should 2408224177Smm * reduce allocations by 60% on this device. 
2409224177Smm * 2410224177Smm * mg_bias = (20 - 80) * 512K / 100 = -307K 2411224177Smm * 2412224177Smm * This reduces allocations by 307K for this 2413224177Smm * iteration. 2414168404Spjd */ 2415219089Spjd mg->mg_bias = ((cu - vu) * 2416224177Smm (int64_t)mg->mg_aliquot) / 100; 2417269118Sdelphij } else if (!metaslab_bias_enabled) { 2418269118Sdelphij mg->mg_bias = 0; 2419168404Spjd } 2420168404Spjd 2421219089Spjd if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2422168404Spjd mg->mg_aliquot + mg->mg_bias) { 2423168404Spjd mc->mc_rotor = mg->mg_next; 2424219089Spjd mc->mc_aliquot = 0; 2425168404Spjd } 2426168404Spjd 2427168404Spjd DVA_SET_VDEV(&dva[d], vd->vdev_id); 2428168404Spjd DVA_SET_OFFSET(&dva[d], offset); 2429185029Spjd DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2430168404Spjd DVA_SET_ASIZE(&dva[d], asize); 2431168404Spjd 2432168404Spjd return (0); 2433168404Spjd } 2434185029Spjdnext: 2435168404Spjd mc->mc_rotor = mg->mg_next; 2436219089Spjd mc->mc_aliquot = 0; 2437168404Spjd } while ((mg = mg->mg_next) != rotor); 2438168404Spjd 2439168404Spjd if (!all_zero) { 2440168404Spjd dshift++; 2441168404Spjd ASSERT(dshift < 64); 2442168404Spjd goto top; 2443168404Spjd } 2444168404Spjd 2445209962Smm if (!allocatable && !zio_lock) { 2446209962Smm dshift = 3; 2447209962Smm zio_lock = B_TRUE; 2448209962Smm goto top; 2449209962Smm } 2450209962Smm 2451168404Spjd bzero(&dva[d], sizeof (dva_t)); 2452168404Spjd 2453249195Smm return (SET_ERROR(ENOSPC)); 2454168404Spjd} 2455168404Spjd 2456168404Spjd/* 2457168404Spjd * Free the block represented by DVA in the context of the specified 2458168404Spjd * transaction group. 2459168404Spjd */ 2460168404Spjdstatic void 2461168404Spjdmetaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2462168404Spjd{ 2463168404Spjd uint64_t vdev = DVA_GET_VDEV(dva); 2464168404Spjd uint64_t offset = DVA_GET_OFFSET(dva); 2465168404Spjd uint64_t size = DVA_GET_ASIZE(dva); 2466168404Spjd vdev_t *vd; 2467168404Spjd metaslab_t *msp; 2468168404Spjd 2469168404Spjd ASSERT(DVA_IS_VALID(dva)); 2470168404Spjd 2471168404Spjd if (txg > spa_freeze_txg(spa)) 2472168404Spjd return; 2473168404Spjd 2474168404Spjd if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2475168404Spjd (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2476168404Spjd cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2477168404Spjd (u_longlong_t)vdev, (u_longlong_t)offset); 2478168404Spjd ASSERT(0); 2479168404Spjd return; 2480168404Spjd } 2481168404Spjd 2482168404Spjd msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2483168404Spjd 2484168404Spjd if (DVA_GET_GANG(dva)) 2485168404Spjd size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2486168404Spjd 2487168404Spjd mutex_enter(&msp->ms_lock); 2488168404Spjd 2489168404Spjd if (now) { 2490258717Savg range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2491168404Spjd offset, size); 2492258717Savg 2493258717Savg VERIFY(!msp->ms_condensing); 2494258717Savg VERIFY3U(offset, >=, msp->ms_start); 2495258717Savg VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2496258717Savg VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2497258717Savg msp->ms_size); 2498258717Savg VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2499258717Savg VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2500258717Savg range_tree_add(msp->ms_tree, offset, size); 2501168404Spjd } else { 2502258717Savg if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2503168404Spjd vdev_dirty(vd, VDD_METASLAB, msp, txg); 2504258717Savg 
range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2505258717Savg offset, size); 2506168404Spjd } 2507168404Spjd 2508168404Spjd mutex_exit(&msp->ms_lock); 2509168404Spjd} 2510168404Spjd 2511168404Spjd/* 2512168404Spjd * Intent log support: upon opening the pool after a crash, notify the SPA 2513168404Spjd * of blocks that the intent log has allocated for immediate write, but 2514168404Spjd * which are still considered free by the SPA because the last transaction 2515168404Spjd * group didn't commit yet. 2516168404Spjd */ 2517168404Spjdstatic int 2518168404Spjdmetaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2519168404Spjd{ 2520168404Spjd uint64_t vdev = DVA_GET_VDEV(dva); 2521168404Spjd uint64_t offset = DVA_GET_OFFSET(dva); 2522168404Spjd uint64_t size = DVA_GET_ASIZE(dva); 2523168404Spjd vdev_t *vd; 2524168404Spjd metaslab_t *msp; 2525219089Spjd int error = 0; 2526168404Spjd 2527168404Spjd ASSERT(DVA_IS_VALID(dva)); 2528168404Spjd 2529168404Spjd if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2530168404Spjd (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2531249195Smm return (SET_ERROR(ENXIO)); 2532168404Spjd 2533168404Spjd msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2534168404Spjd 2535168404Spjd if (DVA_GET_GANG(dva)) 2536168404Spjd size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2537168404Spjd 2538168404Spjd mutex_enter(&msp->ms_lock); 2539168404Spjd 2540258717Savg if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2541224177Smm error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2542219089Spjd 2543258717Savg if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2544249195Smm error = SET_ERROR(ENOENT); 2545219089Spjd 2546185029Spjd if (error || txg == 0) { /* txg == 0 indicates dry run */ 2547168404Spjd mutex_exit(&msp->ms_lock); 2548168404Spjd return (error); 2549168404Spjd } 2550168404Spjd 2551258717Savg VERIFY(!msp->ms_condensing); 2552258717Savg VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2553258717Savg VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2554258717Savg VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2555258717Savg range_tree_remove(msp->ms_tree, offset, size); 2556168404Spjd 2557209962Smm if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2558258717Savg if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2559185029Spjd vdev_dirty(vd, VDD_METASLAB, msp, txg); 2560258717Savg range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2561185029Spjd } 2562185029Spjd 2563168404Spjd mutex_exit(&msp->ms_lock); 2564168404Spjd 2565168404Spjd return (0); 2566168404Spjd} 2567168404Spjd 2568168404Spjdint 2569185029Spjdmetaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2570185029Spjd int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 2571168404Spjd{ 2572168404Spjd dva_t *dva = bp->blk_dva; 2573168404Spjd dva_t *hintdva = hintbp->blk_dva; 2574168404Spjd int error = 0; 2575168404Spjd 2576185029Spjd ASSERT(bp->blk_birth == 0); 2577219089Spjd ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2578185029Spjd 2579185029Spjd spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2580185029Spjd 2581185029Spjd if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2582185029Spjd spa_config_exit(spa, SCL_ALLOC, FTAG); 2583249195Smm return (SET_ERROR(ENOSPC)); 2584185029Spjd } 2585185029Spjd 2586168404Spjd ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2587168404Spjd ASSERT(BP_GET_NDVAS(bp) == 0); 2588168404Spjd ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 
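 /*
 * If any copy fails in the loop below, the DVAs already allocated for
 * this bp are unwound via metaslab_free_dva() before returning, so the
 * caller never sees a partially filled block pointer.
 */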
2589168404Spjd 2590185029Spjd for (int d = 0; d < ndvas; d++) { 2591185029Spjd error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2592185029Spjd txg, flags); 2593258717Savg if (error != 0) { 2594168404Spjd for (d--; d >= 0; d--) { 2595168404Spjd metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2596168404Spjd bzero(&dva[d], sizeof (dva_t)); 2597168404Spjd } 2598185029Spjd spa_config_exit(spa, SCL_ALLOC, FTAG); 2599168404Spjd return (error); 2600168404Spjd } 2601168404Spjd } 2602168404Spjd ASSERT(error == 0); 2603168404Spjd ASSERT(BP_GET_NDVAS(bp) == ndvas); 2604168404Spjd 2605185029Spjd spa_config_exit(spa, SCL_ALLOC, FTAG); 2606185029Spjd 2607219089Spjd BP_SET_BIRTH(bp, txg, txg); 2608185029Spjd 2609168404Spjd return (0); 2610168404Spjd} 2611168404Spjd 2612168404Spjdvoid 2613168404Spjdmetaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2614168404Spjd{ 2615168404Spjd const dva_t *dva = bp->blk_dva; 2616168404Spjd int ndvas = BP_GET_NDVAS(bp); 2617168404Spjd 2618168404Spjd ASSERT(!BP_IS_HOLE(bp)); 2619219089Spjd ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2620168404Spjd 2621185029Spjd spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2622185029Spjd 2623185029Spjd for (int d = 0; d < ndvas; d++) 2624168404Spjd metaslab_free_dva(spa, &dva[d], txg, now); 2625185029Spjd 2626185029Spjd spa_config_exit(spa, SCL_FREE, FTAG); 2627168404Spjd} 2628168404Spjd 2629168404Spjdint 2630168404Spjdmetaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2631168404Spjd{ 2632168404Spjd const dva_t *dva = bp->blk_dva; 2633168404Spjd int ndvas = BP_GET_NDVAS(bp); 2634185029Spjd int error = 0; 2635168404Spjd 2636168404Spjd ASSERT(!BP_IS_HOLE(bp)); 2637168404Spjd 2638185029Spjd if (txg != 0) { 2639185029Spjd /* 2640185029Spjd * First do a dry run to make sure all DVAs are claimable, 2641185029Spjd * so we don't have to unwind from partial failures below. 
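 * (txg == 0 is the dry-run convention used throughout this file:
 * metaslab_claim_dva() verifies that each segment is still free and
 * returns before removing anything from the range trees.)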
2642185029Spjd */ 2643185029Spjd if ((error = metaslab_claim(spa, bp, 0)) != 0) 2644185029Spjd return (error); 2645185029Spjd } 2646185029Spjd 2647185029Spjd spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2648185029Spjd 2649185029Spjd for (int d = 0; d < ndvas; d++) 2650168404Spjd if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2651185029Spjd break; 2652168404Spjd 2653185029Spjd spa_config_exit(spa, SCL_ALLOC, FTAG); 2654185029Spjd 2655185029Spjd ASSERT(error == 0 || txg == 0); 2656185029Spjd 2657185029Spjd return (error); 2658168404Spjd} 2659248571Smm 2660248571Smmvoid 2661248571Smmmetaslab_check_free(spa_t *spa, const blkptr_t *bp) 2662248571Smm{ 2663248571Smm if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2664248571Smm return; 2665248571Smm 2666248571Smm spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2667248571Smm for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2668258717Savg uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2669258717Savg vdev_t *vd = vdev_lookup_top(spa, vdev); 2670258717Savg uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2671248571Smm uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2672258717Savg metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2673248571Smm 2674258717Savg if (msp->ms_loaded) 2675258717Savg range_tree_verify(msp->ms_tree, offset, size); 2676248571Smm 2677248571Smm for (int j = 0; j < TXG_SIZE; j++) 2678258717Savg range_tree_verify(msp->ms_freetree[j], offset, size); 2679248571Smm for (int j = 0; j < TXG_DEFER_SIZE; j++) 2680258717Savg range_tree_verify(msp->ms_defertree[j], offset, size); 2681248571Smm } 2682248571Smm spa_config_exit(spa, SCL_VDEV, FTAG); 2683248571Smm} 2684