/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/zap.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");

#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, force_ganging, CTLFLAG_RWTUN,
    &metaslab_force_ganging, 0,
    "Force gang block allocation for blocks larger than or equal to this value");

/*
 * Since we can touch multiple metaslabs (and their respective space maps)
 * with each transaction group, we benefit from having a smaller space map
 * block size since it allows us to issue more I/O operations scattered
 * around the disk.
 */
int zfs_metaslab_sm_blksz = (1 << 12);
SYSCTL_INT(_vfs_zfs, OID_AUTO, metaslab_sm_blksz, CTLFLAG_RDTUN,
    &zfs_metaslab_sm_blksz, 0,
    "Block size for metaslab space map. Power of 2 and greater than 4096.");

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
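 * For example, with the default value of 200 the space map is condensed
 * only once its on-disk representation is more than twice the size of its
 * in-core counterpart.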
 */
int zfs_condense_pct = 200;
SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
    &zfs_condense_pct, 0,
    "Condense on-disk spacemap when it is more than this many percent"
    " of its in-memory counterpart");

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
    &zfs_mg_noalloc_threshold, 0,
    "Percentage of metaslab group size that should be free"
    " to make it eligible for allocation");

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
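 * For example, with the default value of 85, a group reporting 90%
 * fragmentation is skipped as long as at least one other group in the
 * class is at or below 85%.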
 */
int zfs_mg_fragmentation_threshold = 85;
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_mg_fragmentation_threshold, 0,
    "Maximum fragmentation percentage of a metaslab group that is still "
    "considered eligible for allocations unless all metaslab groups within "
    "the metaslab class have also crossed this threshold");

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_metaslab_fragmentation_threshold, 0,
    "Maximum fragmentation percentage for a metaslab to keep its active state");

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
    &metaslab_debug_load, 0,
    "Load all metaslabs when pool is first opened");

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
    &metaslab_debug_unload, 0,
    "Prevent metaslabs from being unloaded");

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
    &metaslab_df_alloc_threshold, 0,
    "Minimum size which forces the dynamic allocator to change "
    "its allocation strategy");

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
    &metaslab_df_free_pct, 0,
    "The minimum free space, in percent, which must be available in a "
    "space map to continue allocations in a first-fit fashion");

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
    &metaslab_min_alloc_size, 0,
    "A metaslab is considered \"free\" if it contains a contiguous "
    "segment which is greater than vfs.zfs.metaslab.min_alloc_size");

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
    &metaslab_load_pct, 0,
    "Percentage of cpus that can be used by the metaslab taskq");

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
    &metaslab_unload_delay, 0,
    "Number of TXGs that an unused metaslab can be kept in memory");

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
    &metaslab_preload_limit, 0,
    "Max number of metaslabs per group to preload");

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
    &metaslab_preload_enabled, 0,
    "Enable preloading of metaslabs");

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
    &metaslab_fragmentation_factor_enabled, 0,
    "Enable fragmentation weighting on metaslabs");

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
    &metaslab_lba_weighting_enabled, 0,
    "Enable LBA weighting (i.e. outer tracks are given preference)");

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
    &metaslab_bias_enabled, 0,
    "Enable metaslab group biasing");

/*
 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 */
boolean_t zfs_remap_blkptr_enable = B_TRUE;

/*
 * Enable/disable segment-based metaslab selection.
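 * When enabled, a metaslab's weight reflects the size of its largest free
 * segment rather than just its total amount of free space.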
 */
boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;

/*
 * When using segment-based metaslab selection, we will continue
 * allocating from the active metaslab until we have exhausted
 * zfs_metaslab_switch_threshold of its buckets.
 */
int zfs_metaslab_switch_threshold = 2;

/*
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
boolean_t metaslab_trace_enabled = B_TRUE;

/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached allowing for further investigation.
 */
uint64_t metaslab_trace_max_entries = 5000;

static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);

kmem_cache_t *metaslab_alloc_trace_cache;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (refcount_t), KM_SLEEP);
	mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < spa->spa_alloc_count; i++)
		refcount_create_tracked(&mc->mc_alloc_slots[i]);

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
		refcount_destroy(&mc->mc_alloc_slots[i]);
	kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (refcount_t));
	kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (uint64_t));
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

void
metaslab_class_minblocksize_update(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;
	uint64_t minashift = UINT64_MAX;

	if ((mg = mc->mc_rotor) == NULL) {
		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
		return;
	}

	do {
		vd = mg->mg_vd;
		if (vd->vdev_ashift < minashift)
			minashift = vd->vdev_ashift;
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	mc->mc_minblocksize = 1ULL << minashift;
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

uint64_t
metaslab_class_get_minblocksize(metaslab_class_t *mc)
{
	return (mc->mc_minblocksize);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels,
		 * or vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
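		 * A group reports ZFS_FRAG_INVALID when not enough of its
		 * metaslabs have a valid fragmentation metric yet, in which
		 * case the class-wide metric is unknown as well.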
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}

/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		uint64_t tspace;
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 * Adjust by efi system partition size.
		 */
		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
		if (tspace > mc->mc_spa->spa_bootsize) {
			tspace -= mc->mc_spa->spa_bootsize;
		}
		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}

static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = (const metaslab_t *)x1;
	const metaslab_t *m2 = (const metaslab_t *)x2;

	int sort1 = 0;
	int sort2 = 0;
	if (m1->ms_allocator != -1 && m1->ms_primary)
		sort1 = 1;
	else if (m1->ms_allocator != -1 && !m1->ms_primary)
		sort1 = 2;
	if (m2->ms_allocator != -1 && m2->ms_primary)
		sort2 = 1;
	else if (m2->ms_allocator != -1 && !m2->ms_primary)
		sort2 = 2;

	/*
	 * Sort inactive metaslabs first, then primaries, then secondaries.
	 * When selecting a metaslab to allocate from, an allocator first
	 * tries its primary, then secondary active metaslab. If it doesn't
	 * have active
	 * metaslabs, or can't allocate from them, it searches for an inactive
	 * metaslab to activate. If it can't find a suitable one, it will steal
	 * a primary or secondary metaslab from another allocator.
	 */
	if (sort1 < sort2)
		return (-1);
	if (sort1 > sort2)
		return (1);

	int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
	if (likely(cmp))
		return (cmp);

	IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);

	return (AVL_CMP(m1->ms_start, m2->ms_start));
}

/*
 * Verify that the space accounting on disk matches the in-core range_trees.
 */
void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t allocated = 0;
	uint64_t sm_free_space, msp_free_space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can only verify the metaslab space when we're called
	 * from syncing context with a loaded metaslab that has an allocated
	 * space map. Calling this in non-syncing context does not
	 * provide a consistent view of the metaslab since we're performing
	 * allocations in the future.
	 */
	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
	    !msp->ms_loaded)
		return;

	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
	    space_map_alloc_delta(msp->ms_sm);

	/*
	 * Account for future allocations since we would have already
	 * deducted that space from the ms_freetree.
	 */
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		allocated +=
		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
	}

	msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
	    msp->ms_deferspace + range_tree_space(msp->ms_freed);

	VERIFY3U(sm_free_space, ==, msp_free_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the capacity is greater than
 * the zfs_mg_noalloc_threshold and the fragmentation value is less than
 * or equal to zfs_mg_fragmentation_threshold (or is invalid). If a
 * metaslab group transitions from allocatable to non-allocatable or vice
 * versa then the metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);
	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
	    SCL_ALLOC);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
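	 * (metaslab_group_allocatable() consults both mc_alloc_groups and
	 * the per-group mg_allocatable flag when deciding whether a group
	 * may be skipped.)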
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	mg->mg_allocators = allocators;

	mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t),
	    KM_SLEEP);
	mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < allocators; i++) {
		refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
	kmem_free(mg->mg_secondaries, mg->mg_allocators *
	    sizeof (metaslab_t *));
	mutex_destroy(&mg->mg_lock);
	mutex_destroy(&mg->mg_ms_initialize_lock);
	cv_destroy(&mg->mg_ms_initialize_cv);

	for (int i = 0; i < mg->mg_allocators; i++) {
		refcount_destroy(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}
	kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
	    sizeof (refcount_t));
	kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
	    sizeof (uint64_t));

	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	metaslab_class_minblocksize_update(mc);
}

/*
 * Passivate a metaslab group and remove it from the allocation rotor.
 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 * a metaslab group. This function will momentarily drop spa_config_locks
 * that are lower than the SCL_ALLOC lock (see comment below).
 */
void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	spa_t *spa = mc->mc_spa;
	metaslab_group_t *mgprev, *mgnext;
	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);

	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
	    (SCL_ALLOC | SCL_ZIO));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	/*
	 * The spa_config_lock is an array of rwlocks, ordered as
	 * follows (from highest to lowest):
	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
	 * (For more information about the spa_config_lock see spa_misc.c)
	 * The higher the lock, the broader its coverage. When we passivate
	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
	 * config locks. However, the metaslab group's taskq might be trying
	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
	 * lower locks to allow the I/O to complete. At a minimum,
	 * we continue to hold the SCL_ALLOC lock, which prevents any future
	 * allocations from taking place and any changes to the vdev tree.
	 */
	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
	taskq_wait(mg->mg_taskq);
	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
	metaslab_group_alloc_update(mg);
	for (int i = 0; i < mg->mg_allocators; i++) {
		metaslab_t *msp = mg->mg_primaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
		msp = mg->mg_secondaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
	}

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	metaslab_class_minblocksize_update(mc);
}

boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_sm == NULL)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	ASSERT(MUTEX_HELD(&mg->mg_lock));
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	metaslab_group_sort_impl(mg, msp, weight);
	mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
 * group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}

/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize, int allocator, int d)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		/*
		 * Relax allocation throttling for ditto blocks. Due to
		 * random imbalances in allocation it tends to push copies
		 * to one vdev that looks a bit better at the moment.
		 */
		qmax = qmax * (4 + d) / 4;

		qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
			qmax = qmax * (4 + d) / 4;
			qdepth = refcount_count(
			    &mgp->mg_alloc_queue_depth[allocator]);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;
	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	int cmp = AVL_CMP(rs_size1, rs_size2);
	if (likely(cmp))
		return (cmp);

	if (r1->rs_start < r2->rs_start)
		return (-1);

	return (AVL_CMP(r1->rs_start, r2->rs_start));
}

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
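 * This is simply the last (largest) entry in the size-sorted
 * ms_allocatable_by_size tree.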
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
	avl_tree_t *t = &msp->ms_allocatable_by_size;
	range_seg_t *rs;

	if (t == NULL || (rs = avl_last(t)) == NULL)
		return (0ULL);

	return (rs->rs_end - rs->rs_start);
}

static range_seg_t *
metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
	range_seg_t *rs, rsearch;
	avl_index_t where;

	rsearch.rs_start = start;
	rsearch.rs_end = start + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL) {
		rs = avl_nearest(t, where, AVL_AFTER);
	}

	return (rs);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	range_seg_t *rs = metaslab_block_find(t, *cursor, size);

	while (rs != NULL) {
		uint64_t offset = P2ROUNDUP(rs->rs_start, align);

		if (offset + size <= rs->rs_end) {
			*cursor = offset + size;
			return (offset);
		}
		rs = AVL_NEXT(t, rs);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocation sizes
	 * may exist in the same region.
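	 * For example, a 24K request uses the 8K cursor bucket, since 8K is
	 * the largest power of 2 that divides 24K evenly.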
1290258717Savg */ 1291211931Smm uint64_t align = size & -size; 1292264669Sdelphij uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1293332547Smav avl_tree_t *t = &msp->ms_allocatable->rt_root; 1294209962Smm 1295211931Smm return (metaslab_block_picker(t, cursor, size, align)); 1296209962Smm} 1297209962Smm 1298258717Savgstatic metaslab_ops_t metaslab_ff_ops = { 1299269118Sdelphij metaslab_ff_alloc 1300211931Smm}; 1301209962Smm 1302211931Smm/* 1303211931Smm * ========================================================================== 1304211931Smm * Dynamic block allocator - 1305211931Smm * Uses the first fit allocation scheme until space get low and then 1306211931Smm * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1307211931Smm * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1308211931Smm * ========================================================================== 1309211931Smm */ 1310209962Smmstatic uint64_t 1311258717Savgmetaslab_df_alloc(metaslab_t *msp, uint64_t size) 1312209962Smm{ 1313258717Savg /* 1314258717Savg * Find the largest power of 2 block size that evenly divides the 1315258717Savg * requested size. This is used to try to allocate blocks with similar 1316258717Savg * alignment from the same area of the metaslab (i.e. same cursor 1317258717Savg * bucket) but it does not guarantee that other allocations sizes 1318258717Savg * may exist in the same region. 1319258717Savg */ 1320209962Smm uint64_t align = size & -size; 1321264669Sdelphij uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1322332547Smav range_tree_t *rt = msp->ms_allocatable; 1323258717Savg avl_tree_t *t = &rt->rt_root; 1324258717Savg uint64_t max_size = metaslab_block_maxsize(msp); 1325258717Savg int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1326209962Smm 1327258717Savg ASSERT(MUTEX_HELD(&msp->ms_lock)); 1328332547Smav ASSERT3U(avl_numnodes(t), ==, 1329332547Smav avl_numnodes(&msp->ms_allocatable_by_size)); 1330209962Smm 1331209962Smm if (max_size < size) 1332209962Smm return (-1ULL); 1333209962Smm 1334209962Smm /* 1335209962Smm * If we're running low on space switch to using the size 1336209962Smm * sorted AVL tree (best-fit). 1337209962Smm */ 1338209962Smm if (max_size < metaslab_df_alloc_threshold || 1339209962Smm free_pct < metaslab_df_free_pct) { 1340332547Smav t = &msp->ms_allocatable_by_size; 1341209962Smm *cursor = 0; 1342209962Smm } 1343209962Smm 1344209962Smm return (metaslab_block_picker(t, cursor, size, 1ULL)); 1345209962Smm} 1346209962Smm 1347258717Savgstatic metaslab_ops_t metaslab_df_ops = { 1348269118Sdelphij metaslab_df_alloc 1349209962Smm}; 1350209962Smm 1351211931Smm/* 1352211931Smm * ========================================================================== 1353258717Savg * Cursor fit block allocator - 1354258717Savg * Select the largest region in the metaslab, set the cursor to the beginning 1355258717Savg * of the range and the cursor_end to the end of the range. As allocations 1356258717Savg * are made advance the cursor. Continue allocating from the cursor until 1357258717Savg * the range is exhausted and then find a new range. 
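 *
 * A hypothetical walk-through: if the largest free segment is
 * [1M, 1.5M), the cursor is set to 1M and cursor_end to 1.5M; a series
 * of 128K allocations then returns 1M, 1M+128K, and so on, until the
 * next allocation would cross 1.5M, at which point the largest
 * remaining segment is selected and the cursors are reset to cover it.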
1358211931Smm * ========================================================================== 1359211931Smm */ 1360211931Smmstatic uint64_t 1361258717Savgmetaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1362211931Smm{ 1363332547Smav range_tree_t *rt = msp->ms_allocatable; 1364332547Smav avl_tree_t *t = &msp->ms_allocatable_by_size; 1365258717Savg uint64_t *cursor = &msp->ms_lbas[0]; 1366258717Savg uint64_t *cursor_end = &msp->ms_lbas[1]; 1367211931Smm uint64_t offset = 0; 1368209962Smm 1369258717Savg ASSERT(MUTEX_HELD(&msp->ms_lock)); 1370258717Savg ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1371211931Smm 1372258717Savg ASSERT3U(*cursor_end, >=, *cursor); 1373211931Smm 1374258717Savg if ((*cursor + size) > *cursor_end) { 1375258717Savg range_seg_t *rs; 1376211931Smm 1377332547Smav rs = avl_last(&msp->ms_allocatable_by_size); 1378258717Savg if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1379258717Savg return (-1ULL); 1380211931Smm 1381258717Savg *cursor = rs->rs_start; 1382258717Savg *cursor_end = rs->rs_end; 1383258717Savg } 1384211931Smm 1385258717Savg offset = *cursor; 1386258717Savg *cursor += size; 1387258717Savg 1388211931Smm return (offset); 1389211931Smm} 1390211931Smm 1391258717Savgstatic metaslab_ops_t metaslab_cf_ops = { 1392269118Sdelphij metaslab_cf_alloc 1393211931Smm}; 1394211931Smm 1395258717Savg/* 1396258717Savg * ========================================================================== 1397258717Savg * New dynamic fit allocator - 1398258717Savg * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1399258717Savg * contiguous blocks. If no region is found then just use the largest segment 1400258717Savg * that remains. 1401258717Savg * ========================================================================== 1402258717Savg */ 1403258717Savg 1404258717Savg/* 1405258717Savg * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1406258717Savg * to request from the allocator. 
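 *
 * As a rough, hypothetical example: with the default clump shift of 4,
 * a 32K request that cannot be satisfied near its cursor falls back to
 * searching the size-sorted tree for a segment large enough to hold on
 * the order of 2^4 such blocks (capped at the largest free segment).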
1407258717Savg */ 1408211931Smmuint64_t metaslab_ndf_clump_shift = 4; 1409211931Smm 1410211931Smmstatic uint64_t 1411258717Savgmetaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1412211931Smm{ 1413332547Smav avl_tree_t *t = &msp->ms_allocatable->rt_root; 1414211931Smm avl_index_t where; 1415258717Savg range_seg_t *rs, rsearch; 1416264669Sdelphij uint64_t hbit = highbit64(size); 1417258717Savg uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1418258717Savg uint64_t max_size = metaslab_block_maxsize(msp); 1419211931Smm 1420258717Savg ASSERT(MUTEX_HELD(&msp->ms_lock)); 1421332547Smav ASSERT3U(avl_numnodes(t), ==, 1422332547Smav avl_numnodes(&msp->ms_allocatable_by_size)); 1423211931Smm 1424211931Smm if (max_size < size) 1425211931Smm return (-1ULL); 1426211931Smm 1427258717Savg rsearch.rs_start = *cursor; 1428258717Savg rsearch.rs_end = *cursor + size; 1429211931Smm 1430258717Savg rs = avl_find(t, &rsearch, &where); 1431258717Savg if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1432332547Smav t = &msp->ms_allocatable_by_size; 1433211931Smm 1434258717Savg rsearch.rs_start = 0; 1435258717Savg rsearch.rs_end = MIN(max_size, 1436211931Smm 1ULL << (hbit + metaslab_ndf_clump_shift)); 1437258717Savg rs = avl_find(t, &rsearch, &where); 1438258717Savg if (rs == NULL) 1439258717Savg rs = avl_nearest(t, where, AVL_AFTER); 1440258717Savg ASSERT(rs != NULL); 1441211931Smm } 1442211931Smm 1443258717Savg if ((rs->rs_end - rs->rs_start) >= size) { 1444258717Savg *cursor = rs->rs_start + size; 1445258717Savg return (rs->rs_start); 1446211931Smm } 1447211931Smm return (-1ULL); 1448211931Smm} 1449211931Smm 1450258717Savgstatic metaslab_ops_t metaslab_ndf_ops = { 1451269118Sdelphij metaslab_ndf_alloc 1452211931Smm}; 1453211931Smm 1454258717Savgmetaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1455211931Smm 1456209962Smm/* 1457168404Spjd * ========================================================================== 1458168404Spjd * Metaslabs 1459168404Spjd * ========================================================================== 1460168404Spjd */ 1461258717Savg 1462258717Savg/* 1463258717Savg * Wait for any in-progress metaslab loads to complete. 1464258717Savg */ 1465258717Savgvoid 1466258717Savgmetaslab_load_wait(metaslab_t *msp) 1467258717Savg{ 1468258717Savg ASSERT(MUTEX_HELD(&msp->ms_lock)); 1469258717Savg 1470258717Savg while (msp->ms_loading) { 1471258717Savg ASSERT(!msp->ms_loaded); 1472258717Savg cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1473258717Savg } 1474258717Savg} 1475258717Savg 1476258717Savgint 1477258717Savgmetaslab_load(metaslab_t *msp) 1478258717Savg{ 1479258717Savg int error = 0; 1480321529Smav boolean_t success = B_FALSE; 1481258717Savg 1482258717Savg ASSERT(MUTEX_HELD(&msp->ms_lock)); 1483258717Savg ASSERT(!msp->ms_loaded); 1484258717Savg ASSERT(!msp->ms_loading); 1485258717Savg 1486258717Savg msp->ms_loading = B_TRUE; 1487332525Smav /* 1488332525Smav * Nobody else can manipulate a loading metaslab, so it's now safe 1489332525Smav * to drop the lock. This way we don't have to hold the lock while 1490332525Smav * reading the spacemap from disk. 1491332525Smav */ 1492332525Smav mutex_exit(&msp->ms_lock); 1493258717Savg 1494258717Savg /* 1495258717Savg * If the space map has not been allocated yet, then treat 1496332547Smav * all the space in the metaslab as free and add it to ms_allocatable. 
1497258717Savg */ 1498332547Smav if (msp->ms_sm != NULL) { 1499332547Smav error = space_map_load(msp->ms_sm, msp->ms_allocatable, 1500332547Smav SM_FREE); 1501332547Smav } else { 1502332547Smav range_tree_add(msp->ms_allocatable, 1503332547Smav msp->ms_start, msp->ms_size); 1504332547Smav } 1505258717Savg 1506321529Smav success = (error == 0); 1507332525Smav 1508332525Smav mutex_enter(&msp->ms_lock); 1509258717Savg msp->ms_loading = B_FALSE; 1510258717Savg 1511321529Smav if (success) { 1512321529Smav ASSERT3P(msp->ms_group, !=, NULL); 1513321529Smav msp->ms_loaded = B_TRUE; 1514321529Smav 1515332547Smav /* 1516332547Smav * If the metaslab already has a spacemap, then we need to 1517332547Smav * remove all segments from the defer tree; otherwise, the 1518332547Smav * metaslab is completely empty and we can skip this. 1519332547Smav */ 1520332547Smav if (msp->ms_sm != NULL) { 1521332547Smav for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1522332547Smav range_tree_walk(msp->ms_defer[t], 1523332547Smav range_tree_remove, msp->ms_allocatable); 1524332547Smav } 1525258717Savg } 1526321529Smav msp->ms_max_size = metaslab_block_maxsize(msp); 1527258717Savg } 1528258717Savg cv_broadcast(&msp->ms_load_cv); 1529258717Savg return (error); 1530258717Savg} 1531258717Savg 1532258717Savgvoid 1533258717Savgmetaslab_unload(metaslab_t *msp) 1534258717Savg{ 1535258717Savg ASSERT(MUTEX_HELD(&msp->ms_lock)); 1536332547Smav range_tree_vacate(msp->ms_allocatable, NULL, NULL); 1537258717Savg msp->ms_loaded = B_FALSE; 1538258717Savg msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1539321529Smav msp->ms_max_size = 0; 1540258717Savg} 1541258717Savg 1542275594Sdelphijint 1543275594Sdelphijmetaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1544275594Sdelphij metaslab_t **msp) 1545168404Spjd{ 1546168404Spjd vdev_t *vd = mg->mg_vd; 1547258717Savg objset_t *mos = vd->vdev_spa->spa_meta_objset; 1548275594Sdelphij metaslab_t *ms; 1549275594Sdelphij int error; 1550168404Spjd 1551275594Sdelphij ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1552275594Sdelphij mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1553332525Smav mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); 1554275594Sdelphij cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1555339111Smav 1556275594Sdelphij ms->ms_id = id; 1557275594Sdelphij ms->ms_start = id << vd->vdev_ms_shift; 1558275594Sdelphij ms->ms_size = 1ULL << vd->vdev_ms_shift; 1559339105Smav ms->ms_allocator = -1; 1560339105Smav ms->ms_new = B_TRUE; 1561168404Spjd 1562258717Savg /* 1563258717Savg * We only open space map objects that already exist. All others 1564258717Savg * will be opened when we finally allocate an object for it. 1565258717Savg */ 1566258717Savg if (object != 0) { 1567275594Sdelphij error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1568332525Smav ms->ms_size, vd->vdev_ashift); 1569275594Sdelphij 1570275594Sdelphij if (error != 0) { 1571275594Sdelphij kmem_free(ms, sizeof (metaslab_t)); 1572275594Sdelphij return (error); 1573275594Sdelphij } 1574275594Sdelphij 1575275594Sdelphij ASSERT(ms->ms_sm != NULL); 1576258717Savg } 1577168404Spjd 1578168404Spjd /* 1579258717Savg * We create the main range tree here, but we don't create the 1580321539Smav * other range trees until metaslab_sync_done(). This serves 1581168404Spjd * two purposes: it allows metaslab_sync_done() to detect the 1582168404Spjd * addition of new space; and for debugging, it ensures that we'd 1583168404Spjd * data fault on any attempt to use this metaslab before it's ready. 
1584168404Spjd */
1585339034Ssef ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
1586339034Ssef     metaslab_rangesize_compare, 0);
1587275594Sdelphij metaslab_group_add(mg, ms);
1588168404Spjd
1589321529Smav metaslab_set_fragmentation(ms);
1590219089Spjd
1591168404Spjd /*
1592168404Spjd * If we're opening an existing pool (txg == 0) or creating
1593168404Spjd * a new one (txg == TXG_INITIAL), all space is available now.
1594168404Spjd * If we're adding space to an existing pool, the new space
1595168404Spjd * does not become available until after this txg has synced.
1596321529Smav * The metaslab's weight will also be initialized when we sync
1597321529Smav * out this txg. This ensures that we don't attempt to allocate
1598321529Smav * from it before we have initialized it completely.
1599168404Spjd */
1600168404Spjd if (txg <= TXG_INITIAL)
1601275594Sdelphij metaslab_sync_done(ms, 0);
1602168404Spjd
1603258717Savg /*
1604258717Savg * If metaslab_debug_load is set and we're initializing a metaslab
1605321529Smav * that has an allocated space map object, then load its space
1606258717Savg * map so that we can verify frees.
1607258717Savg */
1608275594Sdelphij if (metaslab_debug_load && ms->ms_sm != NULL) {
1609275594Sdelphij mutex_enter(&ms->ms_lock);
1610275594Sdelphij VERIFY0(metaslab_load(ms));
1611275594Sdelphij mutex_exit(&ms->ms_lock);
1612258717Savg }
1613258717Savg
1614168404Spjd if (txg != 0) {
1615168404Spjd vdev_dirty(vd, 0, NULL, txg);
1616275594Sdelphij vdev_dirty(vd, VDD_METASLAB, ms, txg);
1617168404Spjd }
1618168404Spjd
1619275594Sdelphij *msp = ms;
1620275594Sdelphij
1621275594Sdelphij return (0);
1622168404Spjd}
1623168404Spjd
1624168404Spjdvoid
1625168404Spjdmetaslab_fini(metaslab_t *msp)
1626168404Spjd{
1627168404Spjd metaslab_group_t *mg = msp->ms_group;
1628168404Spjd
1629168404Spjd metaslab_group_remove(mg, msp);
1630168404Spjd
1631168404Spjd mutex_enter(&msp->ms_lock);
1632258717Savg VERIFY(msp->ms_group == NULL);
1633258717Savg vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1634258717Savg     0, -msp->ms_size);
1635258717Savg space_map_close(msp->ms_sm);
1636168404Spjd
1637258717Savg metaslab_unload(msp);
1638332547Smav range_tree_destroy(msp->ms_allocatable);
1639332547Smav range_tree_destroy(msp->ms_freeing);
1640332547Smav range_tree_destroy(msp->ms_freed);
1641258717Savg
1642219089Spjd for (int t = 0; t < TXG_SIZE; t++) {
1643332547Smav range_tree_destroy(msp->ms_allocating[t]);
1644168404Spjd }
1645168404Spjd
1646247398Smm for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1647332547Smav range_tree_destroy(msp->ms_defer[t]);
1648247398Smm }
1649240415Smm ASSERT0(msp->ms_deferspace);
1650219089Spjd
1651332547Smav range_tree_destroy(msp->ms_checkpointing);
1652332547Smav
1653168404Spjd mutex_exit(&msp->ms_lock);
1654258717Savg cv_destroy(&msp->ms_load_cv);
1655168404Spjd mutex_destroy(&msp->ms_lock);
1656332525Smav mutex_destroy(&msp->ms_sync_lock);
1657339105Smav ASSERT3U(msp->ms_allocator, ==, -1);
1658168404Spjd
1659168404Spjd kmem_free(msp, sizeof (metaslab_t));
1660168404Spjd}
1661168404Spjd
1662269118Sdelphij#define FRAGMENTATION_TABLE_SIZE 17
1663269118Sdelphij
1664258717Savg/*
1665269118Sdelphij * This table defines a segment size based fragmentation metric that will
1666269118Sdelphij * allow each metaslab to derive its own fragmentation value. This is done
1667269118Sdelphij * by calculating the space in each bucket of the spacemap histogram and
1668269118Sdelphij * multiplying that by the fragmentation metric in this table. Doing
1669269118Sdelphij * this for all buckets and dividing it by the total amount of free
1670269118Sdelphij * space in this metaslab (i.e. the total free space in all buckets) gives
1671269118Sdelphij * us the fragmentation metric. This means that a high fragmentation metric
1672269118Sdelphij * equates to most of the free space being comprised of small segments.
1673269118Sdelphij * Conversely, if the metric is low, then most of the free space is in
1674269118Sdelphij * large segments. A 10% change in fragmentation equates to approximately
1675269118Sdelphij * double the number of segments.
1676258717Savg *
1677269118Sdelphij * This table defines 0% fragmented space using 16MB segments. Testing has
1678269118Sdelphij * shown that segments that are greater than or equal to 16MB do not suffer
1679269118Sdelphij * from drastic performance problems. Using this value, we derive the rest
1680269118Sdelphij * of the table. Since the fragmentation value is never stored on disk, it
1681269118Sdelphij * is possible to change these calculations in the future.
1682258717Savg */
1683269118Sdelphijint zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1684269118Sdelphij 100, /* 512B */
1685269118Sdelphij 100, /* 1K */
1686269118Sdelphij 98, /* 2K */
1687269118Sdelphij 95, /* 4K */
1688269118Sdelphij 90, /* 8K */
1689269118Sdelphij 80, /* 16K */
1690269118Sdelphij 70, /* 32K */
1691269118Sdelphij 60, /* 64K */
1692269118Sdelphij 50, /* 128K */
1693269118Sdelphij 40, /* 256K */
1694269118Sdelphij 30, /* 512K */
1695269118Sdelphij 20, /* 1M */
1696269118Sdelphij 15, /* 2M */
1697269118Sdelphij 10, /* 4M */
1698269118Sdelphij 5, /* 8M */
1699269118Sdelphij 0 /* 16M */
1700269118Sdelphij};
1701269118Sdelphij
1702269118Sdelphij/*
1703269118Sdelphij * Calculate the metaslab's fragmentation metric. A return value
1704269118Sdelphij * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1705269118Sdelphij * not support this metric. Otherwise, the return value should be in the
1706269118Sdelphij * range [0, 100].
1707269118Sdelphij */
1708321529Smavstatic void
1709321529Smavmetaslab_set_fragmentation(metaslab_t *msp)
1710258717Savg{
1711269118Sdelphij spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1712269118Sdelphij uint64_t fragmentation = 0;
1713269118Sdelphij uint64_t total = 0;
1714269118Sdelphij boolean_t feature_enabled = spa_feature_is_enabled(spa,
1715269118Sdelphij SPA_FEATURE_SPACEMAP_HISTOGRAM);
1716168404Spjd
1717321529Smav if (!feature_enabled) {
1718321529Smav msp->ms_fragmentation = ZFS_FRAG_INVALID;
1719321529Smav return;
1720321529Smav }
1721269118Sdelphij
1722258717Savg /*
1723269118Sdelphij * A null space map means that the entire metaslab is free
1724269118Sdelphij * and thus is not fragmented.
1725258717Savg */
1726321529Smav if (msp->ms_sm == NULL) {
1727321529Smav msp->ms_fragmentation = 0;
1728321529Smav return;
1729321529Smav }
1730269118Sdelphij
1731269118Sdelphij /*
1732321529Smav * If this metaslab's space map has not been upgraded, flag it
1733269118Sdelphij * so that we upgrade next time we encounter it.
1734269118Sdelphij */ 1735269118Sdelphij if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1736269118Sdelphij uint64_t txg = spa_syncing_txg(spa); 1737258717Savg vdev_t *vd = msp->ms_group->mg_vd; 1738258717Savg 1739321554Smav /* 1740321554Smav * If we've reached the final dirty txg, then we must 1741321554Smav * be shutting down the pool. We don't want to dirty 1742321554Smav * any data past this point so skip setting the condense 1743321554Smav * flag. We can retry this action the next time the pool 1744321554Smav * is imported. 1745321554Smav */ 1746321554Smav if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 1747272504Sdelphij msp->ms_condense_wanted = B_TRUE; 1748272504Sdelphij vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1749339108Smav zfs_dbgmsg("txg %llu, requesting force condense: " 1750321554Smav "ms_id %llu, vdev_id %llu", txg, msp->ms_id, 1751321554Smav vd->vdev_id); 1752272504Sdelphij } 1753321529Smav msp->ms_fragmentation = ZFS_FRAG_INVALID; 1754321529Smav return; 1755258717Savg } 1756258717Savg 1757269118Sdelphij for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1758269118Sdelphij uint64_t space = 0; 1759269118Sdelphij uint8_t shift = msp->ms_sm->sm_shift; 1760321529Smav 1761269118Sdelphij int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1762269118Sdelphij FRAGMENTATION_TABLE_SIZE - 1); 1763258717Savg 1764258717Savg if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1765258717Savg continue; 1766258717Savg 1767269118Sdelphij space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1768269118Sdelphij total += space; 1769269118Sdelphij 1770269118Sdelphij ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1771269118Sdelphij fragmentation += space * zfs_frag_table[idx]; 1772258717Savg } 1773269118Sdelphij 1774269118Sdelphij if (total > 0) 1775269118Sdelphij fragmentation /= total; 1776269118Sdelphij ASSERT3U(fragmentation, <=, 100); 1777321529Smav 1778321529Smav msp->ms_fragmentation = fragmentation; 1779258717Savg} 1780258717Savg 1781269118Sdelphij/* 1782269118Sdelphij * Compute a weight -- a selection preference value -- for the given metaslab. 1783269118Sdelphij * This is based on the amount of free space, the level of fragmentation, 1784269118Sdelphij * the LBA range, and whether the metaslab is loaded. 1785269118Sdelphij */ 1786168404Spjdstatic uint64_t 1787321529Smavmetaslab_space_weight(metaslab_t *msp) 1788168404Spjd{ 1789168404Spjd metaslab_group_t *mg = msp->ms_group; 1790168404Spjd vdev_t *vd = mg->mg_vd; 1791168404Spjd uint64_t weight, space; 1792168404Spjd 1793168404Spjd ASSERT(MUTEX_HELD(&msp->ms_lock)); 1794321529Smav ASSERT(!vd->vdev_removing); 1795168404Spjd 1796168404Spjd /* 1797168404Spjd * The baseline weight is the metaslab's free space. 1798168404Spjd */ 1799258717Savg space = msp->ms_size - space_map_allocated(msp->ms_sm); 1800269118Sdelphij 1801269118Sdelphij if (metaslab_fragmentation_factor_enabled && 1802269118Sdelphij msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1803269118Sdelphij /* 1804269118Sdelphij * Use the fragmentation information to inversely scale 1805269118Sdelphij * down the baseline weight. We need to ensure that we 1806269118Sdelphij * don't exclude this metaslab completely when it's 100% 1807269118Sdelphij * fragmented. To avoid this we reduce the fragmented value 1808269118Sdelphij * by 1. 
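 *
 * As an illustration (hypothetical numbers): with 10GB free and a
 * fragmentation metric of 70%, the baseline weight becomes
 * 10GB * (100 - 69) / 100, i.e. roughly 3.1GB.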
1809269118Sdelphij */ 1810269118Sdelphij space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1811269118Sdelphij 1812269118Sdelphij /* 1813269118Sdelphij * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1814269118Sdelphij * this metaslab again. The fragmentation metric may have 1815269118Sdelphij * decreased the space to something smaller than 1816269118Sdelphij * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1817269118Sdelphij * so that we can consume any remaining space. 1818269118Sdelphij */ 1819269118Sdelphij if (space > 0 && space < SPA_MINBLOCKSIZE) 1820269118Sdelphij space = SPA_MINBLOCKSIZE; 1821269118Sdelphij } 1822168404Spjd weight = space; 1823168404Spjd 1824168404Spjd /* 1825168404Spjd * Modern disks have uniform bit density and constant angular velocity. 1826168404Spjd * Therefore, the outer recording zones are faster (higher bandwidth) 1827168404Spjd * than the inner zones by the ratio of outer to inner track diameter, 1828168404Spjd * which is typically around 2:1. We account for this by assigning 1829168404Spjd * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1830168404Spjd * In effect, this means that we'll select the metaslab with the most 1831168404Spjd * free bandwidth rather than simply the one with the most free space. 1832168404Spjd */ 1833346131Smav if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { 1834269118Sdelphij weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1835269118Sdelphij ASSERT(weight >= space && weight <= 2 * space); 1836269118Sdelphij } 1837168404Spjd 1838269118Sdelphij /* 1839269118Sdelphij * If this metaslab is one we're actively using, adjust its 1840269118Sdelphij * weight to make it preferable to any inactive metaslab so 1841269118Sdelphij * we'll polish it off. If the fragmentation on this metaslab 1842269118Sdelphij * has exceed our threshold, then don't mark it active. 1843269118Sdelphij */ 1844269118Sdelphij if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1845269118Sdelphij msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1846211931Smm weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1847211931Smm } 1848258717Savg 1849321529Smav WEIGHT_SET_SPACEBASED(weight); 1850211931Smm return (weight); 1851211931Smm} 1852211931Smm 1853321529Smav/* 1854321529Smav * Return the weight of the specified metaslab, according to the segment-based 1855321529Smav * weighting algorithm. The metaslab must be loaded. This function can 1856321529Smav * be called within a sync pass since it relies only on the metaslab's 1857321529Smav * range tree which is always accurate when the metaslab is loaded. 1858321529Smav */ 1859321529Smavstatic uint64_t 1860321529Smavmetaslab_weight_from_range_tree(metaslab_t *msp) 1861321529Smav{ 1862321529Smav uint64_t weight = 0; 1863321529Smav uint32_t segments = 0; 1864321529Smav 1865321529Smav ASSERT(msp->ms_loaded); 1866321529Smav 1867321529Smav for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 1868321529Smav i--) { 1869321529Smav uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 1870321529Smav int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 1871321529Smav 1872321529Smav segments <<= 1; 1873332547Smav segments += msp->ms_allocatable->rt_histogram[i]; 1874321529Smav 1875321529Smav /* 1876321529Smav * The range tree provides more precision than the space map 1877321529Smav * and must be downgraded so that all values fit within the 1878321529Smav * space map's histogram. 
This allows us to compare loaded 1879321529Smav * vs. unloaded metaslabs to determine which metaslab is 1880321529Smav * considered "best". 1881321529Smav */ 1882321529Smav if (i > max_idx) 1883321529Smav continue; 1884321529Smav 1885321529Smav if (segments != 0) { 1886321529Smav WEIGHT_SET_COUNT(weight, segments); 1887321529Smav WEIGHT_SET_INDEX(weight, i); 1888321529Smav WEIGHT_SET_ACTIVE(weight, 0); 1889321529Smav break; 1890321529Smav } 1891321529Smav } 1892321529Smav return (weight); 1893321529Smav} 1894321529Smav 1895321529Smav/* 1896321529Smav * Calculate the weight based on the on-disk histogram. This should only 1897321529Smav * be called after a sync pass has completely finished since the on-disk 1898321529Smav * information is updated in metaslab_sync(). 1899321529Smav */ 1900321529Smavstatic uint64_t 1901321529Smavmetaslab_weight_from_spacemap(metaslab_t *msp) 1902321529Smav{ 1903321529Smav uint64_t weight = 0; 1904321529Smav 1905321529Smav for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 1906321529Smav if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) { 1907321529Smav WEIGHT_SET_COUNT(weight, 1908321529Smav msp->ms_sm->sm_phys->smp_histogram[i]); 1909321529Smav WEIGHT_SET_INDEX(weight, i + 1910321529Smav msp->ms_sm->sm_shift); 1911321529Smav WEIGHT_SET_ACTIVE(weight, 0); 1912321529Smav break; 1913321529Smav } 1914321529Smav } 1915321529Smav return (weight); 1916321529Smav} 1917321529Smav 1918321529Smav/* 1919321529Smav * Compute a segment-based weight for the specified metaslab. The weight 1920321529Smav * is determined by highest bucket in the histogram. The information 1921321529Smav * for the highest bucket is encoded into the weight value. 1922321529Smav */ 1923321529Smavstatic uint64_t 1924321529Smavmetaslab_segment_weight(metaslab_t *msp) 1925321529Smav{ 1926321529Smav metaslab_group_t *mg = msp->ms_group; 1927321529Smav uint64_t weight = 0; 1928321529Smav uint8_t shift = mg->mg_vd->vdev_ashift; 1929321529Smav 1930321529Smav ASSERT(MUTEX_HELD(&msp->ms_lock)); 1931321529Smav 1932321529Smav /* 1933321529Smav * The metaslab is completely free. 1934321529Smav */ 1935321529Smav if (space_map_allocated(msp->ms_sm) == 0) { 1936321529Smav int idx = highbit64(msp->ms_size) - 1; 1937321529Smav int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 1938321529Smav 1939321529Smav if (idx < max_idx) { 1940321529Smav WEIGHT_SET_COUNT(weight, 1ULL); 1941321529Smav WEIGHT_SET_INDEX(weight, idx); 1942321529Smav } else { 1943321529Smav WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 1944321529Smav WEIGHT_SET_INDEX(weight, max_idx); 1945321529Smav } 1946321529Smav WEIGHT_SET_ACTIVE(weight, 0); 1947321529Smav ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 1948321529Smav 1949321529Smav return (weight); 1950321529Smav } 1951321529Smav 1952321529Smav ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 1953321529Smav 1954321529Smav /* 1955321529Smav * If the metaslab is fully allocated then just make the weight 0. 1956321529Smav */ 1957321529Smav if (space_map_allocated(msp->ms_sm) == msp->ms_size) 1958321529Smav return (0); 1959321529Smav /* 1960321529Smav * If the metaslab is already loaded, then use the range tree to 1961321529Smav * determine the weight. Otherwise, we rely on the space map information 1962321529Smav * to generate the weight. 
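 *
 * For example (hypothetical numbers): if the highest non-empty bucket
 * in the loaded metaslab's histogram is index 17 (128K-256K segments)
 * and it holds 1000 segments, the resulting weight encodes a count of
 * 1000, an index of 17, and an active field of 0.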
1963321529Smav */ 1964321529Smav if (msp->ms_loaded) { 1965321529Smav weight = metaslab_weight_from_range_tree(msp); 1966321529Smav } else { 1967321529Smav weight = metaslab_weight_from_spacemap(msp); 1968321529Smav } 1969321529Smav 1970321529Smav /* 1971321529Smav * If the metaslab was active the last time we calculated its weight 1972321529Smav * then keep it active. We want to consume the entire region that 1973321529Smav * is associated with this weight. 1974321529Smav */ 1975321529Smav if (msp->ms_activation_weight != 0 && weight != 0) 1976321529Smav WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 1977321529Smav return (weight); 1978321529Smav} 1979321529Smav 1980321529Smav/* 1981321529Smav * Determine if we should attempt to allocate from this metaslab. If the 1982321529Smav * metaslab has a maximum size then we can quickly determine if the desired 1983321529Smav * allocation size can be satisfied. Otherwise, if we're using segment-based 1984321529Smav * weighting then we can determine the maximum allocation that this metaslab 1985321529Smav * can accommodate based on the index encoded in the weight. If we're using 1986321529Smav * space-based weights then rely on the entire weight (excluding the weight 1987321529Smav * type bit). 1988321529Smav */ 1989321529Smavboolean_t 1990321529Smavmetaslab_should_allocate(metaslab_t *msp, uint64_t asize) 1991321529Smav{ 1992321529Smav boolean_t should_allocate; 1993321529Smav 1994321529Smav if (msp->ms_max_size != 0) 1995321529Smav return (msp->ms_max_size >= asize); 1996321529Smav 1997321529Smav if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 1998321529Smav /* 1999321529Smav * The metaslab segment weight indicates segments in the 2000321529Smav * range [2^i, 2^(i+1)), where i is the index in the weight. 2001321529Smav * Since the asize might be in the middle of the range, we 2002321529Smav * should attempt the allocation if asize < 2^(i+1). 2003321529Smav */ 2004321529Smav should_allocate = (asize < 2005321529Smav 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 2006321529Smav } else { 2007321529Smav should_allocate = (asize <= 2008321529Smav (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 2009321529Smav } 2010321529Smav return (should_allocate); 2011321529Smav} 2012321529Smav 2013321529Smavstatic uint64_t 2014321529Smavmetaslab_weight(metaslab_t *msp) 2015321529Smav{ 2016321529Smav vdev_t *vd = msp->ms_group->mg_vd; 2017321529Smav spa_t *spa = vd->vdev_spa; 2018321529Smav uint64_t weight; 2019321529Smav 2020321529Smav ASSERT(MUTEX_HELD(&msp->ms_lock)); 2021321529Smav 2022321529Smav /* 2023332525Smav * If this vdev is in the process of being removed, there is nothing 2024321529Smav * for us to do here. 2025321529Smav */ 2026332525Smav if (vd->vdev_removing) 2027321529Smav return (0); 2028321529Smav 2029321529Smav metaslab_set_fragmentation(msp); 2030321529Smav 2031321529Smav /* 2032321529Smav * Update the maximum size if the metaslab is loaded. This will 2033321529Smav * ensure that we get an accurate maximum size if newly freed space 2034321529Smav * has been added back into the free tree. 2035321529Smav */ 2036321529Smav if (msp->ms_loaded) 2037321529Smav msp->ms_max_size = metaslab_block_maxsize(msp); 2038321529Smav 2039321529Smav /* 2040321529Smav * Segment-based weighting requires space map histogram support. 
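 * If segment-based weighting is disabled, the histogram feature is not
 * enabled, or this metaslab's space map has not been upgraded to the
 * histogram format, we fall back to the space-based weight below.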
2041321529Smav */ 2042321529Smav if (zfs_metaslab_segment_weight_enabled && 2043321529Smav spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && 2044321529Smav (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == 2045321529Smav sizeof (space_map_phys_t))) { 2046321529Smav weight = metaslab_segment_weight(msp); 2047321529Smav } else { 2048321529Smav weight = metaslab_space_weight(msp); 2049321529Smav } 2050321529Smav return (weight); 2051321529Smav} 2052321529Smav 2053168404Spjdstatic int 2054339105Smavmetaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, 2055339105Smav int allocator, uint64_t activation_weight) 2056168404Spjd{ 2057339105Smav /* 2058339105Smav * If we're activating for the claim code, we don't want to actually 2059339105Smav * set the metaslab up for a specific allocator. 2060339105Smav */ 2061339105Smav if (activation_weight == METASLAB_WEIGHT_CLAIM) 2062339105Smav return (0); 2063339105Smav metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? 2064339105Smav mg->mg_primaries : mg->mg_secondaries); 2065339105Smav 2066168404Spjd ASSERT(MUTEX_HELD(&msp->ms_lock)); 2067339105Smav mutex_enter(&mg->mg_lock); 2068339105Smav if (arr[allocator] != NULL) { 2069339105Smav mutex_exit(&mg->mg_lock); 2070339105Smav return (EEXIST); 2071339105Smav } 2072168404Spjd 2073339105Smav arr[allocator] = msp; 2074339105Smav ASSERT3S(msp->ms_allocator, ==, -1); 2075339105Smav msp->ms_allocator = allocator; 2076339105Smav msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); 2077339105Smav mutex_exit(&mg->mg_lock); 2078339105Smav 2079339105Smav return (0); 2080339105Smav} 2081339105Smav 2082339105Smavstatic int 2083339105Smavmetaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) 2084339105Smav{ 2085339105Smav ASSERT(MUTEX_HELD(&msp->ms_lock)); 2086339105Smav 2087168404Spjd if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 2088339105Smav int error = 0; 2089258717Savg metaslab_load_wait(msp); 2090258717Savg if (!msp->ms_loaded) { 2091339105Smav if ((error = metaslab_load(msp)) != 0) { 2092219089Spjd metaslab_group_sort(msp->ms_group, msp, 0); 2093219089Spjd return (error); 2094219089Spjd } 2095168404Spjd } 2096339105Smav if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { 2097339105Smav /* 2098339105Smav * The metaslab was activated for another allocator 2099339105Smav * while we were waiting, we should reselect. 
2100339105Smav */
2101339105Smav return (EBUSY);
2102339105Smav }
2103339105Smav if ((error = metaslab_activate_allocator(msp->ms_group, msp,
2104339105Smav allocator, activation_weight)) != 0) {
2105339105Smav return (error);
2106339105Smav }
2107209962Smm
2108321529Smav msp->ms_activation_weight = msp->ms_weight;
2109168404Spjd metaslab_group_sort(msp->ms_group, msp,
2110168404Spjd msp->ms_weight | activation_weight);
2111168404Spjd }
2112258717Savg ASSERT(msp->ms_loaded);
2113168404Spjd ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
2114168404Spjd
2115168404Spjd return (0);
2116168404Spjd}
2117168404Spjd
2118168404Spjdstatic void
2119339105Smavmetaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2120339105Smav uint64_t weight)
2121339105Smav{
2122339105Smav ASSERT(MUTEX_HELD(&msp->ms_lock));
2123339105Smav if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
2124339105Smav metaslab_group_sort(mg, msp, weight);
2125339105Smav return;
2126339105Smav }
2127339105Smav
2128339105Smav mutex_enter(&mg->mg_lock);
2129339105Smav ASSERT3P(msp->ms_group, ==, mg);
2130339105Smav if (msp->ms_primary) {
2131339105Smav ASSERT3U(0, <=, msp->ms_allocator);
2132339105Smav ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
2133339105Smav ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
2134339105Smav ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
2135339105Smav mg->mg_primaries[msp->ms_allocator] = NULL;
2136339105Smav } else {
2137339105Smav ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
2138339105Smav ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
2139339105Smav mg->mg_secondaries[msp->ms_allocator] = NULL;
2140339105Smav }
2141339105Smav msp->ms_allocator = -1;
2142339105Smav metaslab_group_sort_impl(mg, msp, weight);
2143339105Smav mutex_exit(&mg->mg_lock);
2144339105Smav}
2145339105Smav
2146339105Smavstatic void
2147321529Smavmetaslab_passivate(metaslab_t *msp, uint64_t weight)
2148168404Spjd{
2149321529Smav uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
2150321529Smav
2151168404Spjd /*
2152168404Spjd * If size < SPA_MINBLOCKSIZE, then we will not allocate from
2153168404Spjd * this metaslab again. In that case, it had better be empty,
2154168404Spjd * or we would be leaving space on the table.
2155168404Spjd */
2156321529Smav ASSERT(size >= SPA_MINBLOCKSIZE ||
2157332547Smav range_tree_is_empty(msp->ms_allocatable));
2158321529Smav ASSERT0(weight & METASLAB_ACTIVE_MASK);
2159321529Smav
2160321529Smav msp->ms_activation_weight = 0;
2161339105Smav metaslab_passivate_allocator(msp->ms_group, msp, weight);
2162168404Spjd ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
2163168404Spjd}
2164168404Spjd
2165321529Smav/*
2166321529Smav * Segment-based metaslabs are activated once and remain active until
2167321529Smav * we either fail an allocation attempt (similar to space-based metaslabs)
2168321529Smav * or have exhausted the free space in zfs_metaslab_switch_threshold
2169321529Smav * buckets since the metaslab was activated. This function checks to see
2170321529Smav * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
2171321529Smav * metaslab and passivates it proactively. This will allow us to select a
2172321529Smav * metaslab with a larger contiguous region, if any remain within this
2173321529Smav * metaslab group. If we're in sync pass > 1, then we continue using this
2174321529Smav * metaslab so that we don't dirty more blocks and cause more sync passes.
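 *
 * For instance (hypothetical numbers): with zfs_metaslab_switch_threshold
 * set to 2, a metaslab activated when its largest bucket index was 17
 * (128K segments) is passivated once the in-core histogram shows nothing
 * above index 15 (32K segments).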
2175321529Smav */ 2176321529Smavvoid 2177321529Smavmetaslab_segment_may_passivate(metaslab_t *msp) 2178321529Smav{ 2179321529Smav spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2180321529Smav 2181321529Smav if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 2182321529Smav return; 2183321529Smav 2184321529Smav /* 2185321529Smav * Since we are in the middle of a sync pass, the most accurate 2186321529Smav * information that is accessible to us is the in-core range tree 2187321529Smav * histogram; calculate the new weight based on that information. 2188321529Smav */ 2189321529Smav uint64_t weight = metaslab_weight_from_range_tree(msp); 2190321529Smav int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 2191321529Smav int current_idx = WEIGHT_GET_INDEX(weight); 2192321529Smav 2193321529Smav if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 2194321529Smav metaslab_passivate(msp, weight); 2195321529Smav} 2196321529Smav 2197258717Savgstatic void 2198258717Savgmetaslab_preload(void *arg) 2199258717Savg{ 2200258717Savg metaslab_t *msp = arg; 2201258717Savg spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2202258717Savg 2203268086Sdelphij ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 2204268086Sdelphij 2205258717Savg mutex_enter(&msp->ms_lock); 2206258717Savg metaslab_load_wait(msp); 2207258717Savg if (!msp->ms_loaded) 2208258717Savg (void) metaslab_load(msp); 2209321529Smav msp->ms_selected_txg = spa_syncing_txg(spa); 2210258717Savg mutex_exit(&msp->ms_lock); 2211258717Savg} 2212258717Savg 2213258717Savgstatic void 2214258717Savgmetaslab_group_preload(metaslab_group_t *mg) 2215258717Savg{ 2216258717Savg spa_t *spa = mg->mg_vd->vdev_spa; 2217258717Savg metaslab_t *msp; 2218258717Savg avl_tree_t *t = &mg->mg_metaslab_tree; 2219258717Savg int m = 0; 2220258717Savg 2221258717Savg if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 2222258717Savg taskq_wait(mg->mg_taskq); 2223258717Savg return; 2224258717Savg } 2225268086Sdelphij 2226258717Savg mutex_enter(&mg->mg_lock); 2227332525Smav 2228258717Savg /* 2229268086Sdelphij * Load the next potential metaslabs 2230258717Savg */ 2231321529Smav for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 2232332525Smav ASSERT3P(msp->ms_group, ==, mg); 2233332525Smav 2234269118Sdelphij /* 2235269118Sdelphij * We preload only the maximum number of metaslabs specified 2236269118Sdelphij * by metaslab_preload_limit. If a metaslab is being forced 2237269118Sdelphij * to condense then we preload it too. This will ensure 2238269118Sdelphij * that force condensing happens in the next txg. 2239269118Sdelphij */ 2240269118Sdelphij if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 2241269118Sdelphij continue; 2242269118Sdelphij } 2243258717Savg 2244258717Savg VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 2245258717Savg msp, TQ_SLEEP) != 0); 2246258717Savg } 2247258717Savg mutex_exit(&mg->mg_lock); 2248258717Savg} 2249258717Savg 2250168404Spjd/* 2251258717Savg * Determine if the space map's on-disk footprint is past our tolerance 2252258717Savg * for inefficiency. We would like to use the following criteria to make 2253258717Savg * our decision: 2254247398Smm * 2255247398Smm * 1. The size of the space map object should not dramatically increase as a 2256258717Savg * result of writing out the free space range tree. 2257247398Smm * 2258247398Smm * 2. 
The minimal on-disk space map representation is zfs_condense_pct/100 2259258717Savg * times the size than the free space range tree representation 2260332525Smav * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 2261247398Smm * 2262268855Sdelphij * 3. The on-disk size of the space map should actually decrease. 2263268855Sdelphij * 2264268855Sdelphij * Unfortunately, we cannot compute the on-disk size of the space map in this 2265268855Sdelphij * context because we cannot accurately compute the effects of compression, etc. 2266268855Sdelphij * Instead, we apply the heuristic described in the block comment for 2267268855Sdelphij * zfs_metaslab_condense_block_threshold - we only condense if the space used 2268268855Sdelphij * is greater than a threshold number of blocks. 2269247398Smm */ 2270247398Smmstatic boolean_t 2271247398Smmmetaslab_should_condense(metaslab_t *msp) 2272247398Smm{ 2273258717Savg space_map_t *sm = msp->ms_sm; 2274332547Smav vdev_t *vd = msp->ms_group->mg_vd; 2275332547Smav uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 2276332547Smav uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); 2277247398Smm 2278247398Smm ASSERT(MUTEX_HELD(&msp->ms_lock)); 2279258717Savg ASSERT(msp->ms_loaded); 2280247398Smm 2281247398Smm /* 2282332547Smav * Allocations and frees in early passes are generally more space 2283332547Smav * efficient (in terms of blocks described in space map entries) 2284332547Smav * than the ones in later passes (e.g. we don't compress after 2285332547Smav * sync pass 5) and condensing a metaslab multiple times in a txg 2286332547Smav * could degrade performance. 2287332547Smav * 2288332547Smav * Thus we prefer condensing each metaslab at most once every txg at 2289332547Smav * the earliest sync pass possible. If a metaslab is eligible for 2290332547Smav * condensing again after being considered for condensing within the 2291332547Smav * same txg, it will hopefully be dirty in the next txg where it will 2292332547Smav * be condensed at an earlier pass. 2293247398Smm */ 2294332547Smav if (msp->ms_condense_checked_txg == current_txg) 2295332547Smav return (B_FALSE); 2296332547Smav msp->ms_condense_checked_txg = current_txg; 2297332547Smav 2298332547Smav /* 2299339104Smav * We always condense metaslabs that are empty and metaslabs for 2300339104Smav * which a condense request has been made. 2301332547Smav */ 2302339104Smav if (avl_is_empty(&msp->ms_allocatable_by_size) || 2303339104Smav msp->ms_condense_wanted) 2304247398Smm return (B_TRUE); 2305247398Smm 2306339104Smav uint64_t object_size = space_map_length(msp->ms_sm); 2307339104Smav uint64_t optimal_size = space_map_estimate_optimal_size(sm, 2308339104Smav msp->ms_allocatable, SM_NO_VDEVID); 2309247398Smm 2310339104Smav dmu_object_info_t doi; 2311268855Sdelphij dmu_object_info_from_db(sm->sm_dbuf, &doi); 2312339104Smav uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 2313268855Sdelphij 2314339104Smav return (object_size >= (optimal_size * zfs_condense_pct / 100) && 2315268855Sdelphij object_size > zfs_metaslab_condense_block_threshold * record_size); 2316247398Smm} 2317247398Smm 2318247398Smm/* 2319247398Smm * Condense the on-disk space map representation to its minimized form. 2320247398Smm * The minimized form consists of a small number of allocations followed by 2321258717Savg * the entries of the free range tree. 
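 *
 * Concretely (a sketch of the intent, not an on-disk format guarantee):
 * the rewritten space map begins with ALLOC records covering the
 * metaslab minus any pending or deferred frees, followed by one FREE
 * record per segment currently in ms_allocatable.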
2322247398Smm */
2323247398Smmstatic void
2324247398Smmmetaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2325247398Smm{
2326258717Savg range_tree_t *condense_tree;
2327258717Savg space_map_t *sm = msp->ms_sm;
2328247398Smm
2329247398Smm ASSERT(MUTEX_HELD(&msp->ms_lock));
2330258717Savg ASSERT(msp->ms_loaded);
2331247398Smm
2332332547Smav zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2333289307Smav "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2334289307Smav msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2335289307Smav msp->ms_group->mg_vd->vdev_spa->spa_name,
2336332547Smav space_map_length(msp->ms_sm),
2337332547Smav avl_numnodes(&msp->ms_allocatable->rt_root),
2338269118Sdelphij msp->ms_condense_wanted ? "TRUE" : "FALSE");
2339247398Smm
2340269118Sdelphij msp->ms_condense_wanted = B_FALSE;
2341269118Sdelphij
2342247398Smm /*
2343258717Savg * Create a range tree that is 100% allocated. We remove segments
2344247398Smm * that have been freed in this txg, any deferred frees that exist,
2345247398Smm * and any allocation in the future. Removing segments should be
2346258717Savg * a relatively inexpensive operation since we expect these trees to
2347258717Savg * have a small number of nodes.
2348247398Smm */
2349332525Smav condense_tree = range_tree_create(NULL, NULL);
2350258717Savg range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2351247398Smm
2352332547Smav range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
2353332547Smav range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
2354247398Smm
2355258717Savg for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2356332547Smav range_tree_walk(msp->ms_defer[t],
2357258717Savg range_tree_remove, condense_tree);
2358258717Savg }
2359247398Smm
2360258717Savg for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2361332547Smav range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
2362258717Savg range_tree_remove, condense_tree);
2363258717Savg }
2364247398Smm
2365247398Smm /*
2366247398Smm * We're about to drop the metaslab's lock, thus allowing
2367247398Smm * other consumers to change its content. Set the
2368258717Savg * metaslab's ms_condensing flag to ensure that
2369247398Smm * allocations on this metaslab do not occur while we're
2370247398Smm * in the middle of committing it to disk. This is only critical
2371332547Smav * for ms_allocatable as all other range trees use per txg
2372247398Smm * views of their content.
2373247398Smm */
2374258717Savg msp->ms_condensing = B_TRUE;
2375247398Smm
2376247398Smm mutex_exit(&msp->ms_lock);
2377332547Smav space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
2378247398Smm
2379247398Smm /*
2380321529Smav * While we would ideally like to create a space map representation
2381247398Smm * that consists only of allocation records, doing so can be
2382258717Savg * prohibitively expensive because the in-core free tree can be
2383247398Smm * large, and therefore computationally expensive to subtract
2384258717Savg * from the condense_tree. Instead we sync out two trees, a cheap
2385258717Savg * allocation only tree followed by the in-core free tree. While not
2386247398Smm * optimal, this is typically close to optimal, and much cheaper to
2387247398Smm * compute.
2388247398Smm */ 2389339104Smav space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); 2390258717Savg range_tree_vacate(condense_tree, NULL, NULL); 2391258717Savg range_tree_destroy(condense_tree); 2392247398Smm 2393339104Smav space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 2394332525Smav mutex_enter(&msp->ms_lock); 2395258717Savg msp->ms_condensing = B_FALSE; 2396247398Smm} 2397247398Smm 2398247398Smm/* 2399168404Spjd * Write a metaslab to disk in the context of the specified transaction group. 2400168404Spjd */ 2401168404Spjdvoid 2402168404Spjdmetaslab_sync(metaslab_t *msp, uint64_t txg) 2403168404Spjd{ 2404258717Savg metaslab_group_t *mg = msp->ms_group; 2405258717Savg vdev_t *vd = mg->mg_vd; 2406168404Spjd spa_t *spa = vd->vdev_spa; 2407219089Spjd objset_t *mos = spa_meta_objset(spa); 2408332547Smav range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 2409168404Spjd dmu_tx_t *tx; 2410258717Savg uint64_t object = space_map_object(msp->ms_sm); 2411168404Spjd 2412219089Spjd ASSERT(!vd->vdev_ishole); 2413168404Spjd 2414247398Smm /* 2415247398Smm * This metaslab has just been added so there's no work to do now. 2416247398Smm */ 2417332547Smav if (msp->ms_freeing == NULL) { 2418258717Savg ASSERT3P(alloctree, ==, NULL); 2419219089Spjd return; 2420247398Smm } 2421219089Spjd 2422258717Savg ASSERT3P(alloctree, !=, NULL); 2423332547Smav ASSERT3P(msp->ms_freeing, !=, NULL); 2424332547Smav ASSERT3P(msp->ms_freed, !=, NULL); 2425332547Smav ASSERT3P(msp->ms_checkpointing, !=, NULL); 2426247398Smm 2427269118Sdelphij /* 2428332547Smav * Normally, we don't want to process a metaslab if there are no 2429332547Smav * allocations or frees to perform. However, if the metaslab is being 2430332547Smav * forced to condense and it's loaded, we need to let it through. 2431269118Sdelphij */ 2432332547Smav if (range_tree_is_empty(alloctree) && 2433332547Smav range_tree_is_empty(msp->ms_freeing) && 2434332547Smav range_tree_is_empty(msp->ms_checkpointing) && 2435321554Smav !(msp->ms_loaded && msp->ms_condense_wanted)) 2436247398Smm return; 2437247398Smm 2438321554Smav 2439321554Smav VERIFY(txg <= spa_final_dirty_txg(spa)); 2440321554Smav 2441168404Spjd /* 2442168404Spjd * The only state that can actually be changing concurrently with 2443332547Smav * metaslab_sync() is the metaslab's ms_allocatable. No other 2444332547Smav * thread can be modifying this txg's alloc, freeing, 2445332547Smav * freed, or space_map_phys_t. We drop ms_lock whenever we 2446332547Smav * could call into the DMU, because the DMU can call down to us 2447332525Smav * (e.g. via zio_free()) at any time. 2448332525Smav * 2449332525Smav * The spa_vdev_remove_thread() can be reading metaslab state 2450332525Smav * concurrently, and it is locked out by the ms_sync_lock. Note 2451332525Smav * that the ms_lock is insufficient for this, because it is dropped 2452332525Smav * by space_map_write(). 
2453168404Spjd */ 2454219089Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2455219089Spjd 2456258717Savg if (msp->ms_sm == NULL) { 2457258717Savg uint64_t new_object; 2458258717Savg 2459332547Smav new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); 2460258717Savg VERIFY3U(new_object, !=, 0); 2461258717Savg 2462258717Savg VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 2463332525Smav msp->ms_start, msp->ms_size, vd->vdev_ashift)); 2464258717Savg ASSERT(msp->ms_sm != NULL); 2465168404Spjd } 2466168404Spjd 2467332547Smav if (!range_tree_is_empty(msp->ms_checkpointing) && 2468332547Smav vd->vdev_checkpoint_sm == NULL) { 2469332547Smav ASSERT(spa_has_checkpoint(spa)); 2470332547Smav 2471332547Smav uint64_t new_object = space_map_alloc(mos, 2472332547Smav vdev_standard_sm_blksz, tx); 2473332547Smav VERIFY3U(new_object, !=, 0); 2474332547Smav 2475332547Smav VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 2476332547Smav mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 2477332547Smav ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2478332547Smav 2479332547Smav /* 2480332547Smav * We save the space map object as an entry in vdev_top_zap 2481332547Smav * so it can be retrieved when the pool is reopened after an 2482332547Smav * export or through zdb. 2483332547Smav */ 2484332547Smav VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 2485332547Smav vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 2486332547Smav sizeof (new_object), 1, &new_object, tx)); 2487332547Smav } 2488332547Smav 2489332525Smav mutex_enter(&msp->ms_sync_lock); 2490219089Spjd mutex_enter(&msp->ms_lock); 2491219089Spjd 2492272504Sdelphij /* 2493321529Smav * Note: metaslab_condense() clears the space map's histogram. 2494272504Sdelphij * Therefore we must verify and remove this histogram before 2495272504Sdelphij * condensing. 2496272504Sdelphij */ 2497272504Sdelphij metaslab_group_histogram_verify(mg); 2498272504Sdelphij metaslab_class_histogram_verify(mg->mg_class); 2499272504Sdelphij metaslab_group_histogram_remove(mg, msp); 2500272504Sdelphij 2501332547Smav if (msp->ms_loaded && metaslab_should_condense(msp)) { 2502247398Smm metaslab_condense(msp, txg, tx); 2503247398Smm } else { 2504332525Smav mutex_exit(&msp->ms_lock); 2505339104Smav space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 2506339104Smav SM_NO_VDEVID, tx); 2507339104Smav space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 2508339104Smav SM_NO_VDEVID, tx); 2509332525Smav mutex_enter(&msp->ms_lock); 2510247398Smm } 2511168404Spjd 2512332547Smav if (!range_tree_is_empty(msp->ms_checkpointing)) { 2513332547Smav ASSERT(spa_has_checkpoint(spa)); 2514332547Smav ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2515332547Smav 2516332547Smav /* 2517332547Smav * Since we are doing writes to disk and the ms_checkpointing 2518332547Smav * tree won't be changing during that time, we drop the 2519332547Smav * ms_lock while writing to the checkpoint space map. 
2520332547Smav */ 2521332547Smav mutex_exit(&msp->ms_lock); 2522332547Smav space_map_write(vd->vdev_checkpoint_sm, 2523339104Smav msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 2524332547Smav mutex_enter(&msp->ms_lock); 2525332547Smav space_map_update(vd->vdev_checkpoint_sm); 2526332547Smav 2527332547Smav spa->spa_checkpoint_info.sci_dspace += 2528332547Smav range_tree_space(msp->ms_checkpointing); 2529332547Smav vd->vdev_stat.vs_checkpoint_space += 2530332547Smav range_tree_space(msp->ms_checkpointing); 2531332547Smav ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 2532332547Smav -vd->vdev_checkpoint_sm->sm_alloc); 2533332547Smav 2534332547Smav range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 2535332547Smav } 2536332547Smav 2537258717Savg if (msp->ms_loaded) { 2538258717Savg /* 2539332525Smav * When the space map is loaded, we have an accurate 2540258717Savg * histogram in the range tree. This gives us an opportunity 2541258717Savg * to bring the space map's histogram up-to-date so we clear 2542258717Savg * it first before updating it. 2543258717Savg */ 2544258717Savg space_map_histogram_clear(msp->ms_sm); 2545332547Smav space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 2546321529Smav 2547258717Savg /* 2548321529Smav * Since we've cleared the histogram we need to add back 2549321529Smav * any free space that has already been processed, plus 2550321529Smav * any deferred space. This allows the on-disk histogram 2551321529Smav * to accurately reflect all free space even if some space 2552321529Smav * is not yet available for allocation (i.e. deferred). 2553258717Savg */ 2554332547Smav space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 2555321529Smav 2556321529Smav /* 2557321529Smav * Add back any deferred free space that has not been 2558321529Smav * added back into the in-core free tree yet. This will 2559321529Smav * ensure that we don't end up with a space map histogram 2560321529Smav * that is completely empty unless the metaslab is fully 2561321529Smav * allocated. 2562321529Smav */ 2563321529Smav for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2564321529Smav space_map_histogram_add(msp->ms_sm, 2565332547Smav msp->ms_defer[t], tx); 2566321529Smav } 2567258717Savg } 2568321529Smav 2569321529Smav /* 2570321529Smav * Always add the free space from this sync pass to the space 2571321529Smav * map histogram. We want to make sure that the on-disk histogram 2572321529Smav * accounts for all free space. If the space map is not loaded, 2573321529Smav * then we will lose some accuracy but will correct it the next 2574321529Smav * time we load the space map. 2575321529Smav */ 2576332547Smav space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 2577321529Smav 2578269118Sdelphij metaslab_group_histogram_add(mg, msp); 2579269118Sdelphij metaslab_group_histogram_verify(mg); 2580269118Sdelphij metaslab_class_histogram_verify(mg->mg_class); 2581258717Savg 2582247398Smm /* 2583258717Savg * For sync pass 1, we avoid traversing this txg's free range tree 2584332547Smav * and instead will just swap the pointers for freeing and 2585332547Smav * freed. We can safely do this since the freed_tree is 2586247398Smm * guaranteed to be empty on the initial pass. 
2587247398Smm */ 2588247398Smm if (spa_sync_pass(spa) == 1) { 2589332547Smav range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 2590247398Smm } else { 2591332547Smav range_tree_vacate(msp->ms_freeing, 2592332547Smav range_tree_add, msp->ms_freed); 2593168404Spjd } 2594269118Sdelphij range_tree_vacate(alloctree, NULL, NULL); 2595168404Spjd 2596332547Smav ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2597332547Smav ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 2598332547Smav & TXG_MASK])); 2599332547Smav ASSERT0(range_tree_space(msp->ms_freeing)); 2600332547Smav ASSERT0(range_tree_space(msp->ms_checkpointing)); 2601168404Spjd 2602168404Spjd mutex_exit(&msp->ms_lock); 2603168404Spjd 2604258717Savg if (object != space_map_object(msp->ms_sm)) { 2605258717Savg object = space_map_object(msp->ms_sm); 2606258717Savg dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2607258717Savg msp->ms_id, sizeof (uint64_t), &object, tx); 2608258717Savg } 2609332525Smav mutex_exit(&msp->ms_sync_lock); 2610168404Spjd dmu_tx_commit(tx); 2611168404Spjd} 2612168404Spjd 2613168404Spjd/* 2614168404Spjd * Called after a transaction group has completely synced to mark 2615168404Spjd * all of the metaslab's free space as usable. 2616168404Spjd */ 2617168404Spjdvoid 2618168404Spjdmetaslab_sync_done(metaslab_t *msp, uint64_t txg) 2619168404Spjd{ 2620168404Spjd metaslab_group_t *mg = msp->ms_group; 2621168404Spjd vdev_t *vd = mg->mg_vd; 2622321529Smav spa_t *spa = vd->vdev_spa; 2623258717Savg range_tree_t **defer_tree; 2624219089Spjd int64_t alloc_delta, defer_delta; 2625321529Smav boolean_t defer_allowed = B_TRUE; 2626168404Spjd 2627219089Spjd ASSERT(!vd->vdev_ishole); 2628219089Spjd 2629168404Spjd mutex_enter(&msp->ms_lock); 2630168404Spjd 2631168404Spjd /* 2632168404Spjd * If this metaslab is just becoming available, initialize its 2633321539Smav * range trees and add its capacity to the vdev. 
2634168404Spjd */ 2635332547Smav if (msp->ms_freed == NULL) { 2636219089Spjd for (int t = 0; t < TXG_SIZE; t++) { 2637332547Smav ASSERT(msp->ms_allocating[t] == NULL); 2638258717Savg 2639332547Smav msp->ms_allocating[t] = range_tree_create(NULL, NULL); 2640168404Spjd } 2641219089Spjd 2642332547Smav ASSERT3P(msp->ms_freeing, ==, NULL); 2643332547Smav msp->ms_freeing = range_tree_create(NULL, NULL); 2644321539Smav 2645332547Smav ASSERT3P(msp->ms_freed, ==, NULL); 2646332547Smav msp->ms_freed = range_tree_create(NULL, NULL); 2647321539Smav 2648247398Smm for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2649332547Smav ASSERT(msp->ms_defer[t] == NULL); 2650258717Savg 2651332547Smav msp->ms_defer[t] = range_tree_create(NULL, NULL); 2652247398Smm } 2653219089Spjd 2654332547Smav ASSERT3P(msp->ms_checkpointing, ==, NULL); 2655332547Smav msp->ms_checkpointing = range_tree_create(NULL, NULL); 2656332547Smav 2657258717Savg vdev_space_update(vd, 0, 0, msp->ms_size); 2658168404Spjd } 2659332547Smav ASSERT0(range_tree_space(msp->ms_freeing)); 2660332547Smav ASSERT0(range_tree_space(msp->ms_checkpointing)); 2661168404Spjd 2662332547Smav defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 2663168404Spjd 2664321529Smav uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 2665321529Smav metaslab_class_get_alloc(spa_normal_class(spa)); 2666332525Smav if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 2667321529Smav defer_allowed = B_FALSE; 2668321529Smav } 2669321529Smav 2670321529Smav defer_delta = 0; 2671258717Savg alloc_delta = space_map_alloc_delta(msp->ms_sm); 2672321529Smav if (defer_allowed) { 2673332547Smav defer_delta = range_tree_space(msp->ms_freed) - 2674321529Smav range_tree_space(*defer_tree); 2675321529Smav } else { 2676321529Smav defer_delta -= range_tree_space(*defer_tree); 2677321529Smav } 2678258717Savg 2679219089Spjd vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2680219089Spjd 2681168404Spjd /* 2682258717Savg * If there's a metaslab_load() in progress, wait for it to complete 2683168404Spjd * so that we have a consistent view of the in-core space map. 2684168404Spjd */ 2685258717Savg metaslab_load_wait(msp); 2686168404Spjd 2687247398Smm /* 2688258717Savg * Move the frees from the defer_tree back to the free 2689332547Smav * range tree (if it's loaded). Swap the freed_tree and 2690332547Smav * the defer_tree -- this is safe to do because we've 2691332547Smav * just emptied out the defer_tree. 2692247398Smm */ 2693258717Savg range_tree_vacate(*defer_tree, 2694332547Smav msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 2695321529Smav if (defer_allowed) { 2696332547Smav range_tree_swap(&msp->ms_freed, defer_tree); 2697321529Smav } else { 2698332547Smav range_tree_vacate(msp->ms_freed, 2699332547Smav msp->ms_loaded ? range_tree_add : NULL, 2700332547Smav msp->ms_allocatable); 2701321529Smav } 2702258717Savg space_map_update(msp->ms_sm); 2703168404Spjd 2704219089Spjd msp->ms_deferspace += defer_delta; 2705219089Spjd ASSERT3S(msp->ms_deferspace, >=, 0); 2706258717Savg ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2707219089Spjd if (msp->ms_deferspace != 0) { 2708219089Spjd /* 2709219089Spjd * Keep syncing this metaslab until all deferred frees 2710219089Spjd * are back in circulation. 
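 * A free that syncs in txg N sits in one of the ms_defer slots and
 * only returns to ms_allocatable about TXG_DEFER_SIZE txgs later,
 * so the metaslab has to stay on the per-txg dirty list until that
 * happens.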
2711219089Spjd */
2712219089Spjd vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2713219089Spjd }
2714219089Spjd
2715339105Smav if (msp->ms_new) {
2716339105Smav msp->ms_new = B_FALSE;
2717339105Smav mutex_enter(&mg->mg_lock);
2718339105Smav mg->mg_ms_ready++;
2719339105Smav mutex_exit(&mg->mg_lock);
2720339105Smav }
2721321529Smav /*
2722321529Smav * Calculate the new weights before unloading any metaslabs.
2723321529Smav * This will give us the most accurate weighting.
2724321529Smav */
2725339105Smav metaslab_group_sort(mg, msp, metaslab_weight(msp) |
2726339105Smav (msp->ms_weight & METASLAB_ACTIVE_MASK));
2727321529Smav
2728321529Smav /*
2729321529Smav * If the metaslab is loaded and we've not tried to load or allocate
2730321529Smav * from it in 'metaslab_unload_delay' txgs, then unload it.
2731321529Smav */
2732321529Smav if (msp->ms_loaded &&
2733339111Smav msp->ms_initializing == 0 &&
2734321529Smav msp->ms_selected_txg + metaslab_unload_delay < txg) {
2735258717Savg for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2736258717Savg VERIFY0(range_tree_space(
2737332547Smav msp->ms_allocating[(txg + t) & TXG_MASK]));
2738258717Savg }
2739339105Smav if (msp->ms_allocator != -1) {
2740339105Smav metaslab_passivate(msp, msp->ms_weight &
2741339105Smav ~METASLAB_ACTIVE_MASK);
2742339105Smav }
2743168404Spjd
2744258717Savg if (!metaslab_debug_unload)
2745258717Savg metaslab_unload(msp);
2746168404Spjd }
2747168404Spjd
2748332547Smav ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2749332547Smav ASSERT0(range_tree_space(msp->ms_freeing));
2750332547Smav ASSERT0(range_tree_space(msp->ms_freed));
2751332547Smav ASSERT0(range_tree_space(msp->ms_checkpointing));
2752332525Smav
2753258717Savg mutex_exit(&msp->ms_lock);
2754168404Spjd}
2755168404Spjd
2756211931Smmvoid
2757211931Smmmetaslab_sync_reassess(metaslab_group_t *mg)
2758211931Smm{
2759332525Smav spa_t *spa = mg->mg_class->mc_spa;
2760332525Smav
2761332525Smav spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2762258633Savg metaslab_group_alloc_update(mg);
2763269118Sdelphij mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2764224177Smm
2765211931Smm /*
2766332525Smav * Preload the next potential metaslabs but only on active
2767332525Smav * metaslab groups. We can get into a state where the metaslab
2768332525Smav * is no longer active since we dirty metaslabs as we remove a
2769332525Smav * device, thus potentially making the metaslab group eligible
2770332525Smav * for preloading.
2771211931Smm */
2772332525Smav if (mg->mg_activation_count > 0) {
2773332525Smav metaslab_group_preload(mg);
2774332525Smav }
2775332525Smav spa_config_exit(spa, SCL_ALLOC, FTAG);
2776211931Smm}
2777211931Smm
2778168404Spjdstatic uint64_t
2779168404Spjdmetaslab_distance(metaslab_t *msp, dva_t *dva)
2780168404Spjd{
2781168404Spjd uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2782168404Spjd uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2783258717Savg uint64_t start = msp->ms_id;
2784168404Spjd
2785168404Spjd if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2786168404Spjd return (1ULL << 63);
2787168404Spjd
2788168404Spjd if (offset < start)
2789168404Spjd return ((start - offset) << ms_shift);
2790168404Spjd if (offset > start)
2791168404Spjd return ((offset - start) << ms_shift);
2792168404Spjd return (0);
2793168404Spjd}
2794168404Spjd
2795307277Smav/*
2796307277Smav * ==========================================================================
2797321529Smav * Metaslab allocation tracing facility
2798321529Smav * ==========================================================================
2799321529Smav */
2800321529Smavkstat_t *metaslab_trace_ksp;
2801321529Smavkstat_named_t metaslab_trace_over_limit;
2802321529Smav
2803321529Smavvoid
2804321529Smavmetaslab_alloc_trace_init(void)
2805321529Smav{
2806321529Smav ASSERT(metaslab_alloc_trace_cache == NULL);
2807321529Smav metaslab_alloc_trace_cache = kmem_cache_create(
2808321529Smav "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2809321529Smav 0, NULL, NULL, NULL, NULL, NULL, 0);
2810321529Smav metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2811321529Smav "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2812321529Smav if (metaslab_trace_ksp != NULL) {
2813321529Smav metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2814321529Smav kstat_named_init(&metaslab_trace_over_limit,
2815321529Smav "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2816321529Smav kstat_install(metaslab_trace_ksp);
2817321529Smav }
2818321529Smav}
2819321529Smav
2820321529Smavvoid
2821321529Smavmetaslab_alloc_trace_fini(void)
2822321529Smav{
2823321529Smav if (metaslab_trace_ksp != NULL) {
2824321529Smav kstat_delete(metaslab_trace_ksp);
2825321529Smav metaslab_trace_ksp = NULL;
2826321529Smav }
2827321529Smav kmem_cache_destroy(metaslab_alloc_trace_cache);
2828321529Smav metaslab_alloc_trace_cache = NULL;
2829321529Smav}
2830321529Smav
2831321529Smav/*
2832321529Smav * Add an allocation trace element to the allocation tracing list.
2833321529Smav */
2834321529Smavstatic void
2835321529Smavmetaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2836339105Smav metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
2837339105Smav int allocator)
2838321529Smav{
2839321529Smav if (!metaslab_trace_enabled)
2840321529Smav return;
2841321529Smav
2842321529Smav /*
2843321529Smav * When the tracing list reaches its maximum we remove
2844321529Smav * the second element in the list before adding a new one.
2845321529Smav * By removing the second element we preserve the original
2846321529Smav * entry as a clue to what allocation steps have already been
2847321529Smav * performed.
2848321529Smav */ 2849321529Smav if (zal->zal_size == metaslab_trace_max_entries) { 2850321529Smav metaslab_alloc_trace_t *mat_next; 2851321529Smav#ifdef DEBUG 2852321529Smav panic("too many entries in allocation list"); 2853321529Smav#endif 2854321529Smav atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 2855321529Smav zal->zal_size--; 2856321529Smav mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 2857321529Smav list_remove(&zal->zal_list, mat_next); 2858321529Smav kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 2859321529Smav } 2860321529Smav 2861321529Smav metaslab_alloc_trace_t *mat = 2862321529Smav kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 2863321529Smav list_link_init(&mat->mat_list_node); 2864321529Smav mat->mat_mg = mg; 2865321529Smav mat->mat_msp = msp; 2866321529Smav mat->mat_size = psize; 2867321529Smav mat->mat_dva_id = dva_id; 2868321529Smav mat->mat_offset = offset; 2869321529Smav mat->mat_weight = 0; 2870339105Smav mat->mat_allocator = allocator; 2871321529Smav 2872321529Smav if (msp != NULL) 2873321529Smav mat->mat_weight = msp->ms_weight; 2874321529Smav 2875321529Smav /* 2876321529Smav * The list is part of the zio so locking is not required. Only 2877321529Smav * a single thread will perform allocations for a given zio. 2878321529Smav */ 2879321529Smav list_insert_tail(&zal->zal_list, mat); 2880321529Smav zal->zal_size++; 2881321529Smav 2882321529Smav ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 2883321529Smav} 2884321529Smav 2885321529Smavvoid 2886321529Smavmetaslab_trace_init(zio_alloc_list_t *zal) 2887321529Smav{ 2888321529Smav list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 2889321529Smav offsetof(metaslab_alloc_trace_t, mat_list_node)); 2890321529Smav zal->zal_size = 0; 2891321529Smav} 2892321529Smav 2893321529Smavvoid 2894321529Smavmetaslab_trace_fini(zio_alloc_list_t *zal) 2895321529Smav{ 2896321529Smav metaslab_alloc_trace_t *mat; 2897321529Smav 2898321529Smav while ((mat = list_remove_head(&zal->zal_list)) != NULL) 2899321529Smav kmem_cache_free(metaslab_alloc_trace_cache, mat); 2900321529Smav list_destroy(&zal->zal_list); 2901321529Smav zal->zal_size = 0; 2902321529Smav} 2903321529Smav 2904321529Smav/* 2905321529Smav * ========================================================================== 2906307277Smav * Metaslab block operations 2907307277Smav * ========================================================================== 2908307277Smav */ 2909307277Smav 2910307277Smavstatic void 2911339105Smavmetaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, 2912339105Smav int allocator) 2913307277Smav{ 2914307277Smav if (!(flags & METASLAB_ASYNC_ALLOC) || 2915339105Smav (flags & METASLAB_DONT_THROTTLE)) 2916307277Smav return; 2917307277Smav 2918307277Smav metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2919307277Smav if (!mg->mg_class->mc_alloc_throttle_enabled) 2920307277Smav return; 2921307277Smav 2922339105Smav (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); 2923307277Smav} 2924307277Smav 2925339105Smavstatic void 2926339105Smavmetaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) 2927339105Smav{ 2928339105Smav uint64_t max = mg->mg_max_alloc_queue_depth; 2929339105Smav uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 2930339105Smav while (cur < max) { 2931339105Smav if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], 2932339105Smav cur, cur + 1) == cur) { 2933339105Smav atomic_inc_64( 2934339105Smav 
&mg->mg_class->mc_alloc_max_slots[allocator]); 2935339105Smav return; 2936339105Smav } 2937339105Smav cur = mg->mg_cur_max_alloc_queue_depth[allocator]; 2938339105Smav } 2939339105Smav} 2940339105Smav 2941307277Smavvoid 2942339105Smavmetaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, 2943339105Smav int allocator, boolean_t io_complete) 2944307277Smav{ 2945307277Smav if (!(flags & METASLAB_ASYNC_ALLOC) || 2946339105Smav (flags & METASLAB_DONT_THROTTLE)) 2947307277Smav return; 2948307277Smav 2949307277Smav metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2950307277Smav if (!mg->mg_class->mc_alloc_throttle_enabled) 2951307277Smav return; 2952307277Smav 2953339105Smav (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); 2954339105Smav if (io_complete) 2955339105Smav metaslab_group_increment_qdepth(mg, allocator); 2956307277Smav} 2957307277Smav 2958307277Smavvoid 2959339105Smavmetaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, 2960339105Smav int allocator) 2961307277Smav{ 2962307277Smav#ifdef ZFS_DEBUG 2963307277Smav const dva_t *dva = bp->blk_dva; 2964307277Smav int ndvas = BP_GET_NDVAS(bp); 2965307277Smav 2966307277Smav for (int d = 0; d < ndvas; d++) { 2967307277Smav uint64_t vdev = DVA_GET_VDEV(&dva[d]); 2968307277Smav metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2969339105Smav VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator], 2970339105Smav tag)); 2971307277Smav } 2972307277Smav#endif 2973307277Smav} 2974307277Smav 2975168404Spjdstatic uint64_t 2976321529Smavmetaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 2977168404Spjd{ 2978321529Smav uint64_t start; 2979332547Smav range_tree_t *rt = msp->ms_allocatable; 2980321529Smav metaslab_class_t *mc = msp->ms_group->mg_class; 2981321529Smav 2982321529Smav VERIFY(!msp->ms_condensing); 2983339111Smav VERIFY0(msp->ms_initializing); 2984321529Smav 2985321529Smav start = mc->mc_ops->msop_alloc(msp, size); 2986321529Smav if (start != -1ULL) { 2987321529Smav metaslab_group_t *mg = msp->ms_group; 2988321529Smav vdev_t *vd = mg->mg_vd; 2989321529Smav 2990321529Smav VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 2991321529Smav VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2992321529Smav VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 2993321529Smav range_tree_remove(rt, start, size); 2994321529Smav 2995332547Smav if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 2996321529Smav vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2997321529Smav 2998332547Smav range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 2999321529Smav 3000321529Smav /* Track the last successful allocation */ 3001321529Smav msp->ms_alloc_txg = txg; 3002321529Smav metaslab_verify_space(msp, txg); 3003321529Smav } 3004321529Smav 3005321529Smav /* 3006321529Smav * Now that we've attempted the allocation we need to update the 3007321529Smav * metaslab's maximum block size since it may have changed. 3008321529Smav */ 3009321529Smav msp->ms_max_size = metaslab_block_maxsize(msp); 3010321529Smav return (start); 3011321529Smav} 3012321529Smav 3013339105Smav/* 3014339105Smav * Find the metaslab with the highest weight that is less than what we've 3015339105Smav * already tried. In the common case, this means that we will examine each 3016339105Smav * metaslab at most once. Note that concurrent callers could reorder metaslabs 3017339105Smav * by activation/passivation once we have dropped the mg_lock. 
If a metaslab is
3018339105Smav * activated by another thread, and we fail to allocate from the metaslab we
3019339105Smav * have selected, we may not try the newly-activated metaslab, and instead
3020339105Smav * activate another metaslab. This is not optimal, but generally does not cause
3021339105Smav * any problems (a possible exception being if every metaslab is completely full
3022339105Smav * except for the newly-activated metaslab which we fail to examine).
3023339105Smav */
3024339105Smavstatic metaslab_t *
3025339105Smavfind_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
3026339105Smav dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
3027339105Smav zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
3028339105Smav{
3029339105Smav avl_index_t idx;
3030339105Smav avl_tree_t *t = &mg->mg_metaslab_tree;
3031339105Smav metaslab_t *msp = avl_find(t, search, &idx);
3032339105Smav if (msp == NULL)
3033339105Smav msp = avl_nearest(t, idx, AVL_AFTER);
3034339105Smav
3035339105Smav for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
3036339105Smav int i;
3037339105Smav if (!metaslab_should_allocate(msp, asize)) {
3038339105Smav metaslab_trace_add(zal, mg, msp, asize, d,
3039339105Smav TRACE_TOO_SMALL, allocator);
3040339105Smav continue;
3041339105Smav }
3042339105Smav
3043339105Smav /*
3044339111Smav * If the selected metaslab is condensing or being
3045339111Smav * initialized, skip it.
3046339105Smav */
3047339111Smav if (msp->ms_condensing || msp->ms_initializing > 0)
3048339105Smav continue;
3049339105Smav
3050339105Smav *was_active = msp->ms_allocator != -1;
3051339105Smav /*
3052339105Smav * If we're activating as primary, this is our first allocation
3053339105Smav * from this disk, so we don't need to check how close we are.
3054339105Smav * If the metaslab under consideration was already active,
3055339105Smav * we're getting desperate enough to steal another allocator's
3056339105Smav * metaslab, so we still don't care about distances.
3057339105Smav */
3058339105Smav if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
3059339105Smav break;
3060339105Smav
3061339105Smav uint64_t target_distance = min_distance
3062339105Smav + (space_map_allocated(msp->ms_sm) != 0 ?
0 : 3063339105Smav min_distance >> 1); 3064339105Smav 3065339105Smav for (i = 0; i < d; i++) { 3066339105Smav if (metaslab_distance(msp, &dva[i]) < target_distance) 3067339105Smav break; 3068339105Smav } 3069339105Smav if (i == d) 3070339105Smav break; 3071339105Smav } 3072339105Smav 3073339105Smav if (msp != NULL) { 3074339105Smav search->ms_weight = msp->ms_weight; 3075339105Smav search->ms_start = msp->ms_start + 1; 3076339105Smav search->ms_allocator = msp->ms_allocator; 3077339105Smav search->ms_primary = msp->ms_primary; 3078339105Smav } 3079339105Smav return (msp); 3080339105Smav} 3081339105Smav 3082339105Smav/* ARGSUSED */ 3083321529Smavstatic uint64_t 3084321529Smavmetaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 3085339105Smav uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, 3086339105Smav int allocator) 3087321529Smav{ 3088168404Spjd metaslab_t *msp = NULL; 3089168404Spjd uint64_t offset = -1ULL; 3090168404Spjd uint64_t activation_weight; 3091168404Spjd 3092168404Spjd activation_weight = METASLAB_WEIGHT_PRIMARY; 3093339105Smav for (int i = 0; i < d; i++) { 3094339105Smav if (activation_weight == METASLAB_WEIGHT_PRIMARY && 3095339105Smav DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3096168404Spjd activation_weight = METASLAB_WEIGHT_SECONDARY; 3097339105Smav } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && 3098339105Smav DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 3099339151Smav activation_weight = METASLAB_WEIGHT_CLAIM; 3100209962Smm break; 3101209962Smm } 3102209962Smm } 3103168404Spjd 3104339105Smav /* 3105339105Smav * If we don't have enough metaslabs active to fill the entire array, we 3106339105Smav * just use the 0th slot. 3107339105Smav */ 3108339151Smav if (mg->mg_ms_ready < mg->mg_allocators * 3) 3109339105Smav allocator = 0; 3110339105Smav 3111339105Smav ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); 3112339105Smav 3113321529Smav metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 3114321529Smav search->ms_weight = UINT64_MAX; 3115321529Smav search->ms_start = 0; 3116339105Smav /* 3117339105Smav * At the end of the metaslab tree are the already-active metaslabs, 3118339105Smav * first the primaries, then the secondaries. When we resume searching 3119339105Smav * through the tree, we need to consider ms_allocator and ms_primary so 3120339105Smav * we start in the location right after where we left off, and don't 3121339105Smav * accidentally loop forever considering the same metaslabs. 
3122339105Smav */
3123339105Smav search->ms_allocator = -1;
3124339105Smav search->ms_primary = B_TRUE;
3125168404Spjd for (;;) {
3126339105Smav boolean_t was_active = B_FALSE;
3127209962Smm
3128168404Spjd mutex_enter(&mg->mg_lock);
3129321529Smav
3130339105Smav if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
3131339105Smav mg->mg_primaries[allocator] != NULL) {
3132339105Smav msp = mg->mg_primaries[allocator];
3133339105Smav was_active = B_TRUE;
3134339105Smav } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
3135339151Smav mg->mg_secondaries[allocator] != NULL) {
3136339105Smav msp = mg->mg_secondaries[allocator];
3137339105Smav was_active = B_TRUE;
3138339105Smav } else {
3139339105Smav msp = find_valid_metaslab(mg, activation_weight, dva, d,
3140339105Smav min_distance, asize, allocator, zal, search,
3141339105Smav &was_active);
3142339105Smav }
3143321529Smav
3144168404Spjd mutex_exit(&mg->mg_lock);
3145321529Smav if (msp == NULL) {
3146321529Smav kmem_free(search, sizeof (*search));
3147168404Spjd return (-1ULL);
3148321529Smav }
3149168404Spjd
3150258633Savg mutex_enter(&msp->ms_lock);
3151224177Smm /*
3152168404Spjd * Ensure that the metaslab we have selected is still
3153168404Spjd * capable of handling our request. It's possible that
3154168404Spjd * another thread may have changed the weight while we
3155321529Smav * were blocked on the metaslab lock. We check the
3156321529Smav * active status first to see if we need to reselect
3157321529Smav * a new metaslab.
3158168404Spjd */
3159321529Smav if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
3160168404Spjd mutex_exit(&msp->ms_lock);
3161168404Spjd continue;
3162168404Spjd }
3163168404Spjd
3164339105Smav /*
3165339105Smav * If the metaslab is freshly activated for an allocator that
3166339105Smav * isn't the one we're allocating from, or if it's a primary and
3167339105Smav * we're seeking a secondary (or vice versa), we go back and
3168339105Smav * select a new metaslab.
3169339105Smav */
3170339105Smav if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
3171339105Smav (msp->ms_allocator != -1) &&
3172339105Smav (msp->ms_allocator != allocator || ((activation_weight ==
3173339105Smav METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
3174168404Spjd mutex_exit(&msp->ms_lock);
3175168404Spjd continue;
3176168404Spjd }
3177168404Spjd
3178339151Smav if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
3179339151Smav activation_weight != METASLAB_WEIGHT_CLAIM) {
3180339105Smav metaslab_passivate(msp, msp->ms_weight &
3181339105Smav ~METASLAB_WEIGHT_CLAIM);
3182168404Spjd mutex_exit(&msp->ms_lock);
3183168404Spjd continue;
3184168404Spjd }
3185339105Smav
3186339105Smav if (metaslab_activate(msp, allocator, activation_weight) != 0) {
3187339105Smav mutex_exit(&msp->ms_lock);
3188339105Smav continue;
3189339105Smav }
3190339105Smav
3191321529Smav msp->ms_selected_txg = txg;
3192168404Spjd
3193247398Smm /*
3194321529Smav * Now that we have the lock, recheck to see if we should
3195321529Smav * continue to use this metaslab for this allocation. The
3196321529Smav * metaslab is now loaded so metaslab_should_allocate() can
3197321529Smav * accurately determine if the allocation attempt should
3198321529Smav * proceed.
3199321529Smav */
3200321529Smav if (!metaslab_should_allocate(msp, asize)) {
3201321529Smav /* Passivate this metaslab and select a new one.
*/ 3202321529Smav metaslab_trace_add(zal, mg, msp, asize, d, 3203339105Smav TRACE_TOO_SMALL, allocator); 3204321529Smav goto next; 3205321529Smav } 3206321529Smav 3207321529Smav /* 3208247398Smm * If this metaslab is currently condensing then pick again as 3209247398Smm * we can't manipulate this metaslab until it's committed 3210339111Smav * to disk. If this metaslab is being initialized, we shouldn't 3211339111Smav * allocate from it since the allocated region might be 3212339111Smav * overwritten after allocation. 3213247398Smm */ 3214258717Savg if (msp->ms_condensing) { 3215321529Smav metaslab_trace_add(zal, mg, msp, asize, d, 3216339105Smav TRACE_CONDENSING, allocator); 3217339105Smav metaslab_passivate(msp, msp->ms_weight & 3218339105Smav ~METASLAB_ACTIVE_MASK); 3219247398Smm mutex_exit(&msp->ms_lock); 3220247398Smm continue; 3221339111Smav } else if (msp->ms_initializing > 0) { 3222339111Smav metaslab_trace_add(zal, mg, msp, asize, d, 3223339111Smav TRACE_INITIALIZING, allocator); 3224339111Smav metaslab_passivate(msp, msp->ms_weight & 3225339111Smav ~METASLAB_ACTIVE_MASK); 3226339111Smav mutex_exit(&msp->ms_lock); 3227339111Smav continue; 3228247398Smm } 3229247398Smm 3230321529Smav offset = metaslab_block_alloc(msp, asize, txg); 3231339105Smav metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); 3232321529Smav 3233321529Smav if (offset != -1ULL) { 3234321529Smav /* Proactively passivate the metaslab, if needed */ 3235321529Smav metaslab_segment_may_passivate(msp); 3236168404Spjd break; 3237321529Smav } 3238321529Smavnext: 3239321529Smav ASSERT(msp->ms_loaded); 3240168404Spjd 3241321529Smav /* 3242321529Smav * We were unable to allocate from this metaslab so determine 3243321529Smav * a new weight for this metaslab. Now that we have loaded 3244321529Smav * the metaslab we can provide a better hint to the metaslab 3245321529Smav * selector. 3246321529Smav * 3247321529Smav * For space-based metaslabs, we use the maximum block size. 3248321529Smav * This information is only available when the metaslab 3249321529Smav * is loaded and is more accurate than the generic free 3250321529Smav * space weight that was calculated by metaslab_weight(). 3251321529Smav * This information allows us to quickly compare the maximum 3252321529Smav * available allocation in the metaslab to the allocation 3253321529Smav * size being requested. 3254321529Smav * 3255321529Smav * For segment-based metaslabs, determine the new weight 3256321529Smav * based on the highest bucket in the range tree. We 3257321529Smav * explicitly use the loaded segment weight (i.e. the range 3258321529Smav * tree histogram) since it contains the space that is 3259321529Smav * currently available for allocation and is accurate 3260321529Smav * even within a sync pass. 3261321529Smav */ 3262321529Smav if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3263321529Smav uint64_t weight = metaslab_block_maxsize(msp); 3264321529Smav WEIGHT_SET_SPACEBASED(weight); 3265321529Smav metaslab_passivate(msp, weight); 3266321529Smav } else { 3267321529Smav metaslab_passivate(msp, 3268321529Smav metaslab_weight_from_range_tree(msp)); 3269321529Smav } 3270321529Smav 3271321529Smav /* 3272321529Smav * We have just failed an allocation attempt, check 3273321529Smav * that metaslab_should_allocate() agrees. Otherwise, 3274321529Smav * we may end up in an infinite loop retrying the same 3275321529Smav * metaslab. 
3276321529Smav */ 3277321529Smav ASSERT(!metaslab_should_allocate(msp, asize)); 3278168404Spjd mutex_exit(&msp->ms_lock); 3279168404Spjd } 3280321529Smav mutex_exit(&msp->ms_lock); 3281321529Smav kmem_free(search, sizeof (*search)); 3282321529Smav return (offset); 3283321529Smav} 3284168404Spjd 3285321529Smavstatic uint64_t 3286321529Smavmetaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 3287339105Smav uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, 3288339105Smav int allocator) 3289321529Smav{ 3290321529Smav uint64_t offset; 3291321529Smav ASSERT(mg->mg_initialized); 3292168404Spjd 3293321529Smav offset = metaslab_group_alloc_normal(mg, zal, asize, txg, 3294339105Smav min_distance, dva, d, allocator); 3295168404Spjd 3296321529Smav mutex_enter(&mg->mg_lock); 3297321529Smav if (offset == -1ULL) { 3298321529Smav mg->mg_failed_allocations++; 3299321529Smav metaslab_trace_add(zal, mg, NULL, asize, d, 3300339105Smav TRACE_GROUP_FAILURE, allocator); 3301321529Smav if (asize == SPA_GANGBLOCKSIZE) { 3302321529Smav /* 3303321529Smav * This metaslab group was unable to allocate 3304321529Smav * the minimum gang block size so it must be out of 3305321529Smav * space. We must notify the allocation throttle 3306321529Smav * to start skipping allocation attempts to this 3307321529Smav * metaslab group until more space becomes available. 3308321529Smav * Note: this failure cannot be caused by the 3309321529Smav * allocation throttle since the allocation throttle 3310321529Smav * is only responsible for skipping devices and 3311321529Smav * not failing block allocations. 3312321529Smav */ 3313321529Smav mg->mg_no_free_space = B_TRUE; 3314321529Smav } 3315321529Smav } 3316321529Smav mg->mg_allocations++; 3317321529Smav mutex_exit(&mg->mg_lock); 3318168404Spjd return (offset); 3319168404Spjd} 3320168404Spjd 3321168404Spjd/* 3322321529Smav * If we have to write a ditto block (i.e. more than one DVA for a given BP) 3323321529Smav * on the same vdev as an existing DVA of this BP, then try to allocate it 3324321529Smav * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the 3325321529Smav * existing DVAs. 3326321529Smav */ 3327321529Smavint ditto_same_vdev_distance_shift = 3; 3328321529Smav 3329321529Smav/* 3330168404Spjd * Allocate a block for the specified i/o. 3331168404Spjd */ 3332332525Smavint 3333185029Spjdmetaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 3334321529Smav dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 3335339105Smav zio_alloc_list_t *zal, int allocator) 3336168404Spjd{ 3337168404Spjd metaslab_group_t *mg, *rotor; 3338168404Spjd vdev_t *vd; 3339321529Smav boolean_t try_hard = B_FALSE; 3340168404Spjd 3341168404Spjd ASSERT(!DVA_IS_VALID(&dva[d])); 3342168404Spjd 3343185029Spjd /* 3344185029Spjd * For testing, make some blocks above a certain size be gang blocks. 3345185029Spjd */ 3346332553Smav if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 3347339105Smav metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, 3348339105Smav allocator); 3349249195Smm return (SET_ERROR(ENOSPC)); 3350321529Smav } 3351168404Spjd 3352168404Spjd /* 3353168404Spjd * Start at the rotor and loop through all mgs until we find something. 3354219089Spjd * Note that there's no locking on mc_rotor or mc_aliquot because 3355168404Spjd * nothing actually breaks if we miss a few updates -- we just won't 3356168404Spjd * allocate quite as evenly. It all balances out over time. 
3357168404Spjd * 3358168404Spjd * If we are doing ditto or log blocks, try to spread them across 3359168404Spjd * consecutive vdevs. If we're forced to reuse a vdev before we've 3360168404Spjd * allocated all of our ditto blocks, then try and spread them out on 3361168404Spjd * that vdev as much as possible. If it turns out to not be possible, 3362168404Spjd * gradually lower our standards until anything becomes acceptable. 3363168404Spjd * Also, allocating on consecutive vdevs (as opposed to random vdevs) 3364168404Spjd * gives us hope of containing our fault domains to something we're 3365168404Spjd * able to reason about. Otherwise, any two top-level vdev failures 3366168404Spjd * will guarantee the loss of data. With consecutive allocation, 3367168404Spjd * only two adjacent top-level vdev failures will result in data loss. 3368168404Spjd * 3369168404Spjd * If we are doing gang blocks (hintdva is non-NULL), try to keep 3370168404Spjd * ourselves on the same vdev as our gang block header. That 3371168404Spjd * way, we can hope for locality in vdev_cache, plus it makes our 3372168404Spjd * fault domains something tractable. 3373168404Spjd */ 3374168404Spjd if (hintdva) { 3375168404Spjd vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 3376219089Spjd 3377219089Spjd /* 3378219089Spjd * It's possible the vdev we're using as the hint no 3379332525Smav * longer exists or its mg has been closed (e.g. by 3380332525Smav * device removal). Consult the rotor when 3381219089Spjd * all else fails. 3382219089Spjd */ 3383332525Smav if (vd != NULL && vd->vdev_mg != NULL) { 3384168404Spjd mg = vd->vdev_mg; 3385219089Spjd 3386219089Spjd if (flags & METASLAB_HINTBP_AVOID && 3387219089Spjd mg->mg_next != NULL) 3388219089Spjd mg = mg->mg_next; 3389219089Spjd } else { 3390219089Spjd mg = mc->mc_rotor; 3391219089Spjd } 3392168404Spjd } else if (d != 0) { 3393168404Spjd vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 3394168404Spjd mg = vd->vdev_mg->mg_next; 3395168404Spjd } else { 3396168404Spjd mg = mc->mc_rotor; 3397168404Spjd } 3398185029Spjd 3399185029Spjd /* 3400219089Spjd * If the hint put us into the wrong metaslab class, or into a 3401219089Spjd * metaslab group that has been passivated, just follow the rotor. 3402185029Spjd */ 3403219089Spjd if (mg->mg_class != mc || mg->mg_activation_count <= 0) 3404185029Spjd mg = mc->mc_rotor; 3405185029Spjd 3406168404Spjd rotor = mg; 3407168404Spjdtop: 3408168404Spjd do { 3409321529Smav boolean_t allocatable; 3410321529Smav 3411219089Spjd ASSERT(mg->mg_activation_count == 1); 3412168404Spjd vd = mg->mg_vd; 3413209962Smm 3414185029Spjd /* 3415185029Spjd * Don't allocate from faulted devices. 3416185029Spjd */ 3417321529Smav if (try_hard) { 3418209962Smm spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 3419209962Smm allocatable = vdev_allocatable(vd); 3420209962Smm spa_config_exit(spa, SCL_ZIO, FTAG); 3421209962Smm } else { 3422209962Smm allocatable = vdev_allocatable(vd); 3423209962Smm } 3424258633Savg 3425258633Savg /* 3426258633Savg * Determine if the selected metaslab group is eligible 3427307277Smav * for allocations. If we're ganging then don't allow 3428307277Smav * this metaslab group to skip allocations since that would 3429307277Smav * inadvertently return ENOSPC and suspend the pool 3430258633Savg * even though space is still available. 
3431258633Savg */ 3432321529Smav if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 3433307277Smav allocatable = metaslab_group_allocatable(mg, rotor, 3434339152Smav psize, allocator, d); 3435307277Smav } 3436258633Savg 3437321529Smav if (!allocatable) { 3438321529Smav metaslab_trace_add(zal, mg, NULL, psize, d, 3439339105Smav TRACE_NOT_ALLOCATABLE, allocator); 3440185029Spjd goto next; 3441321529Smav } 3442209962Smm 3443307277Smav ASSERT(mg->mg_initialized); 3444307277Smav 3445185029Spjd /* 3446321529Smav * Avoid writing single-copy data to a failing, 3447321529Smav * non-redundant vdev, unless we've already tried all 3448321529Smav * other vdevs. 3449185029Spjd */ 3450185029Spjd if ((vd->vdev_stat.vs_write_errors > 0 || 3451185029Spjd vd->vdev_state < VDEV_STATE_HEALTHY) && 3452321529Smav d == 0 && !try_hard && vd->vdev_children == 0) { 3453321529Smav metaslab_trace_add(zal, mg, NULL, psize, d, 3454339105Smav TRACE_VDEV_ERROR, allocator); 3455185029Spjd goto next; 3456185029Spjd } 3457168404Spjd 3458185029Spjd ASSERT(mg->mg_class == mc); 3459185029Spjd 3460321529Smav /* 3461321529Smav * If we don't need to try hard, then require that the 3462321529Smav * block be 1/8th of the device away from any other DVAs 3463321529Smav * in this BP. If we are trying hard, allow any offset 3464321529Smav * to be used (distance=0). 3465321529Smav */ 3466321529Smav uint64_t distance = 0; 3467321529Smav if (!try_hard) { 3468321529Smav distance = vd->vdev_asize >> 3469321529Smav ditto_same_vdev_distance_shift; 3470321529Smav if (distance <= (1ULL << vd->vdev_ms_shift)) 3471321529Smav distance = 0; 3472321529Smav } 3473168404Spjd 3474321529Smav uint64_t asize = vdev_psize_to_asize(vd, psize); 3475168404Spjd ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 3476168404Spjd 3477321529Smav uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 3478339105Smav distance, dva, d, allocator); 3479307277Smav 3480168404Spjd if (offset != -1ULL) { 3481168404Spjd /* 3482168404Spjd * If we've just selected this metaslab group, 3483168404Spjd * figure out whether the corresponding vdev is 3484168404Spjd * over- or under-used relative to the pool, 3485168404Spjd * and set an allocation bias to even it out. 3486168404Spjd */ 3487269118Sdelphij if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 3488168404Spjd vdev_stat_t *vs = &vd->vdev_stat; 3489219089Spjd int64_t vu, cu; 3490168404Spjd 3491224177Smm vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 3492224177Smm cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 3493168404Spjd 3494168404Spjd /* 3495224177Smm * Calculate how much more or less we should 3496224177Smm * try to allocate from this device during 3497224177Smm * this iteration around the rotor. 3498224177Smm * For example, if a device is 80% full 3499224177Smm * and the pool is 20% full then we should 3500224177Smm * reduce allocations by 60% on this device. 3501224177Smm * 3502224177Smm * mg_bias = (20 - 80) * 512K / 100 = -307K 3503224177Smm * 3504224177Smm * This reduces allocations by 307K for this 3505224177Smm * iteration. 
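 * Conversely, a device that is only 10% full in a pool that is
 * 50% full gets, with the same 512K aliquot,
 *
 * mg_bias = (50 - 10) * 512K / 100 = +204K
 *
 * so it absorbs roughly 204K of additional allocations on this
 * iteration (illustrative numbers only).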
3506168404Spjd */ 3507219089Spjd mg->mg_bias = ((cu - vu) * 3508224177Smm (int64_t)mg->mg_aliquot) / 100; 3509269118Sdelphij } else if (!metaslab_bias_enabled) { 3510269118Sdelphij mg->mg_bias = 0; 3511168404Spjd } 3512168404Spjd 3513219089Spjd if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 3514168404Spjd mg->mg_aliquot + mg->mg_bias) { 3515168404Spjd mc->mc_rotor = mg->mg_next; 3516219089Spjd mc->mc_aliquot = 0; 3517168404Spjd } 3518168404Spjd 3519168404Spjd DVA_SET_VDEV(&dva[d], vd->vdev_id); 3520168404Spjd DVA_SET_OFFSET(&dva[d], offset); 3521185029Spjd DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 3522168404Spjd DVA_SET_ASIZE(&dva[d], asize); 3523168404Spjd 3524168404Spjd return (0); 3525168404Spjd } 3526185029Spjdnext: 3527168404Spjd mc->mc_rotor = mg->mg_next; 3528219089Spjd mc->mc_aliquot = 0; 3529168404Spjd } while ((mg = mg->mg_next) != rotor); 3530168404Spjd 3531321529Smav /* 3532321529Smav * If we haven't tried hard, do so now. 3533321529Smav */ 3534321529Smav if (!try_hard) { 3535321529Smav try_hard = B_TRUE; 3536168404Spjd goto top; 3537168404Spjd } 3538168404Spjd 3539168404Spjd bzero(&dva[d], sizeof (dva_t)); 3540168404Spjd 3541339105Smav metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); 3542249195Smm return (SET_ERROR(ENOSPC)); 3543168404Spjd} 3544168404Spjd 3545332525Smavvoid 3546332525Smavmetaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 3547332547Smav boolean_t checkpoint) 3548332525Smav{ 3549332525Smav metaslab_t *msp; 3550332525Smav spa_t *spa = vd->vdev_spa; 3551332525Smav 3552332525Smav ASSERT(vdev_is_concrete(vd)); 3553332525Smav ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3554332525Smav ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 3555332525Smav 3556332525Smav msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3557332525Smav 3558332525Smav VERIFY(!msp->ms_condensing); 3559332525Smav VERIFY3U(offset, >=, msp->ms_start); 3560332525Smav VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); 3561332525Smav VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3562332525Smav VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); 3563332525Smav 3564332525Smav metaslab_check_free_impl(vd, offset, asize); 3565332547Smav 3566332525Smav mutex_enter(&msp->ms_lock); 3567332547Smav if (range_tree_is_empty(msp->ms_freeing) && 3568332547Smav range_tree_is_empty(msp->ms_checkpointing)) { 3569332547Smav vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); 3570332525Smav } 3571332547Smav 3572332547Smav if (checkpoint) { 3573332547Smav ASSERT(spa_has_checkpoint(spa)); 3574332547Smav range_tree_add(msp->ms_checkpointing, offset, asize); 3575332547Smav } else { 3576332547Smav range_tree_add(msp->ms_freeing, offset, asize); 3577332547Smav } 3578332525Smav mutex_exit(&msp->ms_lock); 3579332525Smav} 3580332525Smav 3581332525Smav/* ARGSUSED */ 3582332525Smavvoid 3583332525Smavmetaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3584332525Smav uint64_t size, void *arg) 3585332525Smav{ 3586332547Smav boolean_t *checkpoint = arg; 3587332525Smav 3588332547Smav ASSERT3P(checkpoint, !=, NULL); 3589332547Smav 3590332525Smav if (vd->vdev_ops->vdev_op_remap != NULL) 3591332547Smav vdev_indirect_mark_obsolete(vd, offset, size); 3592332525Smav else 3593332547Smav metaslab_free_impl(vd, offset, size, *checkpoint); 3594332525Smav} 3595332525Smav 3596332525Smavstatic void 3597332525Smavmetaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, 3598332547Smav boolean_t checkpoint) 3599332525Smav{ 
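 /*
 * Dispatch the free to the appropriate handler: a top-level vdev
 * that is in the middle of being removed, an indirect vdev (where
 * the range is marked obsolete and the remap is followed), or an
 * ordinary concrete vdev whose metaslab takes the free directly.
 */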
3600332525Smav spa_t *spa = vd->vdev_spa; 3601332525Smav 3602332525Smav ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3603332525Smav 3604332547Smav if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 3605332525Smav return; 3606332525Smav 3607332525Smav if (spa->spa_vdev_removal != NULL && 3608339106Smav spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && 3609332525Smav vdev_is_concrete(vd)) { 3610332525Smav /* 3611332525Smav * Note: we check if the vdev is concrete because when 3612332525Smav * we complete the removal, we first change the vdev to be 3613332525Smav * an indirect vdev (in open context), and then (in syncing 3614332525Smav * context) clear spa_vdev_removal. 3615332525Smav */ 3616332547Smav free_from_removing_vdev(vd, offset, size); 3617332525Smav } else if (vd->vdev_ops->vdev_op_remap != NULL) { 3618332547Smav vdev_indirect_mark_obsolete(vd, offset, size); 3619332525Smav vd->vdev_ops->vdev_op_remap(vd, offset, size, 3620332547Smav metaslab_free_impl_cb, &checkpoint); 3621332525Smav } else { 3622332547Smav metaslab_free_concrete(vd, offset, size, checkpoint); 3623332525Smav } 3624332525Smav} 3625332525Smav 3626332525Smavtypedef struct remap_blkptr_cb_arg { 3627332525Smav blkptr_t *rbca_bp; 3628332525Smav spa_remap_cb_t rbca_cb; 3629332525Smav vdev_t *rbca_remap_vd; 3630332525Smav uint64_t rbca_remap_offset; 3631332525Smav void *rbca_cb_arg; 3632332525Smav} remap_blkptr_cb_arg_t; 3633332525Smav 3634332525Smavvoid 3635332525Smavremap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3636332525Smav uint64_t size, void *arg) 3637332525Smav{ 3638332525Smav remap_blkptr_cb_arg_t *rbca = arg; 3639332525Smav blkptr_t *bp = rbca->rbca_bp; 3640332525Smav 3641332525Smav /* We can not remap split blocks. */ 3642332525Smav if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) 3643332525Smav return; 3644332525Smav ASSERT0(inner_offset); 3645332525Smav 3646332525Smav if (rbca->rbca_cb != NULL) { 3647332525Smav /* 3648332525Smav * At this point we know that we are not handling split 3649332525Smav * blocks and we invoke the callback on the previous 3650332525Smav * vdev which must be indirect. 3651332525Smav */ 3652332525Smav ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); 3653332525Smav 3654332525Smav rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, 3655332525Smav rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); 3656332525Smav 3657332525Smav /* set up remap_blkptr_cb_arg for the next call */ 3658332525Smav rbca->rbca_remap_vd = vd; 3659332525Smav rbca->rbca_remap_offset = offset; 3660332525Smav } 3661332525Smav 3662332525Smav /* 3663332525Smav * The phys birth time is that of dva[0]. This ensures that we know 3664332525Smav * when each dva was written, so that resilver can determine which 3665332525Smav * blocks need to be scrubbed (i.e. those written during the time 3666332525Smav * the vdev was offline). It also ensures that the key used in 3667332525Smav * the ARC hash table is unique (i.e. dva[0] + phys_birth). If 3668332525Smav * we didn't change the phys_birth, a lookup in the ARC for a 3669332525Smav * remapped BP could find the data that was previously stored at 3670332525Smav * this vdev + offset. 
3671332525Smav */ 3672332525Smav vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, 3673332525Smav DVA_GET_VDEV(&bp->blk_dva[0])); 3674332525Smav vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; 3675332525Smav bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, 3676332525Smav DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); 3677332525Smav 3678332525Smav DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); 3679332525Smav DVA_SET_OFFSET(&bp->blk_dva[0], offset); 3680332525Smav} 3681332525Smav 3682168404Spjd/* 3683332525Smav * If the block pointer contains any indirect DVAs, modify them to refer to 3684332525Smav * concrete DVAs. Note that this will sometimes not be possible, leaving 3685332525Smav * the indirect DVA in place. This happens if the indirect DVA spans multiple 3686332525Smav * segments in the mapping (i.e. it is a "split block"). 3687332525Smav * 3688332525Smav * If the BP was remapped, calls the callback on the original dva (note the 3689332525Smav * callback can be called multiple times if the original indirect DVA refers 3690332525Smav * to another indirect DVA, etc). 3691332525Smav * 3692332525Smav * Returns TRUE if the BP was remapped. 3693168404Spjd */ 3694332525Smavboolean_t 3695332525Smavspa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) 3696168404Spjd{ 3697332525Smav remap_blkptr_cb_arg_t rbca; 3698332525Smav 3699332525Smav if (!zfs_remap_blkptr_enable) 3700332525Smav return (B_FALSE); 3701332525Smav 3702332525Smav if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) 3703332525Smav return (B_FALSE); 3704332525Smav 3705332525Smav /* 3706332525Smav * Dedup BP's can not be remapped, because ddt_phys_select() depends 3707332525Smav * on DVA[0] being the same in the BP as in the DDT (dedup table). 3708332525Smav */ 3709332525Smav if (BP_GET_DEDUP(bp)) 3710332525Smav return (B_FALSE); 3711332525Smav 3712332525Smav /* 3713332525Smav * Gang blocks can not be remapped, because 3714332525Smav * zio_checksum_gang_verifier() depends on the DVA[0] that's in 3715332525Smav * the BP used to read the gang block header (GBH) being the same 3716332525Smav * as the DVA[0] that we allocated for the GBH. 3717332525Smav */ 3718332525Smav if (BP_IS_GANG(bp)) 3719332525Smav return (B_FALSE); 3720332525Smav 3721332525Smav /* 3722332525Smav * Embedded BP's have no DVA to remap. 3723332525Smav */ 3724332525Smav if (BP_GET_NDVAS(bp) < 1) 3725332525Smav return (B_FALSE); 3726332525Smav 3727332525Smav /* 3728332525Smav * Note: we only remap dva[0]. If we remapped other dvas, we 3729332525Smav * would no longer know what their phys birth txg is. 3730332525Smav */ 3731332525Smav dva_t *dva = &bp->blk_dva[0]; 3732332525Smav 3733332525Smav uint64_t offset = DVA_GET_OFFSET(dva); 3734332525Smav uint64_t size = DVA_GET_ASIZE(dva); 3735332525Smav vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 3736332525Smav 3737332525Smav if (vd->vdev_ops->vdev_op_remap == NULL) 3738332525Smav return (B_FALSE); 3739332525Smav 3740332525Smav rbca.rbca_bp = bp; 3741332525Smav rbca.rbca_cb = callback; 3742332525Smav rbca.rbca_remap_vd = vd; 3743332525Smav rbca.rbca_remap_offset = offset; 3744332525Smav rbca.rbca_cb_arg = arg; 3745332525Smav 3746332525Smav /* 3747332525Smav * remap_blkptr_cb() will be called in order for each level of 3748332525Smav * indirection, until a concrete vdev is reached or a split block is 3749332525Smav * encountered. 
old_vd and old_offset are updated within the callback 3750332525Smav * as we go from the one indirect vdev to the next one (either concrete 3751332525Smav * or indirect again) in that order. 3752332525Smav */ 3753332525Smav vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); 3754332525Smav 3755332525Smav /* Check if the DVA wasn't remapped because it is a split block */ 3756332525Smav if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) 3757332525Smav return (B_FALSE); 3758332525Smav 3759332525Smav return (B_TRUE); 3760332525Smav} 3761332525Smav 3762332525Smav/* 3763332525Smav * Undo the allocation of a DVA which happened in the given transaction group. 3764332525Smav */ 3765332525Smavvoid 3766332525Smavmetaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 3767332525Smav{ 3768332525Smav metaslab_t *msp; 3769332525Smav vdev_t *vd; 3770168404Spjd uint64_t vdev = DVA_GET_VDEV(dva); 3771168404Spjd uint64_t offset = DVA_GET_OFFSET(dva); 3772168404Spjd uint64_t size = DVA_GET_ASIZE(dva); 3773168404Spjd 3774168404Spjd ASSERT(DVA_IS_VALID(dva)); 3775332525Smav ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3776168404Spjd 3777168404Spjd if (txg > spa_freeze_txg(spa)) 3778168404Spjd return; 3779168404Spjd 3780168404Spjd if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 3781168404Spjd (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 3782168404Spjd cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 3783168404Spjd (u_longlong_t)vdev, (u_longlong_t)offset); 3784168404Spjd ASSERT(0); 3785168404Spjd return; 3786168404Spjd } 3787168404Spjd 3788332525Smav ASSERT(!vd->vdev_removing); 3789332525Smav ASSERT(vdev_is_concrete(vd)); 3790332525Smav ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 3791332525Smav ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 3792168404Spjd 3793168404Spjd if (DVA_GET_GANG(dva)) 3794168404Spjd size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3795168404Spjd 3796332525Smav msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3797332525Smav 3798168404Spjd mutex_enter(&msp->ms_lock); 3799332547Smav range_tree_remove(msp->ms_allocating[txg & TXG_MASK], 3800332525Smav offset, size); 3801168404Spjd 3802332525Smav VERIFY(!msp->ms_condensing); 3803332525Smav VERIFY3U(offset, >=, msp->ms_start); 3804332525Smav VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 3805332547Smav VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, 3806332525Smav msp->ms_size); 3807332525Smav VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3808332525Smav VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 3809332547Smav range_tree_add(msp->ms_allocatable, offset, size); 3810168404Spjd mutex_exit(&msp->ms_lock); 3811168404Spjd} 3812168404Spjd 3813168404Spjd/* 3814332547Smav * Free the block represented by the given DVA. 
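 * Unlike metaslab_unalloc_dva() above, which undoes an allocation
 * made in a txg that has not yet synced, this path goes through
 * metaslab_free_impl() so that device removal, indirect vdevs and
 * checkpoint accounting are all handled.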
3815168404Spjd */ 3816332525Smavvoid 3817332547Smavmetaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 3818168404Spjd{ 3819168404Spjd uint64_t vdev = DVA_GET_VDEV(dva); 3820168404Spjd uint64_t offset = DVA_GET_OFFSET(dva); 3821168404Spjd uint64_t size = DVA_GET_ASIZE(dva); 3822332525Smav vdev_t *vd = vdev_lookup_top(spa, vdev); 3823168404Spjd 3824168404Spjd ASSERT(DVA_IS_VALID(dva)); 3825332525Smav ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3826168404Spjd 3827332525Smav if (DVA_GET_GANG(dva)) { 3828168404Spjd size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3829168404Spjd } 3830168404Spjd 3831332547Smav metaslab_free_impl(vd, offset, size, checkpoint); 3832168404Spjd} 3833168404Spjd 3834307277Smav/* 3835307277Smav * Reserve some allocation slots. The reservation system must be called 3836307277Smav * before we call into the allocator. If there aren't any available slots 3837307277Smav * then the I/O will be throttled until an I/O completes and its slots are 3838307277Smav * freed up. The function returns true if it was successful in placing 3839307277Smav * the reservation. 3840307277Smav */ 3841307277Smavboolean_t 3842339105Smavmetaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, 3843339105Smav zio_t *zio, int flags) 3844307277Smav{ 3845307277Smav uint64_t available_slots = 0; 3846307277Smav boolean_t slot_reserved = B_FALSE; 3847339105Smav uint64_t max = mc->mc_alloc_max_slots[allocator]; 3848307277Smav 3849307277Smav ASSERT(mc->mc_alloc_throttle_enabled); 3850307277Smav mutex_enter(&mc->mc_lock); 3851307277Smav 3852339105Smav uint64_t reserved_slots = 3853339105Smav refcount_count(&mc->mc_alloc_slots[allocator]); 3854339105Smav if (reserved_slots < max) 3855339105Smav available_slots = max - reserved_slots; 3856307277Smav 3857307277Smav if (slots <= available_slots || GANG_ALLOCATION(flags)) { 3858307277Smav /* 3859307277Smav * We reserve the slots individually so that we can unreserve 3860307277Smav * them individually when an I/O completes. 
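 * Each reservation is a reference on mc_alloc_slots tagged with the
 * zio, which is what lets metaslab_class_throttle_unreserve() drop
 * exactly the slots this zio is holding.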
3861307277Smav */ 3862307277Smav for (int d = 0; d < slots; d++) { 3863339105Smav reserved_slots = 3864339105Smav refcount_add(&mc->mc_alloc_slots[allocator], 3865339105Smav zio); 3866307277Smav } 3867307277Smav zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 3868307277Smav slot_reserved = B_TRUE; 3869307277Smav } 3870307277Smav 3871307277Smav mutex_exit(&mc->mc_lock); 3872307277Smav return (slot_reserved); 3873307277Smav} 3874307277Smav 3875307277Smavvoid 3876339105Smavmetaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, 3877339105Smav int allocator, zio_t *zio) 3878307277Smav{ 3879307277Smav ASSERT(mc->mc_alloc_throttle_enabled); 3880307277Smav mutex_enter(&mc->mc_lock); 3881307277Smav for (int d = 0; d < slots; d++) { 3882339105Smav (void) refcount_remove(&mc->mc_alloc_slots[allocator], 3883339105Smav zio); 3884307277Smav } 3885307277Smav mutex_exit(&mc->mc_lock); 3886307277Smav} 3887307277Smav 3888332525Smavstatic int 3889332525Smavmetaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 3890332525Smav uint64_t txg) 3891332525Smav{ 3892332525Smav metaslab_t *msp; 3893332525Smav spa_t *spa = vd->vdev_spa; 3894332525Smav int error = 0; 3895332525Smav 3896332525Smav if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) 3897332525Smav return (ENXIO); 3898332525Smav 3899332525Smav ASSERT3P(vd->vdev_ms, !=, NULL); 3900332525Smav msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3901332525Smav 3902332525Smav mutex_enter(&msp->ms_lock); 3903332525Smav 3904332525Smav if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 3905339105Smav error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); 3906339105Smav /* 3907339105Smav * No need to fail in that case; someone else has activated the 3908339105Smav * metaslab, but that doesn't preclude us from using it. 
3909339105Smav */ 3910339105Smav if (error == EBUSY) 3911339105Smav error = 0; 3912332525Smav 3913332547Smav if (error == 0 && 3914332547Smav !range_tree_contains(msp->ms_allocatable, offset, size)) 3915332525Smav error = SET_ERROR(ENOENT); 3916332525Smav 3917332525Smav if (error || txg == 0) { /* txg == 0 indicates dry run */ 3918332525Smav mutex_exit(&msp->ms_lock); 3919332525Smav return (error); 3920332525Smav } 3921332525Smav 3922332525Smav VERIFY(!msp->ms_condensing); 3923332525Smav VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3924332525Smav VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 3925332547Smav VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, 3926332547Smav msp->ms_size); 3927332547Smav range_tree_remove(msp->ms_allocatable, offset, size); 3928332525Smav 3929332525Smav if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 3930332547Smav if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 3931332525Smav vdev_dirty(vd, VDD_METASLAB, msp, txg); 3932332547Smav range_tree_add(msp->ms_allocating[txg & TXG_MASK], 3933332547Smav offset, size); 3934332525Smav } 3935332525Smav 3936332525Smav mutex_exit(&msp->ms_lock); 3937332525Smav 3938332525Smav return (0); 3939332525Smav} 3940332525Smav 3941332525Smavtypedef struct metaslab_claim_cb_arg_t { 3942332525Smav uint64_t mcca_txg; 3943332525Smav int mcca_error; 3944332525Smav} metaslab_claim_cb_arg_t; 3945332525Smav 3946332525Smav/* ARGSUSED */ 3947332525Smavstatic void 3948332525Smavmetaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3949332525Smav uint64_t size, void *arg) 3950332525Smav{ 3951332525Smav metaslab_claim_cb_arg_t *mcca_arg = arg; 3952332525Smav 3953332525Smav if (mcca_arg->mcca_error == 0) { 3954332525Smav mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, 3955332525Smav size, mcca_arg->mcca_txg); 3956332525Smav } 3957332525Smav} 3958332525Smav 3959168404Spjdint 3960332525Smavmetaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 3961332525Smav{ 3962332525Smav if (vd->vdev_ops->vdev_op_remap != NULL) { 3963332525Smav metaslab_claim_cb_arg_t arg; 3964332525Smav 3965332525Smav /* 3966332525Smav * Only zdb(1M) can claim on indirect vdevs. This is used 3967332525Smav * to detect leaks of mapped space (that are not accounted 3968332525Smav * for in the obsolete counts, spacemap, or bpobj). 3969332525Smav */ 3970332525Smav ASSERT(!spa_writeable(vd->vdev_spa)); 3971332525Smav arg.mcca_error = 0; 3972332525Smav arg.mcca_txg = txg; 3973332525Smav 3974332525Smav vd->vdev_ops->vdev_op_remap(vd, offset, size, 3975332525Smav metaslab_claim_impl_cb, &arg); 3976332525Smav 3977332525Smav if (arg.mcca_error == 0) { 3978332525Smav arg.mcca_error = metaslab_claim_concrete(vd, 3979332525Smav offset, size, txg); 3980332525Smav } 3981332525Smav return (arg.mcca_error); 3982332525Smav } else { 3983332525Smav return (metaslab_claim_concrete(vd, offset, size, txg)); 3984332525Smav } 3985332525Smav} 3986332525Smav 3987332525Smav/* 3988332525Smav * Intent log support: upon opening the pool after a crash, notify the SPA 3989332525Smav * of blocks that the intent log has allocated for immediate write, but 3990332525Smav * which are still considered free by the SPA because the last transaction 3991332525Smav * group didn't commit yet. 
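 * Claiming such a block removes it from ms_allocatable again (a txg
 * of 0 is treated as a dry run), so it cannot be handed out a
 * second time.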
/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
		return (SET_ERROR(ENXIO));
	}

	ASSERT(DVA_IS_VALID(dva));

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	return (metaslab_claim_impl(vd, offset, size, txg));
}
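
/*
 * A claim is therefore a replay of an allocation that already happened:
 * metaslab_claim_concrete() removes the extent from ms_allocatable again and
 * re-adds it to the allocating tree for the claiming txg.  A sketch of how a
 * log claimer might drive this interface while the pool is opening; the
 * claim_txg variable and the error handling are illustrative, the real
 * callers live in the ZIL code rather than in this file:
 *
 *	blkptr_t *bp;		// block the intent log allocated pre-crash
 *	uint64_t claim_txg;	// txg in which the claim will sync out
 *
 *	int err = metaslab_claim(spa, bp, claim_txg);
 *	if (err != 0) {
 *		// the block can no longer be claimed (e.g. missing vdev),
 *		// so the log record referencing it cannot be replayed
 *	}
 */
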
int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
    zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
	ASSERT3P(zal, !=, NULL);

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags, zal, allocator);
		if (error != 0) {
			for (d--; d >= 0; d--) {
				metaslab_unalloc_dva(spa, &dva[d], txg);
				metaslab_group_alloc_decrement(spa,
				    DVA_GET_VDEV(&dva[d]), zio, flags,
				    allocator, B_FALSE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		} else {
			/*
			 * Update the metaslab group's queue depth
			 * based on the newly allocated dva.
			 */
			metaslab_group_alloc_increment(spa,
			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
}

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	/*
	 * If we have a checkpoint for the pool we need to make sure that
	 * the blocks that we free that are part of the checkpoint won't be
	 * reused until the checkpoint is discarded or we revert to it.
	 *
	 * The checkpoint flag is passed down the metaslab_free code path
	 * and is set whenever we want to add a block to the checkpoint's
	 * accounting. That is, we "checkpoint" blocks that existed at the
	 * time the checkpoint was created and are therefore referenced by
	 * the checkpointed uberblock.
	 *
	 * Note that we don't checkpoint any blocks if the current
	 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
	 * normally as they will be referenced by the checkpointed uberblock.
	 */
	boolean_t checkpoint = B_FALSE;
	if (bp->blk_birth <= spa->spa_checkpoint_txg &&
	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
		/*
		 * At this point, if the block is part of the checkpoint
		 * there is no way it was created in the current txg.
		 */
		ASSERT(!now);
		ASSERT3U(spa_syncing_txg(spa), ==, txg);
		checkpoint = B_TRUE;
	}

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++) {
		if (now) {
			metaslab_unalloc_dva(spa, &dva[d], txg);
		} else {
			ASSERT3U(txg, ==, spa_syncing_txg(spa));
			metaslab_free_dva(spa, &dva[d], checkpoint);
		}
	}

	spa_config_exit(spa, SCL_FREE, FTAG);
}
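
/*
 * The checkpoint decision above boils down to a pure predicate on three txg
 * values.  Restated as a small helper for clarity (a sketch only; the code
 * above keeps the test inline rather than in a function):
 *
 *	static boolean_t
 *	free_goes_to_checkpoint(uint64_t blk_birth, uint64_t checkpoint_txg,
 *	    uint64_t syncing_txg)
 *	{
 *		// Only blocks born at or before the checkpoint txg can be
 *		// referenced by the checkpointed uberblock ...
 *		if (blk_birth > checkpoint_txg)
 *			return (B_FALSE);
 *		// ... and only frees syncing after the checkpoint txg need
 *		// to be diverted into checkpoint accounting.
 *		return (syncing_txg > checkpoint_txg);
 *	}
 */
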
int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}

/* ARGSUSED */
static void
metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	if (vd->vdev_ops == &vdev_indirect_ops)
		return;

	metaslab_check_free_impl(vd, offset, size);
}

static void
metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;

	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	if (vd->vdev_ops->vdev_op_remap != NULL) {
		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_check_free_impl_cb, NULL);
		return;
	}

	ASSERT(vdev_is_concrete(vd));
	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);
	if (msp->ms_loaded)
		range_tree_verify(msp->ms_allocatable, offset, size);

	range_tree_verify(msp->ms_freeing, offset, size);
	range_tree_verify(msp->ms_checkpointing, offset, size);
	range_tree_verify(msp->ms_freed, offset, size);
	for (int j = 0; j < TXG_DEFER_SIZE; j++)
		range_tree_verify(msp->ms_defer[j], offset, size);
	mutex_exit(&msp->ms_lock);
}

void
metaslab_check_free(spa_t *spa, const blkptr_t *bp)
{
	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		vdev_t *vd = vdev_lookup_top(spa, vdev);
		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);

		if (DVA_GET_GANG(&bp->blk_dva[i]))
			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

		ASSERT3P(vd, !=, NULL);

		metaslab_check_free_impl(vd, offset, size);
	}
	spa_config_exit(spa, SCL_VDEV, FTAG);
}
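
/*
 * metaslab_check_free() above is purely a debugging aid: it does nothing
 * unless ZFS_DEBUG_ZIO_FREE is set in zfs_flags, and when it is set it
 * verifies that none of the block's DVAs overlap an extent the metaslab
 * still considers allocatable, freeing, freed, checkpointed, or deferred
 * (each range_tree_verify() call trips an assertion on overlap).  A sketch
 * of how a free path would use it; free_block_example() is an illustrative
 * name, the real callers are in the zio layer:
 *
 *	static void
 *	free_block_example(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 *	{
 *		// cheap when the debug flag is off, catches double frees
 *		// when it is on
 *		metaslab_check_free(spa, bp);
 *		metaslab_free(spa, bp, txg, B_FALSE);
 *	}
 */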