Deleted Added
full compact
metaslab.c (219089) metaslab.c (224177)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 6 unchanged lines hidden (view full) ---

15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 6 unchanged lines hidden (view full) ---

15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 by Delphix. All rights reserved.
23 */
24
25#include <sys/zfs_context.h>
26#include <sys/dmu.h>
27#include <sys/dmu_tx.h>
28#include <sys/space_map.h>
29#include <sys/metaslab_impl.h>
30#include <sys/vdev_impl.h>
31#include <sys/zio.h>
32
24 */
25
26#include <sys/zfs_context.h>
27#include <sys/dmu.h>
28#include <sys/dmu_tx.h>
29#include <sys/space_map.h>
30#include <sys/metaslab_impl.h>
31#include <sys/vdev_impl.h>
32#include <sys/zio.h>
33
34/*
35 * Allow allocations to switch to gang blocks quickly. We do this to
36 * avoid having to load lots of space_maps in a given txg. There are,
37 * however, some cases where we want to avoid "fast" ganging and instead
38 * we want to do an exhaustive search of all metaslabs on this device.
39 * Currently we don't allow any gang or dump device related allocations
40 * to "fast" gang.
41 */
42#define CAN_FASTGANG(flags) \
43 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
44 METASLAB_GANG_AVOID)))
45
33uint64_t metaslab_aliquot = 512ULL << 10;
34uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
35
36/*
46uint64_t metaslab_aliquot = 512ULL << 10;
47uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
48
49/*
50 * This value defines the number of allowed allocation failures per vdev.
51 * If a device reaches this threshold in a given txg then we consider skipping
52 * allocations on that device.
53 */
54int zfs_mg_alloc_failures = 0;
55
56SYSCTL_DECL(_vfs_zfs);
57SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RDTUN,
58 &zfs_mg_alloc_failures, 0,
59 "Number of allowed allocation failures per vdev");
60TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures);
61
62/*
37 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
38 */
39static int metaslab_debug = 0;
40
41/*
42 * Minimum size which forces the dynamic allocator to change
43 * its allocation strategy. Once the space map cannot satisfy
44 * an allocation of this size then it switches to using more

--- 621 unchanged lines hidden (view full) ---

666 metaslab_pp_unload,
667 metaslab_ndf_alloc,
668 metaslab_pp_claim,
669 metaslab_pp_free,
670 metaslab_pp_maxsize,
671 metaslab_ndf_fragmented
672};
673
63 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
64 */
65static int metaslab_debug = 0;
66
67/*
68 * Minimum size which forces the dynamic allocator to change
69 * its allocation strategy. Once the space map cannot satisfy
70 * an allocation of this size then it switches to using more

--- 621 unchanged lines hidden (view full) ---

692 metaslab_pp_unload,
693 metaslab_ndf_alloc,
694 metaslab_pp_claim,
695 metaslab_pp_free,
696 metaslab_pp_maxsize,
697 metaslab_ndf_fragmented
698};
699
674space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
700space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
675
676/*
677 * ==========================================================================
678 * Metaslabs
679 * ==========================================================================
680 */
681metaslab_t *
682metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,

--- 156 unchanged lines hidden (view full) ---

839 0ULL, smo->smo_objsize);
840 mutex_enter(&mg->mg_lock);
841 }
842 }
843 mutex_exit(&mg->mg_lock);
844}
845
846static int
701
702/*
703 * ==========================================================================
704 * Metaslabs
705 * ==========================================================================
706 */
707metaslab_t *
708metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,

--- 156 unchanged lines hidden (view full) ---

865 0ULL, smo->smo_objsize);
866 mutex_enter(&mg->mg_lock);
867 }
868 }
869 mutex_exit(&mg->mg_lock);
870}
871
872static int
847metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
873metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
848{
849 metaslab_group_t *mg = msp->ms_group;
850 space_map_t *sm = &msp->ms_map;
851 space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
852
853 ASSERT(MUTEX_HELD(&msp->ms_lock));
854
855 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {

--- 16 unchanged lines hidden (view full) ---

872 * Track the bonus area as we activate new metaslabs.
873 */
874 if (sm->sm_start > mg->mg_bonus_area) {
875 mutex_enter(&mg->mg_lock);
876 mg->mg_bonus_area = sm->sm_start;
877 mutex_exit(&mg->mg_lock);
878 }
879
874{
875 metaslab_group_t *mg = msp->ms_group;
876 space_map_t *sm = &msp->ms_map;
877 space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
878
879 ASSERT(MUTEX_HELD(&msp->ms_lock));
880
881 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {

--- 16 unchanged lines hidden (view full) ---

898 * Track the bonus area as we activate new metaslabs.
899 */
900 if (sm->sm_start > mg->mg_bonus_area) {
901 mutex_enter(&mg->mg_lock);
902 mg->mg_bonus_area = sm->sm_start;
903 mutex_exit(&mg->mg_lock);
904 }
905
880 /*
881 * If we were able to load the map then make sure
882 * that this map is still able to satisfy our request.
883 */
884 if (msp->ms_weight < size)
885 return (ENOSPC);
886
887 metaslab_group_sort(msp->ms_group, msp,
888 msp->ms_weight | activation_weight);
889 }
890 ASSERT(sm->sm_loaded);
891 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
892
893 return (0);
894}

--- 199 unchanged lines hidden (view full) ---

1094
1095 mutex_exit(&msp->ms_lock);
1096}
1097
1098void
1099metaslab_sync_reassess(metaslab_group_t *mg)
1100{
1101 vdev_t *vd = mg->mg_vd;
906 metaslab_group_sort(msp->ms_group, msp,
907 msp->ms_weight | activation_weight);
908 }
909 ASSERT(sm->sm_loaded);
910 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
911
912 return (0);
913}

--- 199 unchanged lines hidden (view full) ---

1113
1114 mutex_exit(&msp->ms_lock);
1115}
1116
1117void
1118metaslab_sync_reassess(metaslab_group_t *mg)
1119{
1120 vdev_t *vd = mg->mg_vd;
1121 int64_t failures = mg->mg_alloc_failures;
1102
1103 /*
1104 * Re-evaluate all metaslabs which have lower offsets than the
1105 * bonus area.
1106 */
1107 for (int m = 0; m < vd->vdev_ms_count; m++) {
1108 metaslab_t *msp = vd->vdev_ms[m];
1109
1110 if (msp->ms_map.sm_start > mg->mg_bonus_area)
1111 break;
1112
1113 mutex_enter(&msp->ms_lock);
1114 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1115 mutex_exit(&msp->ms_lock);
1116 }
1117
1122
1123 /*
1124 * Re-evaluate all metaslabs which have lower offsets than the
1125 * bonus area.
1126 */
1127 for (int m = 0; m < vd->vdev_ms_count; m++) {
1128 metaslab_t *msp = vd->vdev_ms[m];
1129
1130 if (msp->ms_map.sm_start > mg->mg_bonus_area)
1131 break;
1132
1133 mutex_enter(&msp->ms_lock);
1134 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1135 mutex_exit(&msp->ms_lock);
1136 }
1137
1138 atomic_add_64(&mg->mg_alloc_failures, -failures);
1139
1118 /*
1119 * Prefetch the next potential metaslabs
1120 */
1121 metaslab_prefetch(mg);
1122}
1123
1124static uint64_t
1125metaslab_distance(metaslab_t *msp, dva_t *dva)

--- 8 unchanged lines hidden (view full) ---

1134 if (offset < start)
1135 return ((start - offset) << ms_shift);
1136 if (offset > start)
1137 return ((offset - start) << ms_shift);
1138 return (0);
1139}
1140
1141static uint64_t
1140 /*
1141 * Prefetch the next potential metaslabs
1142 */
1143 metaslab_prefetch(mg);
1144}
1145
1146static uint64_t
1147metaslab_distance(metaslab_t *msp, dva_t *dva)

--- 8 unchanged lines hidden (view full) ---

1156 if (offset < start)
1157 return ((start - offset) << ms_shift);
1158 if (offset > start)
1159 return ((offset - start) << ms_shift);
1160 return (0);
1161}
1162
1163static uint64_t
1142metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
1143 uint64_t min_distance, dva_t *dva, int d)
1164metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1165 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1144{
1166{
1167 spa_t *spa = mg->mg_vd->vdev_spa;
1145 metaslab_t *msp = NULL;
1146 uint64_t offset = -1ULL;
1147 avl_tree_t *t = &mg->mg_metaslab_tree;
1148 uint64_t activation_weight;
1149 uint64_t target_distance;
1150 int i;
1151
1152 activation_weight = METASLAB_WEIGHT_PRIMARY;

--- 4 unchanged lines hidden (view full) ---

1157 }
1158 }
1159
1160 for (;;) {
1161 boolean_t was_active;
1162
1163 mutex_enter(&mg->mg_lock);
1164 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1168 metaslab_t *msp = NULL;
1169 uint64_t offset = -1ULL;
1170 avl_tree_t *t = &mg->mg_metaslab_tree;
1171 uint64_t activation_weight;
1172 uint64_t target_distance;
1173 int i;
1174
1175 activation_weight = METASLAB_WEIGHT_PRIMARY;

--- 4 unchanged lines hidden (view full) ---

1180 }
1181 }
1182
1183 for (;;) {
1184 boolean_t was_active;
1185
1186 mutex_enter(&mg->mg_lock);
1187 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1165 if (msp->ms_weight < size) {
1188 if (msp->ms_weight < asize) {
1189 spa_dbgmsg(spa, "%s: failed to meet weight "
1190 "requirement: vdev %llu, txg %llu, mg %p, "
1191 "msp %p, psize %llu, asize %llu, "
1192 "failures %llu, weight %llu",
1193 spa_name(spa), mg->mg_vd->vdev_id, txg,
1194 mg, msp, psize, asize,
1195 mg->mg_alloc_failures, msp->ms_weight);
1166 mutex_exit(&mg->mg_lock);
1167 return (-1ULL);
1168 }
1196 mutex_exit(&mg->mg_lock);
1197 return (-1ULL);
1198 }
1169
1170 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1171 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1172 break;
1173
1174 target_distance = min_distance +
1175 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1176
1177 for (i = 0; i < d; i++)
1178 if (metaslab_distance(msp, &dva[i]) <
1179 target_distance)
1180 break;
1181 if (i == d)
1182 break;
1183 }
1184 mutex_exit(&mg->mg_lock);
1185 if (msp == NULL)
1186 return (-1ULL);
1187
1199 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1200 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1201 break;
1202
1203 target_distance = min_distance +
1204 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1205
1206 for (i = 0; i < d; i++)
1207 if (metaslab_distance(msp, &dva[i]) <
1208 target_distance)
1209 break;
1210 if (i == d)
1211 break;
1212 }
1213 mutex_exit(&mg->mg_lock);
1214 if (msp == NULL)
1215 return (-1ULL);
1216
1217 /*
1218 * If we've already reached the allowable number of failed
1219 * allocation attempts on this metaslab group then we
1220 * consider skipping it. We skip it only if we're allowed
1221 * to "fast" gang, the physical size is larger than
1222 * a gang block, and we're attempting to allocate from
1223 * the primary metaslab.
1224 */
1225 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1226 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1227 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1228 spa_dbgmsg(spa, "%s: skipping metaslab group: "
1229 "vdev %llu, txg %llu, mg %p, psize %llu, "
1230 "asize %llu, failures %llu", spa_name(spa),
1231 mg->mg_vd->vdev_id, txg, mg, psize, asize,
1232 mg->mg_alloc_failures);
1233 return (-1ULL);
1234 }
1235
1188 mutex_enter(&msp->ms_lock);
1189
1190 /*
1191 * Ensure that the metaslab we have selected is still
1192 * capable of handling our request. It's possible that
1193 * another thread may have changed the weight while we
1194 * were blocked on the metaslab lock.
1195 */
1236 mutex_enter(&msp->ms_lock);
1237
1238 /*
1239 * Ensure that the metaslab we have selected is still
1240 * capable of handling our request. It's possible that
1241 * another thread may have changed the weight while we
1242 * were blocked on the metaslab lock.
1243 */
1196 if (msp->ms_weight < size || (was_active &&
1244 if (msp->ms_weight < asize || (was_active &&
1197 !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1198 activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1199 mutex_exit(&msp->ms_lock);
1200 continue;
1201 }
1202
1203 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1204 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1205 metaslab_passivate(msp,
1206 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1207 mutex_exit(&msp->ms_lock);
1208 continue;
1209 }
1210
1245 !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1246 activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1247 mutex_exit(&msp->ms_lock);
1248 continue;
1249 }
1250
1251 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1252 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1253 metaslab_passivate(msp,
1254 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1255 mutex_exit(&msp->ms_lock);
1256 continue;
1257 }
1258
1211 if (metaslab_activate(msp, activation_weight, size) != 0) {
1259 if (metaslab_activate(msp, activation_weight) != 0) {
1212 mutex_exit(&msp->ms_lock);
1213 continue;
1214 }
1215
1260 mutex_exit(&msp->ms_lock);
1261 continue;
1262 }
1263
1216 if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
1264 if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
1217 break;
1218
1265 break;
1266
1267 atomic_inc_64(&mg->mg_alloc_failures);
1268
1219 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
1220
1221 mutex_exit(&msp->ms_lock);
1222 }
1223
1224 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1225 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1226
1269 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
1270
1271 mutex_exit(&msp->ms_lock);
1272 }
1273
1274 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1275 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1276
1227 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
1277 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
1228
1229 mutex_exit(&msp->ms_lock);
1230
1231 return (offset);
1232}
1233
1234/*
1235 * Allocate a block for the specified i/o.

--- 110 unchanged lines hidden (view full) ---

1346 if (distance <= (1ULL << vd->vdev_ms_shift))
1347 distance = 0;
1348 else
1349 all_zero = B_FALSE;
1350
1351 asize = vdev_psize_to_asize(vd, psize);
1352 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1353
1278
1279 mutex_exit(&msp->ms_lock);
1280
1281 return (offset);
1282}
1283
1284/*
1285 * Allocate a block for the specified i/o.

--- 110 unchanged lines hidden (view full) ---

1396 if (distance <= (1ULL << vd->vdev_ms_shift))
1397 distance = 0;
1398 else
1399 all_zero = B_FALSE;
1400
1401 asize = vdev_psize_to_asize(vd, psize);
1402 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1403
1354 offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
1404 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1405 dva, d, flags);
1355 if (offset != -1ULL) {
1356 /*
1357 * If we've just selected this metaslab group,
1358 * figure out whether the corresponding vdev is
1359 * over- or under-used relative to the pool,
1360 * and set an allocation bias to even it out.
1361 */
1362 if (mc->mc_aliquot == 0) {
1363 vdev_stat_t *vs = &vd->vdev_stat;
1364 int64_t vu, cu;
1365
1406 if (offset != -1ULL) {
1407 /*
1408 * If we've just selected this metaslab group,
1409 * figure out whether the corresponding vdev is
1410 * over- or under-used relative to the pool,
1411 * and set an allocation bias to even it out.
1412 */
1413 if (mc->mc_aliquot == 0) {
1414 vdev_stat_t *vs = &vd->vdev_stat;
1415 int64_t vu, cu;
1416
1366 /*
1367 * Determine percent used in units of 0..1024.
1368 * (This is just to avoid floating point.)
1369 */
1370 vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
1371 cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
1417 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1418 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1372
1373 /*
1419
1420 /*
1374 * Bias by at most +/- 25% of the aliquot.
1421 * Calculate how much more or less we should
1422 * try to allocate from this device during
1423 * this iteration around the rotor.
1424 * For example, if a device is 80% full
1425 * and the pool is 20% full then we should
1426 * reduce allocations by 60% on this device.
1427 *
1428 * mg_bias = (20 - 80) * 512K / 100 = -307K
1429 *
1430 * This reduces allocations by 307K for this
1431 * iteration.
1375 */
1376 mg->mg_bias = ((cu - vu) *
1432 */
1433 mg->mg_bias = ((cu - vu) *
1377 (int64_t)mg->mg_aliquot) / (1024 * 4);
1434 (int64_t)mg->mg_aliquot) / 100;
1378 }
1379
1380 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1381 mg->mg_aliquot + mg->mg_bias) {
1382 mc->mc_rotor = mg->mg_next;
1383 mc->mc_aliquot = 0;
1384 }
1385

--- 97 unchanged lines hidden (view full) ---

1483 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1484
1485 if (DVA_GET_GANG(dva))
1486 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1487
1488 mutex_enter(&msp->ms_lock);
1489
1490 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
1435 }
1436
1437 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1438 mg->mg_aliquot + mg->mg_bias) {
1439 mc->mc_rotor = mg->mg_next;
1440 mc->mc_aliquot = 0;
1441 }
1442

--- 97 unchanged lines hidden (view full) ---

1540 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1541
1542 if (DVA_GET_GANG(dva))
1543 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1544
1545 mutex_enter(&msp->ms_lock);
1546
1547 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
1491 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
1548 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
1492
1493 if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
1494 error = ENOENT;
1495
1496 if (error || txg == 0) { /* txg == 0 indicates dry run */
1497 mutex_exit(&msp->ms_lock);
1498 return (error);
1499 }

--- 105 unchanged lines hidden ---
1549
1550 if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
1551 error = ENOENT;
1552
1553 if (error || txg == 0) { /* txg == 0 indicates dry run */
1554 mutex_exit(&msp->ms_lock);
1555 return (error);
1556 }

--- 105 unchanged lines hidden ---