old (219089) new (224177)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 6 unchanged lines hidden ---

 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

/*
 * Allow allocations to switch to gang blocks quickly. We do this to
 * avoid having to load lots of space_maps in a given txg. There are,
 * however, some cases where we want to avoid "fast" ganging and instead
 * do an exhaustive search of all metaslabs on this device.
 * Currently we don't allow any gang- or dump-device-related allocations
 * to "fast" gang.
 */
#define	CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))
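
/*
 * Illustrative sketch (not part of the original source): assuming the
 * METASLAB_GANG_* values are distinct flag bits as defined in the
 * metaslab headers, the predicate accepts only allocations that carry
 * none of the gang/dump flags.  The guard macro and function name
 * below are hypothetical.
 */
#ifdef METASLAB_EXAMPLES
static void
can_fastgang_example(void)
{
	ASSERT(CAN_FASTGANG(0));			/* ordinary write */
	ASSERT(!CAN_FASTGANG(METASLAB_GANG_CHILD));	/* gang member */
	ASSERT(!CAN_FASTGANG(METASLAB_GANG_HEADER));	/* gang header */
	ASSERT(!CAN_FASTGANG(METASLAB_GANG_AVOID));	/* dump device */
}
#endif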

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * This value defines the number of allowed allocation failures per vdev.
 * If a device reaches this threshold in a given txg then we consider skipping
 * allocations on that device.
 */
int zfs_mg_alloc_failures = 0;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RDTUN,
    &zfs_mg_alloc_failures, 0,
    "Number of allowed allocation failures per vdev");
TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures);
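
/*
 * Usage note (illustrative, not in the original source): CTLFLAG_RDTUN
 * makes the sysctl read-only at runtime but settable at boot, e.g. via
 * a /boot/loader.conf line such as vfs.zfs.mg_alloc_failures="10"; the
 * TUNABLE_INT() registration above is what picks that value up.  The
 * example value 10 is hypothetical.
 */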

/*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 */
static int metaslab_debug = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size, it switches to using more

--- 621 unchanged lines hidden ---

	metaslab_pp_unload,
	metaslab_ndf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ndf_fragmented
};

space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,

--- 156 unchanged lines hidden ---

			    0ULL, smo->smo_objsize);
			mutex_enter(&mg->mg_lock);
		}
	}
	mutex_exit(&mg->mg_lock);
}

static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {

--- 16 unchanged lines hidden ---

		 * Track the bonus area as we activate new metaslabs.
		 */
		if (sm->sm_start > mg->mg_bonus_area) {
			mutex_enter(&mg->mg_lock);
			mg->mg_bonus_area = sm->sm_start;
			mutex_exit(&mg->mg_lock);
		}

		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(sm->sm_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

--- 199 unchanged lines hidden ---


	mutex_exit(&msp->ms_lock);
}

void
metaslab_sync_reassess(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	int64_t failures = mg->mg_alloc_failures;

	/*
	 * Re-evaluate all metaslabs which have lower offsets than the
	 * bonus area.
	 */
	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_map.sm_start > mg->mg_bonus_area)
			break;

		mutex_enter(&msp->ms_lock);
		metaslab_group_sort(mg, msp, metaslab_weight(msp));
		mutex_exit(&msp->ms_lock);
	}

	atomic_add_64(&mg->mg_alloc_failures, -failures);

	/*
	 * Prefetch the next potential metaslabs.
	 */
	metaslab_prefetch(mg);
}

static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)

--- 8 unchanged lines hidden ---

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}
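
/*
 * Worked example (illustrative, not in the original source): with a
 * hypothetical vdev_ms_shift of 30 (1GB metaslabs), a DVA at byte
 * offset 5GB and a metaslab starting at byte offset 2GB reduce to
 * metaslab indices 5 and 2 (the offset and start computed in the
 * hidden lines above), so the distance returned is (5 - 2) << 30,
 * i.e. 3GB.
 */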

static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;

--- 4 unchanged lines hidden ---

		}
	}

	for (;;) {
		boolean_t was_active;

		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			if (msp->ms_weight < asize) {
				spa_dbgmsg(spa, "%s: failed to meet weight "
				    "requirement: vdev %llu, txg %llu, mg %p, "
				    "msp %p, psize %llu, asize %llu, "
				    "failures %llu, weight %llu",
				    spa_name(spa), mg->mg_vd->vdev_id, txg,
				    mg, msp, psize, asize,
				    mg->mg_alloc_failures, msp->ms_weight);
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}
			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			target_distance = min_distance +
			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		/*
		 * If we've already reached the allowable number of failed
		 * allocation attempts on this metaslab group then we
		 * consider skipping it. We skip it only if we're allowed
		 * to "fast" gang, the physical size is larger than
		 * a gang block, and we're attempting to allocate from
		 * the primary metaslab.
		 */
		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			spa_dbgmsg(spa, "%s: skipping metaslab group: "
			    "vdev %llu, txg %llu, mg %p, psize %llu, "
			    "asize %llu, failures %llu", spa_name(spa),
			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
			    mg->mg_alloc_failures);
			return (-1ULL);
		}
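
		/*
		 * Illustration (hypothetical numbers, not in the original
		 * source): assuming zfs_mg_alloc_failures were 8 and
		 * SPA_GANGBLOCKSIZE is the 512-byte SPA_MINBLOCKSIZE,
		 * then after a 9th failure in this group every subsequent
		 * 128K primary, non-gang allocation would return -1ULL
		 * here, letting the caller move on to another group or
		 * gang the block instead of searching more metaslabs.
		 */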

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < asize || (was_active &&
		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
			break;

		atomic_inc_64(&mg->mg_alloc_failures);

		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));

		mutex_exit(&msp->ms_lock);
	}

	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);

	mutex_exit(&msp->ms_lock);

	return (offset);
}

/*
 * Allocate a block for the specified i/o.

--- 110 unchanged lines hidden ---

		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
		    dva, d, flags);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 * For example, if a device is 80% full
				 * and the pool is 20% full then we should
				 * reduce allocations by 60% on this device.
				 *
				 * mg_bias = (20 - 80) * 512K / 100 = -307K
				 *
				 * This reduces allocations by 307K for this
				 * iteration.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / 100;
			}

			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}
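
			/*
			 * Continuing the worked example above (hypothetical
			 * numbers, not in the original source): with
			 * mg_aliquot == 512K and mg_bias == -307K, the
			 * rotor advances to the next group after only about
			 * 205K has been allocated here, rather than the
			 * full 512K aliquot.
			 */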

--- 97 unchanged lines hidden ---

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
		error = ENOENT;

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

--- 105 unchanged lines hidden ---