metaslab.c (219089) | metaslab.c (224177) |
---|---|
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 6 unchanged lines hidden (view full) --- 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | 1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 6 unchanged lines hidden (view full) --- 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
23 * Copyright (c) 2011 by Delphix. All rights reserved. |
|
23 */ 24 25#include <sys/zfs_context.h> 26#include <sys/dmu.h> 27#include <sys/dmu_tx.h> 28#include <sys/space_map.h> 29#include <sys/metaslab_impl.h> 30#include <sys/vdev_impl.h> 31#include <sys/zio.h> 32 | 24 */ 25 26#include <sys/zfs_context.h> 27#include <sys/dmu.h> 28#include <sys/dmu_tx.h> 29#include <sys/space_map.h> 30#include <sys/metaslab_impl.h> 31#include <sys/vdev_impl.h> 32#include <sys/zio.h> 33 |
34/* 35 * Allow allocations to switch to gang blocks quickly. We do this to 36 * avoid having to load lots of space_maps in a given txg. There are, 37 * however, some cases where we want to avoid "fast" ganging and instead 38 * we want to do an exhaustive search of all metaslabs on this device. 39 * Currently we don't allow any gang or dump device related allocations 40 * to "fast" gang. 41 */ 42#define CAN_FASTGANG(flags) \ 43 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 44 METASLAB_GANG_AVOID))) 45 |
|
33uint64_t metaslab_aliquot = 512ULL << 10; 34uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 35 36/* | 46uint64_t metaslab_aliquot = 512ULL << 10; 47uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 48 49/* |
50 * This value defines the number of allowed allocation failures per vdev. 51 * If a device reaches this threshold in a given txg then we consider skipping 52 * allocations on that device. 53 */ 54int zfs_mg_alloc_failures = 0; 55 56SYSCTL_DECL(_vfs_zfs); 57SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RDTUN, 58 &zfs_mg_alloc_failures, 0, 59 "Number of allowed allocation failures per vdev"); 60TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures); 61 62/* |
|
37 * Metaslab debugging: when set, keeps all space maps in core to verify frees. 38 */ 39static int metaslab_debug = 0; 40 41/* 42 * Minimum size which forces the dynamic allocator to change 43 * its allocation strategy. Once the space map cannot satisfy 44 * an allocation of this size then it switches to using more --- 621 unchanged lines hidden (view full) --- 666 metaslab_pp_unload, 667 metaslab_ndf_alloc, 668 metaslab_pp_claim, 669 metaslab_pp_free, 670 metaslab_pp_maxsize, 671 metaslab_ndf_fragmented 672}; 673 | 63 * Metaslab debugging: when set, keeps all space maps in core to verify frees. 64 */ 65static int metaslab_debug = 0; 66 67/* 68 * Minimum size which forces the dynamic allocator to change 69 * its allocation strategy. Once the space map cannot satisfy 70 * an allocation of this size then it switches to using more --- 621 unchanged lines hidden (view full) --- 692 metaslab_pp_unload, 693 metaslab_ndf_alloc, 694 metaslab_pp_claim, 695 metaslab_pp_free, 696 metaslab_pp_maxsize, 697 metaslab_ndf_fragmented 698}; 699 |
674space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; | 700space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; |
675 676/* 677 * ========================================================================== 678 * Metaslabs 679 * ========================================================================== 680 */ 681metaslab_t * 682metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, --- 156 unchanged lines hidden (view full) --- 839 0ULL, smo->smo_objsize); 840 mutex_enter(&mg->mg_lock); 841 } 842 } 843 mutex_exit(&mg->mg_lock); 844} 845 846static int | 701 702/* 703 * ========================================================================== 704 * Metaslabs 705 * ========================================================================== 706 */ 707metaslab_t * 708metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, --- 156 unchanged lines hidden (view full) --- 865 0ULL, smo->smo_objsize); 866 mutex_enter(&mg->mg_lock); 867 } 868 } 869 mutex_exit(&mg->mg_lock); 870} 871 872static int |
847metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) | 873metaslab_activate(metaslab_t *msp, uint64_t activation_weight) |
848{ 849 metaslab_group_t *mg = msp->ms_group; 850 space_map_t *sm = &msp->ms_map; 851 space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; 852 853 ASSERT(MUTEX_HELD(&msp->ms_lock)); 854 855 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { --- 16 unchanged lines hidden (view full) --- 872 * Track the bonus area as we activate new metaslabs. 873 */ 874 if (sm->sm_start > mg->mg_bonus_area) { 875 mutex_enter(&mg->mg_lock); 876 mg->mg_bonus_area = sm->sm_start; 877 mutex_exit(&mg->mg_lock); 878 } 879 | 874{ 875 metaslab_group_t *mg = msp->ms_group; 876 space_map_t *sm = &msp->ms_map; 877 space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; 878 879 ASSERT(MUTEX_HELD(&msp->ms_lock)); 880 881 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { --- 16 unchanged lines hidden (view full) --- 898 * Track the bonus area as we activate new metaslabs. 899 */ 900 if (sm->sm_start > mg->mg_bonus_area) { 901 mutex_enter(&mg->mg_lock); 902 mg->mg_bonus_area = sm->sm_start; 903 mutex_exit(&mg->mg_lock); 904 } 905 |
880 /* 881 * If we were able to load the map then make sure 882 * that this map is still able to satisfy our request. 883 */ 884 if (msp->ms_weight < size) 885 return (ENOSPC); 886 | |
887 metaslab_group_sort(msp->ms_group, msp, 888 msp->ms_weight | activation_weight); 889 } 890 ASSERT(sm->sm_loaded); 891 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 892 893 return (0); 894} --- 199 unchanged lines hidden (view full) --- 1094 1095 mutex_exit(&msp->ms_lock); 1096} 1097 1098void 1099metaslab_sync_reassess(metaslab_group_t *mg) 1100{ 1101 vdev_t *vd = mg->mg_vd; | 906 metaslab_group_sort(msp->ms_group, msp, 907 msp->ms_weight | activation_weight); 908 } 909 ASSERT(sm->sm_loaded); 910 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 911 912 return (0); 913} --- 199 unchanged lines hidden (view full) --- 1113 1114 mutex_exit(&msp->ms_lock); 1115} 1116 1117void 1118metaslab_sync_reassess(metaslab_group_t *mg) 1119{ 1120 vdev_t *vd = mg->mg_vd; |
1121 int64_t failures = mg->mg_alloc_failures; |
|
1102 1103 /* 1104 * Re-evaluate all metaslabs which have lower offsets than the 1105 * bonus area. 1106 */ 1107 for (int m = 0; m < vd->vdev_ms_count; m++) { 1108 metaslab_t *msp = vd->vdev_ms[m]; 1109 1110 if (msp->ms_map.sm_start > mg->mg_bonus_area) 1111 break; 1112 1113 mutex_enter(&msp->ms_lock); 1114 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1115 mutex_exit(&msp->ms_lock); 1116 } 1117 | 1122 1123 /* 1124 * Re-evaluate all metaslabs which have lower offsets than the 1125 * bonus area. 1126 */ 1127 for (int m = 0; m < vd->vdev_ms_count; m++) { 1128 metaslab_t *msp = vd->vdev_ms[m]; 1129 1130 if (msp->ms_map.sm_start > mg->mg_bonus_area) 1131 break; 1132 1133 mutex_enter(&msp->ms_lock); 1134 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1135 mutex_exit(&msp->ms_lock); 1136 } 1137 |
1138 atomic_add_64(&mg->mg_alloc_failures, -failures); 1139 |
|
1118 /* 1119 * Prefetch the next potential metaslabs 1120 */ 1121 metaslab_prefetch(mg); 1122} 1123 1124static uint64_t 1125metaslab_distance(metaslab_t *msp, dva_t *dva) --- 8 unchanged lines hidden (view full) --- 1134 if (offset < start) 1135 return ((start - offset) << ms_shift); 1136 if (offset > start) 1137 return ((offset - start) << ms_shift); 1138 return (0); 1139} 1140 1141static uint64_t | 1140 /* 1141 * Prefetch the next potential metaslabs 1142 */ 1143 metaslab_prefetch(mg); 1144} 1145 1146static uint64_t 1147metaslab_distance(metaslab_t *msp, dva_t *dva) --- 8 unchanged lines hidden (view full) --- 1156 if (offset < start) 1157 return ((start - offset) << ms_shift); 1158 if (offset > start) 1159 return ((offset - start) << ms_shift); 1160 return (0); 1161} 1162 1163static uint64_t |
1142metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, 1143 uint64_t min_distance, dva_t *dva, int d) | 1164metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 1165 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) |
1144{ | 1166{ |
1167 spa_t *spa = mg->mg_vd->vdev_spa; |
|
1145 metaslab_t *msp = NULL; 1146 uint64_t offset = -1ULL; 1147 avl_tree_t *t = &mg->mg_metaslab_tree; 1148 uint64_t activation_weight; 1149 uint64_t target_distance; 1150 int i; 1151 1152 activation_weight = METASLAB_WEIGHT_PRIMARY; --- 4 unchanged lines hidden (view full) --- 1157 } 1158 } 1159 1160 for (;;) { 1161 boolean_t was_active; 1162 1163 mutex_enter(&mg->mg_lock); 1164 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { | 1168 metaslab_t *msp = NULL; 1169 uint64_t offset = -1ULL; 1170 avl_tree_t *t = &mg->mg_metaslab_tree; 1171 uint64_t activation_weight; 1172 uint64_t target_distance; 1173 int i; 1174 1175 activation_weight = METASLAB_WEIGHT_PRIMARY; --- 4 unchanged lines hidden (view full) --- 1180 } 1181 } 1182 1183 for (;;) { 1184 boolean_t was_active; 1185 1186 mutex_enter(&mg->mg_lock); 1187 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { |
1165 if (msp->ms_weight < size) { | 1188 if (msp->ms_weight < asize) { 1189 spa_dbgmsg(spa, "%s: failed to meet weight " 1190 "requirement: vdev %llu, txg %llu, mg %p, " 1191 "msp %p, psize %llu, asize %llu, " 1192 "failures %llu, weight %llu", 1193 spa_name(spa), mg->mg_vd->vdev_id, txg, 1194 mg, msp, psize, asize, 1195 mg->mg_alloc_failures, msp->ms_weight); |
1166 mutex_exit(&mg->mg_lock); 1167 return (-1ULL); 1168 } | 1196 mutex_exit(&mg->mg_lock); 1197 return (-1ULL); 1198 } |
1169 | |
1170 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 1171 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 1172 break; 1173 1174 target_distance = min_distance + 1175 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); 1176 1177 for (i = 0; i < d; i++) 1178 if (metaslab_distance(msp, &dva[i]) < 1179 target_distance) 1180 break; 1181 if (i == d) 1182 break; 1183 } 1184 mutex_exit(&mg->mg_lock); 1185 if (msp == NULL) 1186 return (-1ULL); 1187 | 1199 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 1200 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 1201 break; 1202 1203 target_distance = min_distance + 1204 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); 1205 1206 for (i = 0; i < d; i++) 1207 if (metaslab_distance(msp, &dva[i]) < 1208 target_distance) 1209 break; 1210 if (i == d) 1211 break; 1212 } 1213 mutex_exit(&mg->mg_lock); 1214 if (msp == NULL) 1215 return (-1ULL); 1216 |
1217 /* 1218 * If we've already reached the allowable number of failed 1219 * allocation attempts on this metaslab group then we 1220 * consider skipping it. We skip it only if we're allowed 1221 * to "fast" gang, the physical size is larger than 1222 * a gang block, and we're attempting to allocate from 1223 * the primary metaslab. 1224 */ 1225 if (mg->mg_alloc_failures > zfs_mg_alloc_failures && 1226 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && 1227 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1228 spa_dbgmsg(spa, "%s: skipping metaslab group: " 1229 "vdev %llu, txg %llu, mg %p, psize %llu, " 1230 "asize %llu, failures %llu", spa_name(spa), 1231 mg->mg_vd->vdev_id, txg, mg, psize, asize, 1232 mg->mg_alloc_failures); 1233 return (-1ULL); 1234 } 1235 |
|
1188 mutex_enter(&msp->ms_lock); 1189 1190 /* 1191 * Ensure that the metaslab we have selected is still 1192 * capable of handling our request. It's possible that 1193 * another thread may have changed the weight while we 1194 * were blocked on the metaslab lock. 1195 */ | 1236 mutex_enter(&msp->ms_lock); 1237 1238 /* 1239 * Ensure that the metaslab we have selected is still 1240 * capable of handling our request. It's possible that 1241 * another thread may have changed the weight while we 1242 * were blocked on the metaslab lock. 1243 */ |
1196 if (msp->ms_weight < size || (was_active && | 1244 if (msp->ms_weight < asize || (was_active && |
1197 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 1198 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 1199 mutex_exit(&msp->ms_lock); 1200 continue; 1201 } 1202 1203 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 1204 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1205 metaslab_passivate(msp, 1206 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 1207 mutex_exit(&msp->ms_lock); 1208 continue; 1209 } 1210 | 1245 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 1246 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 1247 mutex_exit(&msp->ms_lock); 1248 continue; 1249 } 1250 1251 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 1252 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1253 metaslab_passivate(msp, 1254 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 1255 mutex_exit(&msp->ms_lock); 1256 continue; 1257 } 1258 |
1211 if (metaslab_activate(msp, activation_weight, size) != 0) { | 1259 if (metaslab_activate(msp, activation_weight) != 0) { |
1212 mutex_exit(&msp->ms_lock); 1213 continue; 1214 } 1215 | 1260 mutex_exit(&msp->ms_lock); 1261 continue; 1262 } 1263 |
1216 if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) | 1264 if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) |
1217 break; 1218 | 1265 break; 1266 |
1267 atomic_inc_64(&mg->mg_alloc_failures); 1268 |
|
1219 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); 1220 1221 mutex_exit(&msp->ms_lock); 1222 } 1223 1224 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) 1225 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 1226 | 1269 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); 1270 1271 mutex_exit(&msp->ms_lock); 1272 } 1273 1274 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) 1275 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 1276 |
1227 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); | 1277 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); |
1228 1229 mutex_exit(&msp->ms_lock); 1230 1231 return (offset); 1232} 1233 1234/* 1235 * Allocate a block for the specified i/o. --- 110 unchanged lines hidden (view full) --- 1346 if (distance <= (1ULL << vd->vdev_ms_shift)) 1347 distance = 0; 1348 else 1349 all_zero = B_FALSE; 1350 1351 asize = vdev_psize_to_asize(vd, psize); 1352 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 1353 | 1278 1279 mutex_exit(&msp->ms_lock); 1280 1281 return (offset); 1282} 1283 1284/* 1285 * Allocate a block for the specified i/o. --- 110 unchanged lines hidden (view full) --- 1396 if (distance <= (1ULL << vd->vdev_ms_shift)) 1397 distance = 0; 1398 else 1399 all_zero = B_FALSE; 1400 1401 asize = vdev_psize_to_asize(vd, psize); 1402 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 1403 |
1354 offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); | 1404 offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 1405 dva, d, flags); |
1355 if (offset != -1ULL) { 1356 /* 1357 * If we've just selected this metaslab group, 1358 * figure out whether the corresponding vdev is 1359 * over- or under-used relative to the pool, 1360 * and set an allocation bias to even it out. 1361 */ 1362 if (mc->mc_aliquot == 0) { 1363 vdev_stat_t *vs = &vd->vdev_stat; 1364 int64_t vu, cu; 1365 | 1406 if (offset != -1ULL) { 1407 /* 1408 * If we've just selected this metaslab group, 1409 * figure out whether the corresponding vdev is 1410 * over- or under-used relative to the pool, 1411 * and set an allocation bias to even it out. 1412 */ 1413 if (mc->mc_aliquot == 0) { 1414 vdev_stat_t *vs = &vd->vdev_stat; 1415 int64_t vu, cu; 1416 |
1366 /* 1367 * Determine percent used in units of 0..1024. 1368 * (This is just to avoid floating point.) 1369 */ 1370 vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); 1371 cu = (mc->mc_alloc << 10) / (mc->mc_space + 1); | 1417 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 1418 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); |
1372 1373 /* | 1419 1420 /* |
1374 * Bias by at most +/- 25% of the aliquot. | 1421 * Calculate how much more or less we should 1422 * try to allocate from this device during 1423 * this iteration around the rotor. 1424 * For example, if a device is 80% full 1425 * and the pool is 20% full then we should 1426 * reduce allocations by 60% on this device. 1427 * 1428 * mg_bias = (20 - 80) * 512K / 100 = -307K 1429 * 1430 * This reduces allocations by 307K for this 1431 * iteration. |
1375 */ 1376 mg->mg_bias = ((cu - vu) * | 1432 */ 1433 mg->mg_bias = ((cu - vu) * |
1377 (int64_t)mg->mg_aliquot) / (1024 * 4); | 1434 (int64_t)mg->mg_aliquot) / 100; |
1378 } 1379 1380 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 1381 mg->mg_aliquot + mg->mg_bias) { 1382 mc->mc_rotor = mg->mg_next; 1383 mc->mc_aliquot = 0; 1384 } 1385 --- 97 unchanged lines hidden (view full) --- 1483 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 1484 1485 if (DVA_GET_GANG(dva)) 1486 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 1487 1488 mutex_enter(&msp->ms_lock); 1489 1490 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) | 1435 } 1436 1437 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 1438 mg->mg_aliquot + mg->mg_bias) { 1439 mc->mc_rotor = mg->mg_next; 1440 mc->mc_aliquot = 0; 1441 } 1442 --- 97 unchanged lines hidden (view full) --- 1540 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 1541 1542 if (DVA_GET_GANG(dva)) 1543 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 1544 1545 mutex_enter(&msp->ms_lock); 1546 1547 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) |
1491 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); | 1548 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); |
1492 1493 if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) 1494 error = ENOENT; 1495 1496 if (error || txg == 0) { /* txg == 0 indicates dry run */ 1497 mutex_exit(&msp->ms_lock); 1498 return (error); 1499 } --- 105 unchanged lines hidden --- | 1549 1550 if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) 1551 error = ENOENT; 1552 1553 if (error || txg == 0) { /* txg == 0 indicates dry run */ 1554 mutex_exit(&msp->ms_lock); 1555 return (error); 1556 } --- 105 unchanged lines hidden --- |