vdev_indirect.c (339104) | vdev_indirect.c (339106) |
---|---|
1/* 2 * CDDL HEADER START 3 * 4 * This file and its contents are supplied under the terms of the 5 * Common Development and Distribution License ("CDDL"), version 1.0. 6 * You may only use this file in accordance with the terms of version 7 * 1.0 of the CDDL. 8 * --- 9 unchanged lines hidden (view full) --- 18 */ 19 20#include <sys/zfs_context.h> 21#include <sys/spa.h> 22#include <sys/spa_impl.h> 23#include <sys/vdev_impl.h> 24#include <sys/fs/zfs.h> 25#include <sys/zio.h> | 1/* 2 * CDDL HEADER START 3 * 4 * This file and its contents are supplied under the terms of the 5 * Common Development and Distribution License ("CDDL"), version 1.0. 6 * You may only use this file in accordance with the terms of version 7 * 1.0 of the CDDL. 8 * --- 9 unchanged lines hidden (view full) --- 18 */ 19 20#include <sys/zfs_context.h> 21#include <sys/spa.h> 22#include <sys/spa_impl.h> 23#include <sys/vdev_impl.h> 24#include <sys/fs/zfs.h> 25#include <sys/zio.h> |
26#include <sys/zio_checksum.h> |
|
26#include <sys/metaslab.h> 27#include <sys/refcount.h> 28#include <sys/dmu.h> 29#include <sys/vdev_indirect_mapping.h> 30#include <sys/dmu_tx.h> 31#include <sys/dsl_synctask.h> 32#include <sys/zap.h> 33#include <sys/abd.h> --- 7 unchanged lines hidden (view full) --- 41 * to access the DVA. Unfortunately, this mapping did not respect 42 * logical block boundaries when it was first created, and so a DVA on 43 * this indirect vdev may be "split" into multiple sections that each 44 * map to a different location. As a consequence, not all DVAs can be 45 * translated to an equivalent new DVA. Instead we must provide a 46 * "vdev_remap" operation that executes a callback on each contiguous 47 * segment of the new location. This function is used in multiple ways: 48 * | 27#include <sys/metaslab.h> 28#include <sys/refcount.h> 29#include <sys/dmu.h> 30#include <sys/vdev_indirect_mapping.h> 31#include <sys/dmu_tx.h> 32#include <sys/dsl_synctask.h> 33#include <sys/zap.h> 34#include <sys/abd.h> --- 7 unchanged lines hidden (view full) --- 42 * to access the DVA. Unfortunately, this mapping did not respect 43 * logical block boundaries when it was first created, and so a DVA on 44 * this indirect vdev may be "split" into multiple sections that each 45 * map to a different location. As a consequence, not all DVAs can be 46 * translated to an equivalent new DVA. Instead we must provide a 47 * "vdev_remap" operation that executes a callback on each contiguous 48 * segment of the new location. This function is used in multiple ways: 49 * |
49 * - reads and repair writes to this device use the callback to create 50 * a child io for each mapped segment. | 50 * - i/os to this vdev use the callback to determine where the 51 * data is now located, and issue child i/os for each segment's new 52 * location. |
51 * | 53 * |
52 * - frees and claims to this device use the callback to free or claim | 54 * - frees and claims to this vdev use the callback to free or claim |
53 * each mapped segment. (Note that we don't actually need to claim 54 * log blocks on indirect vdevs, because we don't allocate to 55 * removing vdevs. However, zdb uses zio_claim() for its leak 56 * detection.) 57 */ 58 59/* 60 * "Big theory statement" for how we mark blocks obsolete. --- 138 unchanged lines hidden (view full) --- 199 * This is used by the test suite so that it can ensure that certain 200 * actions happen while in the middle of a condense (which might otherwise 201 * complete too quickly). If used to reduce the performance impact of 202 * condensing in production, a maximum value of 1 should be sufficient. 203 */ 204int zfs_condense_indirect_commit_entry_delay_ticks = 0; 205 206/* | 55 * each mapped segment. (Note that we don't actually need to claim 56 * log blocks on indirect vdevs, because we don't allocate to 57 * removing vdevs. However, zdb uses zio_claim() for its leak 58 * detection.) 59 */ 60 61/* 62 * "Big theory statement" for how we mark blocks obsolete. --- 138 unchanged lines hidden (view full) --- 201 * This is used by the test suite so that it can ensure that certain 202 * actions happen while in the middle of a condense (which might otherwise 203 * complete too quickly). If used to reduce the performance impact of 204 * condensing in production, a maximum value of 1 should be sufficient. 205 */ 206int zfs_condense_indirect_commit_entry_delay_ticks = 0; 207 208/* |
209 * If a split block contains more than this many segments, consider it too 210 * computationally expensive to check all (2^num_segments) possible 211 * combinations. Instead, try at most 2^_segments_max randomly-selected 212 * combinations. 213 * 214 * This is reasonable if only a few segment copies are damaged and the 215 * majority of segment copies are good. This allows all the segment copies to 216 * participate fairly in the reconstruction and prevents the repeated use of 217 * one bad copy. 218 */ 219int zfs_reconstruct_indirect_segments_max = 10; 220 221/* 222 * The indirect_child_t represents the vdev that we will read from, when we 223 * need to read all copies of the data (e.g. for scrub or reconstruction). 224 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), 225 * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, 226 * ic_vdev is a child of the mirror. 227 */ 228typedef struct indirect_child { 229 abd_t *ic_data; 230 vdev_t *ic_vdev; 231} indirect_child_t; 232 233/* 234 * The indirect_split_t represents one mapped segment of an i/o to the 235 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be 236 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. 237 * For split blocks, there will be several of these. 238 */ 239typedef struct indirect_split { 240 list_node_t is_node; /* link on iv_splits */ 241 242 /* 243 * is_split_offset is the offset into the i/o. 244 * This is the sum of the previous splits' is_size's. 245 */ 246 uint64_t is_split_offset; 247 248 vdev_t *is_vdev; /* top-level vdev */ 249 uint64_t is_target_offset; /* offset on is_vdev */ 250 uint64_t is_size; 251 int is_children; /* number of entries in is_child[] */ 252 253 /* 254 * is_good_child is the child that we are currently using to 255 * attempt reconstruction. 256 */ 257 int is_good_child; 258 259 indirect_child_t is_child[1]; /* variable-length */ 260} indirect_split_t; 261 262/* 263 * The indirect_vsd_t is associated with each i/o to the indirect vdev. 264 * It is the "Vdev-Specific Data" in the zio_t's io_vsd. 265 */ 266typedef struct indirect_vsd { 267 boolean_t iv_split_block; 268 boolean_t iv_reconstruct; 269 270 list_t iv_splits; /* list of indirect_split_t's */ 271} indirect_vsd_t; 272 273static void 274vdev_indirect_map_free(zio_t *zio) 275{ 276 indirect_vsd_t *iv = zio->io_vsd; 277 278 indirect_split_t *is; 279 while ((is = list_head(&iv->iv_splits)) != NULL) { 280 for (int c = 0; c < is->is_children; c++) { 281 indirect_child_t *ic = &is->is_child[c]; 282 if (ic->ic_data != NULL) 283 abd_free(ic->ic_data); 284 } 285 list_remove(&iv->iv_splits, is); 286 kmem_free(is, 287 offsetof(indirect_split_t, is_child[is->is_children])); 288 } 289 kmem_free(iv, sizeof (*iv)); 290} 291 292static const zio_vsd_ops_t vdev_indirect_vsd_ops = { 293 vdev_indirect_map_free, 294 zio_vsd_default_cksum_report 295}; 296/* |
|
207 * Mark the given offset and size as being obsolete. 208 */ 209void 210vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) 211{ 212 spa_t *spa = vd->vdev_spa; 213 214 ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); --- 598 unchanged lines hidden (view full) --- 813 814/* ARGSUSED */ 815static void 816vdev_indirect_close(vdev_t *vd) 817{ 818} 819 820/* ARGSUSED */ | 297 * Mark the given offset and size as being obsolete. 298 */ 299void 300vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) 301{ 302 spa_t *spa = vd->vdev_spa; 303 304 ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); --- 598 unchanged lines hidden (view full) --- 903 904/* ARGSUSED */ 905static void 906vdev_indirect_close(vdev_t *vd) 907{ 908} 909 910/* ARGSUSED */ |
821static void 822vdev_indirect_io_done(zio_t *zio) 823{ 824} 825 826/* ARGSUSED */ | |
827static int 828vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 829 uint64_t *logical_ashift, uint64_t *physical_ashift) 830{ 831 *psize = *max_psize = vd->vdev_asize + 832 VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 833 *logical_ashift = vd->vdev_ashift; 834 *physical_ashift = vd->vdev_physical_ashift; --- 227 unchanged lines hidden (view full) --- 1062 1063 mutex_enter(&pio->io_lock); 1064 pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 1065 mutex_exit(&pio->io_lock); 1066 1067 abd_put(zio->io_abd); 1068} 1069 | 911static int 912vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 913 uint64_t *logical_ashift, uint64_t *physical_ashift) 914{ 915 *psize = *max_psize = vd->vdev_asize + 916 VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 917 *logical_ashift = vd->vdev_ashift; 918 *physical_ashift = vd->vdev_physical_ashift; --- 227 unchanged lines hidden (view full) --- 1146 1147 mutex_enter(&pio->io_lock); 1148 pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 1149 mutex_exit(&pio->io_lock); 1150 1151 abd_put(zio->io_abd); 1152} 1153 |
1154/* 1155 * This is a callback for vdev_indirect_remap() which allocates an 1156 * indirect_split_t for each split segment and adds it to iv_splits. 1157 */ |
|
1070static void | 1158static void |
1071vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset, | 1159vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, |
1072 uint64_t size, void *arg) 1073{ 1074 zio_t *zio = arg; | 1160 uint64_t size, void *arg) 1161{ 1162 zio_t *zio = arg; |
1163 indirect_vsd_t *iv = zio->io_vsd; |
|
1075 1076 ASSERT3P(vd, !=, NULL); 1077 1078 if (vd->vdev_ops == &vdev_indirect_ops) 1079 return; 1080 | 1164 1165 ASSERT3P(vd, !=, NULL); 1166 1167 if (vd->vdev_ops == &vdev_indirect_ops) 1168 return; 1169 |
1081 zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, 1082 abd_get_offset(zio->io_abd, split_offset), 1083 size, zio->io_type, zio->io_priority, 1084 0, vdev_indirect_child_io_done, zio)); | 1170 int n = 1; 1171 if (vd->vdev_ops == &vdev_mirror_ops) 1172 n = vd->vdev_children; 1173 1174 indirect_split_t *is = 1175 kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); 1176 1177 is->is_children = n; 1178 is->is_size = size; 1179 is->is_split_offset = split_offset; 1180 is->is_target_offset = offset; 1181 is->is_vdev = vd; 1182 1183 /* 1184 * Note that we only consider multiple copies of the data for 1185 * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even 1186 * though they use the same ops as mirror, because there's only one 1187 * "good" copy under the replacing/spare. 1188 */ 1189 if (vd->vdev_ops == &vdev_mirror_ops) { 1190 for (int i = 0; i < n; i++) { 1191 is->is_child[i].ic_vdev = vd->vdev_child[i]; 1192 } 1193 } else { 1194 is->is_child[0].ic_vdev = vd; 1195 } 1196 1197 list_insert_tail(&iv->iv_splits, is); |
1085} 1086 1087static void | 1198} 1199 1200static void |
1201vdev_indirect_read_split_done(zio_t *zio) 1202{ 1203 indirect_child_t *ic = zio->io_private; 1204 1205 if (zio->io_error != 0) { 1206 /* 1207 * Clear ic_data to indicate that we do not have data for this 1208 * child. 1209 */ 1210 abd_free(ic->ic_data); 1211 ic->ic_data = NULL; 1212 } 1213} 1214 1215/* 1216 * Issue reads for all copies (mirror children) of all splits. 1217 */ 1218static void 1219vdev_indirect_read_all(zio_t *zio) 1220{ 1221 indirect_vsd_t *iv = zio->io_vsd; 1222 1223 for (indirect_split_t *is = list_head(&iv->iv_splits); 1224 is != NULL; is = list_next(&iv->iv_splits, is)) { 1225 for (int i = 0; i < is->is_children; i++) { 1226 indirect_child_t *ic = &is->is_child[i]; 1227 1228 if (!vdev_readable(ic->ic_vdev)) 1229 continue; 1230 1231 /* 1232 * Note, we may read from a child whose DTL 1233 * indicates that the data may not be present here. 1234 * While this might result in a few i/os that will 1235 * likely return incorrect data, it simplifies the 1236 * code since we can treat scrub and resilver 1237 * identically. (The incorrect data will be 1238 * detected and ignored when we verify the 1239 * checksum.) 1240 */ 1241 1242 ic->ic_data = abd_alloc_sametype(zio->io_abd, 1243 is->is_size); 1244 1245 zio_nowait(zio_vdev_child_io(zio, NULL, 1246 ic->ic_vdev, is->is_target_offset, ic->ic_data, 1247 is->is_size, zio->io_type, zio->io_priority, 0, 1248 vdev_indirect_read_split_done, ic)); 1249 } 1250 } 1251 iv->iv_reconstruct = B_TRUE; 1252} 1253 1254static void |
|
1088vdev_indirect_io_start(zio_t *zio) 1089{ 1090 spa_t *spa = zio->io_spa; | 1255vdev_indirect_io_start(zio_t *zio) 1256{ 1257 spa_t *spa = zio->io_spa; |
1258 indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); 1259 list_create(&iv->iv_splits, 1260 sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); |
|
1091 | 1261 |
1262 zio->io_vsd = iv; 1263 zio->io_vsd_ops = &vdev_indirect_vsd_ops; 1264 |
|
1092 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1093 if (zio->io_type != ZIO_TYPE_READ) { 1094 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); | 1265 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1266 if (zio->io_type != ZIO_TYPE_READ) { 1267 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); |
1095 ASSERT((zio->io_flags & 1096 (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0); | 1268 /* 1269 * Note: this code can handle other kinds of writes, 1270 * but we don't expect them. 1271 */ 1272 ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | 1273 ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); |
1097 } 1098 1099 vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, | 1274 } 1275 1276 vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, |
1100 vdev_indirect_io_start_cb, zio); | 1277 vdev_indirect_gather_splits, zio); |
1101 | 1278 |
1279 indirect_split_t *first = list_head(&iv->iv_splits); 1280 if (first->is_size == zio->io_size) { 1281 /* 1282 * This is not a split block; we are pointing to the entire 1283 * data, which will checksum the same as the original data. 1284 * Pass the BP down so that the child i/o can verify the 1285 * checksum, and try a different location if available 1286 * (e.g. on a mirror). 1287 * 1288 * While this special case could be handled the same as the 1289 * general (split block) case, doing it this way ensures 1290 * that the vast majority of blocks on indirect vdevs 1291 * (which are not split) are handled identically to blocks 1292 * on non-indirect vdevs. This allows us to be less strict 1293 * about performance in the general (but rare) case. 1294 */ 1295 ASSERT0(first->is_split_offset); 1296 ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); 1297 zio_nowait(zio_vdev_child_io(zio, zio->io_bp, 1298 first->is_vdev, first->is_target_offset, 1299 abd_get_offset(zio->io_abd, 0), 1300 zio->io_size, zio->io_type, zio->io_priority, 0, 1301 vdev_indirect_child_io_done, zio)); 1302 } else { 1303 iv->iv_split_block = B_TRUE; 1304 if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { 1305 /* 1306 * Read all copies. Note that for simplicity, 1307 * we don't bother consulting the DTL in the 1308 * resilver case. 1309 */ 1310 vdev_indirect_read_all(zio); 1311 } else { 1312 /* 1313 * Read one copy of each split segment, from the 1314 * top-level vdev. Since we don't know the 1315 * checksum of each split individually, the child 1316 * zio can't ensure that we get the right data. 1317 * E.g. if it's a mirror, it will just read from a 1318 * random (healthy) leaf vdev. We have to verify 1319 * the checksum in vdev_indirect_io_done(). 1320 */ 1321 for (indirect_split_t *is = list_head(&iv->iv_splits); 1322 is != NULL; is = list_next(&iv->iv_splits, is)) { 1323 zio_nowait(zio_vdev_child_io(zio, NULL, 1324 is->is_vdev, is->is_target_offset, 1325 abd_get_offset(zio->io_abd, 1326 is->is_split_offset), 1327 is->is_size, zio->io_type, 1328 zio->io_priority, 0, 1329 vdev_indirect_child_io_done, zio)); 1330 } 1331 } 1332 } 1333 |
|
1102 zio_execute(zio); 1103} 1104 | 1334 zio_execute(zio); 1335} 1336 |
1337/* 1338 * Report a checksum error for a child. 1339 */ 1340static void 1341vdev_indirect_checksum_error(zio_t *zio, 1342 indirect_split_t *is, indirect_child_t *ic) 1343{ 1344 vdev_t *vd = ic->ic_vdev; 1345 1346 if (zio->io_flags & ZIO_FLAG_SPECULATIVE) 1347 return; 1348 1349 mutex_enter(&vd->vdev_stat_lock); 1350 vd->vdev_stat.vs_checksum_errors++; 1351 mutex_exit(&vd->vdev_stat_lock); 1352 1353 zio_bad_cksum_t zbc = { 0 }; 1354 void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size); 1355 abd_t *good_abd = is->is_child[is->is_good_child].ic_data; 1356 void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size); 1357 zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1358 is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc); 1359 abd_return_buf(ic->ic_data, bad_buf, is->is_size); 1360 abd_return_buf(good_abd, good_buf, is->is_size); 1361} 1362 1363/* 1364 * Issue repair i/os for any incorrect copies. We do this by comparing 1365 * each split segment's correct data (is_good_child's ic_data) with each 1366 * other copy of the data. If they differ, then we overwrite the bad data 1367 * with the good copy. Note that we do this without regard for the DTL's, 1368 * which simplifies this code and also issues the optimal number of writes 1369 * (based on which copies actually read bad data, as opposed to which we 1370 * think might be wrong). For the same reason, we always use 1371 * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). 1372 */ 1373static void 1374vdev_indirect_repair(zio_t *zio) 1375{ 1376 indirect_vsd_t *iv = zio->io_vsd; 1377 1378 enum zio_flag flags = ZIO_FLAG_IO_REPAIR; 1379 1380 if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) 1381 flags |= ZIO_FLAG_SELF_HEAL; 1382 1383 if (!spa_writeable(zio->io_spa)) 1384 return; 1385 1386 for (indirect_split_t *is = list_head(&iv->iv_splits); 1387 is != NULL; is = list_next(&iv->iv_splits, is)) { 1388 indirect_child_t *good_child = &is->is_child[is->is_good_child]; 1389 1390 for (int c = 0; c < is->is_children; c++) { 1391 indirect_child_t *ic = &is->is_child[c]; 1392 if (ic == good_child) 1393 continue; 1394 if (ic->ic_data == NULL) 1395 continue; 1396 if (abd_cmp(good_child->ic_data, ic->ic_data, 1397 is->is_size) == 0) 1398 continue; 1399 1400 zio_nowait(zio_vdev_child_io(zio, NULL, 1401 ic->ic_vdev, is->is_target_offset, 1402 good_child->ic_data, is->is_size, 1403 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 1404 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 1405 NULL, NULL)); 1406 1407 vdev_indirect_checksum_error(zio, is, ic); 1408 } 1409 } 1410} 1411 1412/* 1413 * Report checksum errors on all children that we read from. 1414 */ 1415static void 1416vdev_indirect_all_checksum_errors(zio_t *zio) 1417{ 1418 indirect_vsd_t *iv = zio->io_vsd; 1419 1420 if (zio->io_flags & ZIO_FLAG_SPECULATIVE) 1421 return; 1422 1423 for (indirect_split_t *is = list_head(&iv->iv_splits); 1424 is != NULL; is = list_next(&iv->iv_splits, is)) { 1425 for (int c = 0; c < is->is_children; c++) { 1426 indirect_child_t *ic = &is->is_child[c]; 1427 1428 if (ic->ic_data == NULL) 1429 continue; 1430 1431 vdev_t *vd = ic->ic_vdev; 1432 1433 mutex_enter(&vd->vdev_stat_lock); 1434 vd->vdev_stat.vs_checksum_errors++; 1435 mutex_exit(&vd->vdev_stat_lock); 1436 1437 zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1438 is->is_target_offset, is->is_size, 1439 NULL, NULL, NULL); 1440 } 1441 } 1442} 1443 1444/* 1445 * This function is called when we have read all copies of the data and need 1446 * to try to find a combination of copies that gives us the right checksum. 1447 * 1448 * If we pointed to any mirror vdevs, this effectively does the job of the 1449 * mirror. The mirror vdev code can't do its own job because we don't know 1450 * the checksum of each split segment individually. We have to try every 1451 * combination of copies of split segments, until we find one that checksums 1452 * correctly. (Or until we have tried all combinations, or have tried 1453 * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we 1454 * set io_error to ECKSUM to propagate the error up to the user.) 1455 * 1456 * For example, if we have 3 segments in the split, 1457 * and each points to a 2-way mirror, we will have the following pieces of 1458 * data: 1459 * 1460 * | mirror child 1461 * split | [0] [1] 1462 * ======|===================== 1463 * A | data_A_0 data_A_1 1464 * B | data_B_0 data_B_1 1465 * C | data_C_0 data_C_1 1466 * 1467 * We will try the following (mirror children)^(number of splits) (2^3=8) 1468 * combinations, which is similar to bitwise-little-endian counting in 1469 * binary. In general each "digit" corresponds to a split segment, and the 1470 * base of each digit is is_children, which can be different for each 1471 * digit. 1472 * 1473 * "low bit" "high bit" 1474 * v v 1475 * data_A_0 data_B_0 data_C_0 1476 * data_A_1 data_B_0 data_C_0 1477 * data_A_0 data_B_1 data_C_0 1478 * data_A_1 data_B_1 data_C_0 1479 * data_A_0 data_B_0 data_C_1 1480 * data_A_1 data_B_0 data_C_1 1481 * data_A_0 data_B_1 data_C_1 1482 * data_A_1 data_B_1 data_C_1 1483 * 1484 * Note that the split segments may be on the same or different top-level 1485 * vdevs. In either case, we try lots of combinations (see 1486 * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has 1487 * small silent errors on all of its children, we can still reconstruct the 1488 * correct data, as long as those errors are at sufficiently-separated 1489 * offsets (specifically, separated by the largest block size - default of 1490 * 128KB, but up to 16MB). 1491 */ 1492static void 1493vdev_indirect_reconstruct_io_done(zio_t *zio) 1494{ 1495 indirect_vsd_t *iv = zio->io_vsd; 1496 uint64_t attempts = 0; 1497 uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max; 1498 int segments = 0; 1499 1500 for (indirect_split_t *is = list_head(&iv->iv_splits); 1501 is != NULL; is = list_next(&iv->iv_splits, is)) 1502 segments++; 1503 1504 for (;;) { 1505 /* copy data from splits to main zio */ 1506 int ret; 1507 for (indirect_split_t *is = list_head(&iv->iv_splits); 1508 is != NULL; is = list_next(&iv->iv_splits, is)) { 1509 1510 /* 1511 * If this child failed, its ic_data will be NULL. 1512 * Skip this combination. 1513 */ 1514 if (is->is_child[is->is_good_child].ic_data == NULL) { 1515 ret = EIO; 1516 goto next; 1517 } 1518 1519 abd_copy_off(zio->io_abd, 1520 is->is_child[is->is_good_child].ic_data, 1521 is->is_split_offset, 0, is->is_size); 1522 } 1523 1524 /* See if this checksum matches. */ 1525 zio_bad_cksum_t zbc; 1526 ret = zio_checksum_error(zio, &zbc); 1527 if (ret == 0) { 1528 /* Found a matching checksum. Issue repair i/os. */ 1529 vdev_indirect_repair(zio); 1530 zio_checksum_verified(zio); 1531 return; 1532 } 1533 1534 /* 1535 * Checksum failed; try a different combination of split 1536 * children. 1537 */ 1538 boolean_t more; 1539next: 1540 more = B_FALSE; 1541 if (segments <= zfs_reconstruct_indirect_segments_max) { 1542 /* 1543 * There are relatively few segments, so 1544 * deterministically check all combinations. We do 1545 * this by by adding one to the first split's 1546 * good_child. If it overflows, then "carry over" to 1547 * the next split (like counting in base is_children, 1548 * but each digit can have a different base). 1549 */ 1550 for (indirect_split_t *is = list_head(&iv->iv_splits); 1551 is != NULL; is = list_next(&iv->iv_splits, is)) { 1552 is->is_good_child++; 1553 if (is->is_good_child < is->is_children) { 1554 more = B_TRUE; 1555 break; 1556 } 1557 is->is_good_child = 0; 1558 } 1559 } else if (++attempts < attempts_max) { 1560 /* 1561 * There are too many combinations to try all of them 1562 * in a reasonable amount of time, so try a fixed 1563 * number of random combinations, after which we'll 1564 * consider the block unrecoverable. 1565 */ 1566 for (indirect_split_t *is = list_head(&iv->iv_splits); 1567 is != NULL; is = list_next(&iv->iv_splits, is)) { 1568 is->is_good_child = 1569 spa_get_random(is->is_children); 1570 } 1571 more = B_TRUE; 1572 } 1573 if (!more) { 1574 /* All combinations failed. */ 1575 zio->io_error = ret; 1576 vdev_indirect_all_checksum_errors(zio); 1577 zio_checksum_verified(zio); 1578 return; 1579 } 1580 } 1581} 1582 1583static void 1584vdev_indirect_io_done(zio_t *zio) 1585{ 1586 indirect_vsd_t *iv = zio->io_vsd; 1587 1588 if (iv->iv_reconstruct) { 1589 /* 1590 * We have read all copies of the data (e.g. from mirrors), 1591 * either because this was a scrub/resilver, or because the 1592 * one-copy read didn't checksum correctly. 1593 */ 1594 vdev_indirect_reconstruct_io_done(zio); 1595 return; 1596 } 1597 1598 if (!iv->iv_split_block) { 1599 /* 1600 * This was not a split block, so we passed the BP down, 1601 * and the checksum was handled by the (one) child zio. 1602 */ 1603 return; 1604 } 1605 1606 zio_bad_cksum_t zbc; 1607 int ret = zio_checksum_error(zio, &zbc); 1608 if (ret == 0) { 1609 zio_checksum_verified(zio); 1610 return; 1611 } 1612 1613 /* 1614 * The checksum didn't match. Read all copies of all splits, and 1615 * then we will try to reconstruct. The next time 1616 * vdev_indirect_io_done() is called, iv_reconstruct will be set. 1617 */ 1618 vdev_indirect_read_all(zio); 1619 1620 zio_vdev_io_redone(zio); 1621} 1622 |
|
1105vdev_ops_t vdev_indirect_ops = { 1106 vdev_indirect_open, 1107 vdev_indirect_close, 1108 vdev_default_asize, 1109 vdev_indirect_io_start, 1110 vdev_indirect_io_done, 1111 NULL, 1112 NULL, 1113 NULL, 1114 NULL, 1115 vdev_indirect_remap, 1116 VDEV_TYPE_INDIRECT, /* name of this vdev type */ 1117 B_FALSE /* leaf vdev */ 1118}; | 1623vdev_ops_t vdev_indirect_ops = { 1624 vdev_indirect_open, 1625 vdev_indirect_close, 1626 vdev_default_asize, 1627 vdev_indirect_io_start, 1628 vdev_indirect_io_done, 1629 NULL, 1630 NULL, 1631 NULL, 1632 NULL, 1633 vdev_indirect_remap, 1634 VDEV_TYPE_INDIRECT, /* name of this vdev type */ 1635 B_FALSE /* leaf vdev */ 1636}; |