vdev_indirect.c: r339104 (deleted) vs. r339106 (added)
1/*
2 * CDDL HEADER START
3 *
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
8 *

--- 9 unchanged lines hidden ---

18 */
19
20#include <sys/zfs_context.h>
21#include <sys/spa.h>
22#include <sys/spa_impl.h>
23#include <sys/vdev_impl.h>
24#include <sys/fs/zfs.h>
25#include <sys/zio.h>
26#include <sys/zio_checksum.h>
27#include <sys/metaslab.h>
28#include <sys/refcount.h>
29#include <sys/dmu.h>
30#include <sys/vdev_indirect_mapping.h>
31#include <sys/dmu_tx.h>
32#include <sys/dsl_synctask.h>
33#include <sys/zap.h>
34#include <sys/abd.h>

--- 7 unchanged lines hidden ---

42 * to access the DVA. Unfortunately, this mapping did not respect
43 * logical block boundaries when it was first created, and so a DVA on
44 * this indirect vdev may be "split" into multiple sections that each
45 * map to a different location. As a consequence, not all DVAs can be
46 * translated to an equivalent new DVA. Instead we must provide a
47 * "vdev_remap" operation that executes a callback on each contiguous
48 * segment of the new location. This function is used in multiple ways:
49 *
49 * - reads and repair writes to this device use the callback to create
50 * a child io for each mapped segment.
50 * - i/os to this vdev use the callback to determine where the
51 * data is now located, and issue child i/os for each segment's new
52 * location.
53 *
52 * - frees and claims to this device use the callback to free or claim
54 * - frees and claims to this vdev use the callback to free or claim
55 * each mapped segment. (Note that we don't actually need to claim
56 * log blocks on indirect vdevs, because we don't allocate to
57 * removing vdevs. However, zdb uses zio_claim() for its leak
58 * detection.)
59 */
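
As a minimal illustration of the callback contract described above (a hypothetical helper, not part of this change; it assumes only the callback signature used by vdev_indirect_gather_splits() below):

/*
 * Hypothetical sketch of a "vdev_remap" callback: count how many
 * contiguous segments one DVA maps to.  Each invocation receives the
 * segment's offset within the original i/o, the vdev and offset it now
 * maps to, its size, and the caller's argument.
 */
static void
count_segments_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	uint64_t *nsegs = arg;

	(*nsegs)++;
}

/*
 * Usage sketch: a DVA is contiguously mapped iff this reports one segment.
 *
 *	uint64_t nsegs = 0;
 *	vdev_indirect_remap(vd, offset, size, count_segments_cb, &nsegs);
 */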
60
61/*
62 * "Big theory statement" for how we mark blocks obsolete.

--- 138 unchanged lines hidden ---

201 * This is used by the test suite so that it can ensure that certain
202 * actions happen while in the middle of a condense (which might otherwise
203 * complete too quickly). If used to reduce the performance impact of
204 * condensing in production, a maximum value of 1 should be sufficient.
205 */
206int zfs_condense_indirect_commit_entry_delay_ticks = 0;
207
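
For illustration, a tick delay like this would typically be applied with the kernel's delay() routine; a minimal sketch follows (hypothetical helper; the actual call site is in the condensing code hidden above):

/*
 * Hypothetical sketch of applying the commit-entry delay.  delay()
 * sleeps for the given number of clock ticks.
 */
static void
condense_entry_delay(void)
{
	if (zfs_condense_indirect_commit_entry_delay_ticks > 0)
		delay(zfs_condense_indirect_commit_entry_delay_ticks);
}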
208/*
209 * If a split block contains more than this many segments, consider it too
210 * computationally expensive to check all (2^num_segments) possible
211 * combinations. Instead, try at most
212 * 2^zfs_reconstruct_indirect_segments_max randomly-selected combinations.
213 *
214 * This is reasonable if only a few segment copies are damaged and the
215 * majority of segment copies are good. This allows all the segment copies to
216 * participate fairly in the reconstruction and prevents the repeated use of
217 * one bad copy.
218 */
219int zfs_reconstruct_indirect_segments_max = 10;
220
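As a rough sketch of the bound this tunable imposes, assuming two copies per segment as in the 2^num_segments estimate above (hypothetical helper, not part of this change):

/*
 * Hypothetical helper: upper bound on reconstruction attempts for a
 * block split into `segments` pieces, assuming two copies of each.
 * Below the cutoff we can afford to enumerate every combination;
 * above it we fall back to a fixed number of random combinations.
 */
static uint64_t
reconstruct_attempt_bound(int segments)
{
	if (segments <= zfs_reconstruct_indirect_segments_max)
		return (1ULL << segments);
	return (1ULL << zfs_reconstruct_indirect_segments_max);
}
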
221/*
222 * The indirect_child_t represents the vdev that we will read from, when we
223 * need to read all copies of the data (e.g. for scrub or reconstruction).
224 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
225 * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
226 * ic_vdev is a child of the mirror.
227 */
228typedef struct indirect_child {
229 abd_t *ic_data;
230 vdev_t *ic_vdev;
231} indirect_child_t;
232
233/*
234 * The indirect_split_t represents one mapped segment of an i/o to the
235 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
236 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
237 * For split blocks, there will be several of these.
238 */
239typedef struct indirect_split {
240 list_node_t is_node; /* link on iv_splits */
241
242 /*
243 * is_split_offset is the offset into the i/o.
244 * This is the sum of the previous splits' is_size's.
245 */
246 uint64_t is_split_offset;
247
248 vdev_t *is_vdev; /* top-level vdev */
249 uint64_t is_target_offset; /* offset on is_vdev */
250 uint64_t is_size;
251 int is_children; /* number of entries in is_child[] */
252
253 /*
254 * is_good_child is the child that we are currently using to
255 * attempt reconstruction.
256 */
257 int is_good_child;
258
259 indirect_child_t is_child[1]; /* variable-length */
260} indirect_split_t;
261
262/*
263 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
264 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
265 */
266typedef struct indirect_vsd {
267 boolean_t iv_split_block;
268 boolean_t iv_reconstruct;
269
270 list_t iv_splits; /* list of indirect_split_t's */
271} indirect_vsd_t;
272
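The layout these three structures describe can be summarized in a small sketch (hypothetical assertion helper): the splits partition the i/o in order, so each is_split_offset equals the sum of the preceding is_size values.

/*
 * Hypothetical sanity check of the invariant described above: splits
 * are kept in i/o order on iv_splits, each beginning where the
 * previous one ended and together covering the whole i/o.
 */
static void
check_split_offsets(indirect_vsd_t *iv)
{
	uint64_t expected = 0;

	for (indirect_split_t *is = list_head(&iv->iv_splits);
	    is != NULL; is = list_next(&iv->iv_splits, is)) {
		ASSERT3U(is->is_split_offset, ==, expected);
		expected += is->is_size;
	}
}
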
273static void
274vdev_indirect_map_free(zio_t *zio)
275{
276 indirect_vsd_t *iv = zio->io_vsd;
277
278 indirect_split_t *is;
279 while ((is = list_head(&iv->iv_splits)) != NULL) {
280 for (int c = 0; c < is->is_children; c++) {
281 indirect_child_t *ic = &is->is_child[c];
282 if (ic->ic_data != NULL)
283 abd_free(ic->ic_data);
284 }
285 list_remove(&iv->iv_splits, is);
286 kmem_free(is,
287 offsetof(indirect_split_t, is_child[is->is_children]));
288 }
289 kmem_free(iv, sizeof (*iv));
290}
291
292static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
293 vdev_indirect_map_free,
294 zio_vsd_default_cksum_report
295};
296/*
297 * Mark the given offset and size as being obsolete.
298 */
299void
300vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
301{
302 spa_t *spa = vd->vdev_spa;
303
304 ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);

--- 598 unchanged lines hidden ---

903
904/* ARGSUSED */
905static void
906vdev_indirect_close(vdev_t *vd)
907{
908}
909
910/* ARGSUSED */
821static void
822vdev_indirect_io_done(zio_t *zio)
823{
824}
825
826/* ARGSUSED */
911static int
912vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
913 uint64_t *logical_ashift, uint64_t *physical_ashift)
914{
915 *psize = *max_psize = vd->vdev_asize +
916 VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
917 *logical_ashift = vd->vdev_ashift;
918 *physical_ashift = vd->vdev_physical_ashift;

--- 227 unchanged lines hidden ---

1146
1147 mutex_enter(&pio->io_lock);
1148 pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
1149 mutex_exit(&pio->io_lock);
1150
1151 abd_put(zio->io_abd);
1152}
1153
1154/*
1155 * This is a callback for vdev_indirect_remap() which allocates an
1156 * indirect_split_t for each split segment and adds it to iv_splits.
1157 */
1158static void
1071vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
1159vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
1160 uint64_t size, void *arg)
1161{
1162 zio_t *zio = arg;
1163 indirect_vsd_t *iv = zio->io_vsd;
1164
1165 ASSERT3P(vd, !=, NULL);
1166
1167 if (vd->vdev_ops == &vdev_indirect_ops)
1168 return;
1169
1081 zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
1082 abd_get_offset(zio->io_abd, split_offset),
1083 size, zio->io_type, zio->io_priority,
1084 0, vdev_indirect_child_io_done, zio));
1170 int n = 1;
1171 if (vd->vdev_ops == &vdev_mirror_ops)
1172 n = vd->vdev_children;
1173
1174 indirect_split_t *is =
1175 kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
1176
1177 is->is_children = n;
1178 is->is_size = size;
1179 is->is_split_offset = split_offset;
1180 is->is_target_offset = offset;
1181 is->is_vdev = vd;
1182
1183 /*
1184 * Note that we only consider multiple copies of the data for
1185 * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even
1186 * though they use the same ops as mirror, because there's only one
1187 * "good" copy under the replacing/spare.
1188 */
1189 if (vd->vdev_ops == &vdev_mirror_ops) {
1190 for (int i = 0; i < n; i++) {
1191 is->is_child[i].ic_vdev = vd->vdev_child[i];
1192 }
1193 } else {
1194 is->is_child[0].ic_vdev = vd;
1195 }
1196
1197 list_insert_tail(&iv->iv_splits, is);
1198}
1199
1200static void
1201vdev_indirect_read_split_done(zio_t *zio)
1202{
1203 indirect_child_t *ic = zio->io_private;
1204
1205 if (zio->io_error != 0) {
1206 /*
1207 * Clear ic_data to indicate that we do not have data for this
1208 * child.
1209 */
1210 abd_free(ic->ic_data);
1211 ic->ic_data = NULL;
1212 }
1213}
1214
1215/*
1216 * Issue reads for all copies (mirror children) of all splits.
1217 */
1218static void
1219vdev_indirect_read_all(zio_t *zio)
1220{
1221 indirect_vsd_t *iv = zio->io_vsd;
1222
1223 for (indirect_split_t *is = list_head(&iv->iv_splits);
1224 is != NULL; is = list_next(&iv->iv_splits, is)) {
1225 for (int i = 0; i < is->is_children; i++) {
1226 indirect_child_t *ic = &is->is_child[i];
1227
1228 if (!vdev_readable(ic->ic_vdev))
1229 continue;
1230
1231 /*
1232 * Note, we may read from a child whose DTL
1233 * indicates that the data may not be present here.
1234 * While this might result in a few i/os that will
1235 * likely return incorrect data, it simplifies the
1236 * code since we can treat scrub and resilver
1237 * identically. (The incorrect data will be
1238 * detected and ignored when we verify the
1239 * checksum.)
1240 */
1241
1242 ic->ic_data = abd_alloc_sametype(zio->io_abd,
1243 is->is_size);
1244
1245 zio_nowait(zio_vdev_child_io(zio, NULL,
1246 ic->ic_vdev, is->is_target_offset, ic->ic_data,
1247 is->is_size, zio->io_type, zio->io_priority, 0,
1248 vdev_indirect_read_split_done, ic));
1249 }
1250 }
1251 iv->iv_reconstruct = B_TRUE;
1252}
1253
1254static void
1255vdev_indirect_io_start(zio_t *zio)
1256{
1257 spa_t *spa = zio->io_spa;
1258 indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
1259 list_create(&iv->iv_splits,
1260 sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
1261
1262 zio->io_vsd = iv;
1263 zio->io_vsd_ops = &vdev_indirect_vsd_ops;
1264
1265 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1266 if (zio->io_type != ZIO_TYPE_READ) {
1267 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
1095 ASSERT((zio->io_flags &
1096 (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
1268 /*
1269 * Note: this code can handle other kinds of writes,
1270 * but we don't expect them.
1271 */
1272 ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
1273 ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
1274 }
1275
1276 vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
1100 vdev_indirect_io_start_cb, zio);
1277 vdev_indirect_gather_splits, zio);
1278
1279 indirect_split_t *first = list_head(&iv->iv_splits);
1280 if (first->is_size == zio->io_size) {
1281 /*
1282 * This is not a split block; we are pointing to the entire
1283 * data, which will checksum the same as the original data.
1284 * Pass the BP down so that the child i/o can verify the
1285 * checksum, and try a different location if available
1286 * (e.g. on a mirror).
1287 *
1288 * While this special case could be handled the same as the
1289 * general (split block) case, doing it this way ensures
1290 * that the vast majority of blocks on indirect vdevs
1291 * (which are not split) are handled identically to blocks
1292 * on non-indirect vdevs. This allows us to be less strict
1293 * about performance in the general (but rare) case.
1294 */
1295 ASSERT0(first->is_split_offset);
1296 ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
1297 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
1298 first->is_vdev, first->is_target_offset,
1299 abd_get_offset(zio->io_abd, 0),
1300 zio->io_size, zio->io_type, zio->io_priority, 0,
1301 vdev_indirect_child_io_done, zio));
1302 } else {
1303 iv->iv_split_block = B_TRUE;
1304 if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
1305 /*
1306 * Read all copies. Note that for simplicity,
1307 * we don't bother consulting the DTL in the
1308 * resilver case.
1309 */
1310 vdev_indirect_read_all(zio);
1311 } else {
1312 /*
1313 * Read one copy of each split segment, from the
1314 * top-level vdev. Since we don't know the
1315 * checksum of each split individually, the child
1316 * zio can't ensure that we get the right data.
1317 * E.g. if it's a mirror, it will just read from a
1318 * random (healthy) leaf vdev. We have to verify
1319 * the checksum in vdev_indirect_io_done().
1320 */
1321 for (indirect_split_t *is = list_head(&iv->iv_splits);
1322 is != NULL; is = list_next(&iv->iv_splits, is)) {
1323 zio_nowait(zio_vdev_child_io(zio, NULL,
1324 is->is_vdev, is->is_target_offset,
1325 abd_get_offset(zio->io_abd,
1326 is->is_split_offset),
1327 is->is_size, zio->io_type,
1328 zio->io_priority, 0,
1329 vdev_indirect_child_io_done, zio));
1330 }
1331 }
1332 }
1333
1334 zio_execute(zio);
1335}
1336
1337/*
1338 * Report a checksum error for a child.
1339 */
1340static void
1341vdev_indirect_checksum_error(zio_t *zio,
1342 indirect_split_t *is, indirect_child_t *ic)
1343{
1344 vdev_t *vd = ic->ic_vdev;
1345
1346 if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
1347 return;
1348
1349 mutex_enter(&vd->vdev_stat_lock);
1350 vd->vdev_stat.vs_checksum_errors++;
1351 mutex_exit(&vd->vdev_stat_lock);
1352
1353 zio_bad_cksum_t zbc = { 0 };
1354 void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
1355 abd_t *good_abd = is->is_child[is->is_good_child].ic_data;
1356 void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
1357 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1358 is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
1359 abd_return_buf(ic->ic_data, bad_buf, is->is_size);
1360 abd_return_buf(good_abd, good_buf, is->is_size);
1361}
1362
1363/*
1364 * Issue repair i/os for any incorrect copies. We do this by comparing
1365 * each split segment's correct data (is_good_child's ic_data) with each
1366 * other copy of the data. If they differ, then we overwrite the bad data
1367 * with the good copy. Note that we do this without regard for the DTL's,
1368 * which simplifies this code and also issues the optimal number of writes
1369 * (based on which copies actually read bad data, as opposed to which we
1370 * think might be wrong). For the same reason, we always use
1371 * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
1372 */
1373static void
1374vdev_indirect_repair(zio_t *zio)
1375{
1376 indirect_vsd_t *iv = zio->io_vsd;
1377
1378 enum zio_flag flags = ZIO_FLAG_IO_REPAIR;
1379
1380 if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))
1381 flags |= ZIO_FLAG_SELF_HEAL;
1382
1383 if (!spa_writeable(zio->io_spa))
1384 return;
1385
1386 for (indirect_split_t *is = list_head(&iv->iv_splits);
1387 is != NULL; is = list_next(&iv->iv_splits, is)) {
1388 indirect_child_t *good_child = &is->is_child[is->is_good_child];
1389
1390 for (int c = 0; c < is->is_children; c++) {
1391 indirect_child_t *ic = &is->is_child[c];
1392 if (ic == good_child)
1393 continue;
1394 if (ic->ic_data == NULL)
1395 continue;
1396 if (abd_cmp(good_child->ic_data, ic->ic_data,
1397 is->is_size) == 0)
1398 continue;
1399
1400 zio_nowait(zio_vdev_child_io(zio, NULL,
1401 ic->ic_vdev, is->is_target_offset,
1402 good_child->ic_data, is->is_size,
1403 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
1404 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
1405 NULL, NULL));
1406
1407 vdev_indirect_checksum_error(zio, is, ic);
1408 }
1409 }
1410}
1411
1412/*
1413 * Report checksum errors on all children that we read from.
1414 */
1415static void
1416vdev_indirect_all_checksum_errors(zio_t *zio)
1417{
1418 indirect_vsd_t *iv = zio->io_vsd;
1419
1420 if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
1421 return;
1422
1423 for (indirect_split_t *is = list_head(&iv->iv_splits);
1424 is != NULL; is = list_next(&iv->iv_splits, is)) {
1425 for (int c = 0; c < is->is_children; c++) {
1426 indirect_child_t *ic = &is->is_child[c];
1427
1428 if (ic->ic_data == NULL)
1429 continue;
1430
1431 vdev_t *vd = ic->ic_vdev;
1432
1433 mutex_enter(&vd->vdev_stat_lock);
1434 vd->vdev_stat.vs_checksum_errors++;
1435 mutex_exit(&vd->vdev_stat_lock);
1436
1437 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1438 is->is_target_offset, is->is_size,
1439 NULL, NULL, NULL);
1440 }
1441 }
1442}
1443
1444/*
1445 * This function is called when we have read all copies of the data and need
1446 * to try to find a combination of copies that gives us the right checksum.
1447 *
1448 * If we pointed to any mirror vdevs, this effectively does the job of the
1449 * mirror. The mirror vdev code can't do its own job because we don't know
1450 * the checksum of each split segment individually. We have to try every
1451 * combination of copies of split segments, until we find one that checksums
1452 * correctly. (Or until we have tried all combinations, or have tried
1453 * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we
1454 * set io_error to ECKSUM to propagate the error up to the user.)
1455 *
1456 * For example, if we have 3 segments in the split,
1457 * and each points to a 2-way mirror, we will have the following pieces of
1458 * data:
1459 *
1460 * | mirror child
1461 * split | [0] [1]
1462 * ======|=====================
1463 * A | data_A_0 data_A_1
1464 * B | data_B_0 data_B_1
1465 * C | data_C_0 data_C_1
1466 *
1467 * We will try the following (mirror children)^(number of splits) (2^3=8)
1468 * combinations, which is similar to bitwise-little-endian counting in
1469 * binary. In general each "digit" corresponds to a split segment, and the
1470 * base of each digit is is_children, which can be different for each
1471 * digit.
1472 *
1473 * "low bit" "high bit"
1474 * v v
1475 * data_A_0 data_B_0 data_C_0
1476 * data_A_1 data_B_0 data_C_0
1477 * data_A_0 data_B_1 data_C_0
1478 * data_A_1 data_B_1 data_C_0
1479 * data_A_0 data_B_0 data_C_1
1480 * data_A_1 data_B_0 data_C_1
1481 * data_A_0 data_B_1 data_C_1
1482 * data_A_1 data_B_1 data_C_1
1483 *
1484 * Note that the split segments may be on the same or different top-level
1485 * vdevs. In either case, we try lots of combinations (see
1486 * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has
1487 * small silent errors on all of its children, we can still reconstruct the
1488 * correct data, as long as those errors are at sufficiently-separated
1489 * offsets (specifically, separated by the largest block size - default of
1490 * 128KB, but up to 16MB).
1491 */
1492static void
1493vdev_indirect_reconstruct_io_done(zio_t *zio)
1494{
1495 indirect_vsd_t *iv = zio->io_vsd;
1496 uint64_t attempts = 0;
1497 uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
1498 int segments = 0;
1499
1500 for (indirect_split_t *is = list_head(&iv->iv_splits);
1501 is != NULL; is = list_next(&iv->iv_splits, is))
1502 segments++;
1503
1504 for (;;) {
1505 /* copy data from splits to main zio */
1506 int ret;
1507 for (indirect_split_t *is = list_head(&iv->iv_splits);
1508 is != NULL; is = list_next(&iv->iv_splits, is)) {
1509
1510 /*
1511 * If this child failed, its ic_data will be NULL.
1512 * Skip this combination.
1513 */
1514 if (is->is_child[is->is_good_child].ic_data == NULL) {
1515 ret = EIO;
1516 goto next;
1517 }
1518
1519 abd_copy_off(zio->io_abd,
1520 is->is_child[is->is_good_child].ic_data,
1521 is->is_split_offset, 0, is->is_size);
1522 }
1523
1524 /* See if this checksum matches. */
1525 zio_bad_cksum_t zbc;
1526 ret = zio_checksum_error(zio, &zbc);
1527 if (ret == 0) {
1528 /* Found a matching checksum. Issue repair i/os. */
1529 vdev_indirect_repair(zio);
1530 zio_checksum_verified(zio);
1531 return;
1532 }
1533
1534 /*
1535 * Checksum failed; try a different combination of split
1536 * children.
1537 */
1538 boolean_t more;
1539next:
1540 more = B_FALSE;
1541 if (segments <= zfs_reconstruct_indirect_segments_max) {
1542 /*
1543 * There are relatively few segments, so
1544 * deterministically check all combinations. We do
1545 * this by adding one to the first split's
1546 * good_child. If it overflows, then "carry over" to
1547 * the next split (like counting in base is_children,
1548 * but each digit can have a different base).
1549 */
1550 for (indirect_split_t *is = list_head(&iv->iv_splits);
1551 is != NULL; is = list_next(&iv->iv_splits, is)) {
1552 is->is_good_child++;
1553 if (is->is_good_child < is->is_children) {
1554 more = B_TRUE;
1555 break;
1556 }
1557 is->is_good_child = 0;
1558 }
1559 } else if (++attempts < attempts_max) {
1560 /*
1561 * There are too many combinations to try all of them
1562 * in a reasonable amount of time, so try a fixed
1563 * number of random combinations, after which we'll
1564 * consider the block unrecoverable.
1565 */
1566 for (indirect_split_t *is = list_head(&iv->iv_splits);
1567 is != NULL; is = list_next(&iv->iv_splits, is)) {
1568 is->is_good_child =
1569 spa_get_random(is->is_children);
1570 }
1571 more = B_TRUE;
1572 }
1573 if (!more) {
1574 /* All combinations failed. */
1575 zio->io_error = ret;
1576 vdev_indirect_all_checksum_errors(zio);
1577 zio_checksum_verified(zio);
1578 return;
1579 }
1580 }
1581}
1582
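The mixed-radix "counting" used above can be isolated into a standalone sketch (hypothetical helper; the real loop additionally caps the number of random attempts):

/*
 * Hypothetical standalone version of the combination step: treat each
 * split as one digit whose base is its is_children, and add one with
 * carry.  Returns B_FALSE once the count wraps to all zeros, i.e.
 * every combination of is_good_child values has been visited.
 */
static boolean_t
advance_combination(indirect_vsd_t *iv)
{
	for (indirect_split_t *is = list_head(&iv->iv_splits);
	    is != NULL; is = list_next(&iv->iv_splits, is)) {
		if (++is->is_good_child < is->is_children)
			return (B_TRUE);
		is->is_good_child = 0;	/* carry into the next split */
	}
	return (B_FALSE);
}
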
1583static void
1584vdev_indirect_io_done(zio_t *zio)
1585{
1586 indirect_vsd_t *iv = zio->io_vsd;
1587
1588 if (iv->iv_reconstruct) {
1589 /*
1590 * We have read all copies of the data (e.g. from mirrors),
1591 * either because this was a scrub/resilver, or because the
1592 * one-copy read didn't checksum correctly.
1593 */
1594 vdev_indirect_reconstruct_io_done(zio);
1595 return;
1596 }
1597
1598 if (!iv->iv_split_block) {
1599 /*
1600 * This was not a split block, so we passed the BP down,
1601 * and the checksum was handled by the (one) child zio.
1602 */
1603 return;
1604 }
1605
1606 zio_bad_cksum_t zbc;
1607 int ret = zio_checksum_error(zio, &zbc);
1608 if (ret == 0) {
1609 zio_checksum_verified(zio);
1610 return;
1611 }
1612
1613 /*
1614 * The checksum didn't match. Read all copies of all splits, and
1615 * then we will try to reconstruct. The next time
1616 * vdev_indirect_io_done() is called, iv_reconstruct will be set.
1617 */
1618 vdev_indirect_read_all(zio);
1619
1620 zio_vdev_io_redone(zio);
1621}
1622
1623vdev_ops_t vdev_indirect_ops = {
1624 vdev_indirect_open,
1625 vdev_indirect_close,
1626 vdev_default_asize,
1627 vdev_indirect_io_start,
1628 vdev_indirect_io_done,
1629 NULL,
1630 NULL,
1631 NULL,
1632 NULL,
1633 vdev_indirect_remap,
1634 VDEV_TYPE_INDIRECT, /* name of this vdev type */
1635 B_FALSE /* leaf vdev */
1636};