Deleted Added
full compact
zio.c (209261) zio.c (209962)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 5 unchanged lines hidden (view full) ---

14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 5 unchanged lines hidden (view full) ---

14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/zfs_context.h>
27#include <sys/fm/fs/zfs.h>
28#include <sys/spa.h>
29#include <sys/txg.h>
30#include <sys/spa_impl.h>

--- 40 unchanged lines hidden (view full) ---

71#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */
72
73/*
74 * ==========================================================================
75 * I/O kmem caches
76 * ==========================================================================
77 */
78kmem_cache_t *zio_cache;
23 * Use is subject to license terms.
24 */
25
26#include <sys/zfs_context.h>
27#include <sys/fm/fs/zfs.h>
28#include <sys/spa.h>
29#include <sys/txg.h>
30#include <sys/spa_impl.h>

--- 40 unchanged lines hidden (view full) ---

71#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */
72
73/*
74 * ==========================================================================
75 * I/O kmem caches
76 * ==========================================================================
77 */
78kmem_cache_t *zio_cache;
79kmem_cache_t *zio_link_cache;
79kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
80kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
81
82#ifdef _KERNEL
83extern vmem_t *zio_alloc_arena;
84#endif
85
86/*
87 * An allocating zio is one that either currently has the DVA allocate
88 * stage set or will have it later in its lifetime.
89 */
90#define IO_IS_ALLOCATING(zio) \
91 ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
92
93void
94zio_init(void)
95{
96 size_t c;
80kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
81kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
82
83#ifdef _KERNEL
84extern vmem_t *zio_alloc_arena;
85#endif
86
87/*
88 * An allocating zio is one that either currently has the DVA allocate
89 * stage set or will have it later in its lifetime.
90 */
91#define IO_IS_ALLOCATING(zio) \
92 ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
93
94void
95zio_init(void)
96{
97 size_t c;
97 zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
98 NULL, NULL, NULL, NULL, NULL, 0);
98 zio_cache = kmem_cache_create("zio_cache",
99 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
100 zio_link_cache = kmem_cache_create("zio_link_cache",
101 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
99
100 /*
101 * For small buffers, we want a cache for each multiple of
102 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
103 * for each quarter-power of 2. For large buffers, we want
104 * a cache for each multiple of PAGESIZE.
105 */
106 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {

--- 53 unchanged lines hidden (view full) ---

160
161 if (zio_data_buf_cache[c] != last_data_cache) {
162 last_data_cache = zio_data_buf_cache[c];
163 kmem_cache_destroy(zio_data_buf_cache[c]);
164 }
165 zio_data_buf_cache[c] = NULL;
166 }
167
102
103 /*
104 * For small buffers, we want a cache for each multiple of
105 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
106 * for each quarter-power of 2. For large buffers, we want
107 * a cache for each multiple of PAGESIZE.
108 */
109 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {

--- 53 unchanged lines hidden (view full) ---

163
164 if (zio_data_buf_cache[c] != last_data_cache) {
165 last_data_cache = zio_data_buf_cache[c];
166 kmem_cache_destroy(zio_data_buf_cache[c]);
167 }
168 zio_data_buf_cache[c] = NULL;
169 }
170
171 kmem_cache_destroy(zio_link_cache);
168 kmem_cache_destroy(zio_cache);
169
170 zio_inject_fini();
171}
172
173/*
174 * ==========================================================================
175 * Allocate and free I/O buffers

--- 130 unchanged lines hidden (view full) ---

306 zio->io_error = EIO;
307}
308
309/*
310 * ==========================================================================
311 * I/O parent/child relationships and pipeline interlocks
312 * ==========================================================================
313 */
172 kmem_cache_destroy(zio_cache);
173
174 zio_inject_fini();
175}
176
177/*
178 * ==========================================================================
179 * Allocate and free I/O buffers

--- 130 unchanged lines hidden (view full) ---

310 zio->io_error = EIO;
311}
312
313/*
314 * ==========================================================================
315 * I/O parent/child relationships and pipeline interlocks
316 * ==========================================================================
317 */
318/*
319 * NOTE - Callers of zio_walk_parents() and zio_walk_children() must
320 * continue calling these functions until they return NULL.
321 * Otherwise, the next caller will pick up the list walk in
322 * some indeterminate state. (Otherwise every caller would
323 * have to pass in a cookie to keep the state represented by
324 * io_walk_link, which gets annoying.)
325 */
326zio_t *
327zio_walk_parents(zio_t *cio)
328{
329 zio_link_t *zl = cio->io_walk_link;
330 list_t *pl = &cio->io_parent_list;
314
331
315static void
316zio_add_child(zio_t *pio, zio_t *zio)
332 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
333 cio->io_walk_link = zl;
334
335 if (zl == NULL)
336 return (NULL);
337
338 ASSERT(zl->zl_child == cio);
339 return (zl->zl_parent);
340}
341
342zio_t *
343zio_walk_children(zio_t *pio)
317{
344{
345 zio_link_t *zl = pio->io_walk_link;
346 list_t *cl = &pio->io_child_list;
347
348 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
349 pio->io_walk_link = zl;
350
351 if (zl == NULL)
352 return (NULL);
353
354 ASSERT(zl->zl_parent == pio);
355 return (zl->zl_child);
356}
357
/*
 * Return the single parent of 'cio', or NULL if it has no parent.
 *
 * Assuming no walk is already in progress on 'cio' (io_walk_link is NULL),
 * the first zio_walk_parents() call yields the parent at the head of
 * cio's parent list.  The second call is then required to return NULL:
 * that VERIFY()s there is no further parent, and, because a walk that
 * reaches the end stores NULL back into io_walk_link, it also leaves the
 * walk state reset for the next caller.
 */
358zio_t *
359zio_unique_parent(zio_t *cio)
360{
361 zio_t *pio = zio_walk_parents(cio);
362
/* Finish the walk: there must be no second parent. */
363 VERIFY(zio_walk_parents(cio) == NULL);
364 return (pio);
365}
366
367void
368zio_add_child(zio_t *pio, zio_t *cio)
369{
370 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
371
372 /*
373 * Logical I/Os can have logical, gang, or vdev children.
374 * Gang I/Os can have gang or vdev children.
375 * Vdev I/Os can only have vdev children.
376 * The following ASSERT captures all of these constraints.
377 */
378 ASSERT(cio->io_child_type <= pio->io_child_type);
379
380 zl->zl_parent = pio;
381 zl->zl_child = cio;
382
383 mutex_enter(&cio->io_lock);
318 mutex_enter(&pio->io_lock);
384 mutex_enter(&pio->io_lock);
319 if (zio->io_stage < ZIO_STAGE_READY)
320 pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
321 if (zio->io_stage < ZIO_STAGE_DONE)
322 pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
323 zio->io_sibling_prev = NULL;
324 zio->io_sibling_next = pio->io_child;
325 if (pio->io_child != NULL)
326 pio->io_child->io_sibling_prev = zio;
327 pio->io_child = zio;
328 zio->io_parent = pio;
385
386 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
387
388 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
389 pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
390
391 list_insert_head(&pio->io_child_list, zl);
392 list_insert_head(&cio->io_parent_list, zl);
393
329 mutex_exit(&pio->io_lock);
394 mutex_exit(&pio->io_lock);
395 mutex_exit(&cio->io_lock);
330}
331
332static void
396}
397
398static void
333zio_remove_child(zio_t *pio, zio_t *zio)
399zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
334{
400{
335 zio_t *next, *prev;
401 ASSERT(zl->zl_parent == pio);
402 ASSERT(zl->zl_child == cio);
336
403
337 ASSERT(zio->io_parent == pio);
338
404 mutex_enter(&cio->io_lock);
339 mutex_enter(&pio->io_lock);
405 mutex_enter(&pio->io_lock);
340 next = zio->io_sibling_next;
341 prev = zio->io_sibling_prev;
342 if (next != NULL)
343 next->io_sibling_prev = prev;
344 if (prev != NULL)
345 prev->io_sibling_next = next;
346 if (pio->io_child == zio)
347 pio->io_child = next;
406
407 list_remove(&pio->io_child_list, zl);
408 list_remove(&cio->io_parent_list, zl);
409
348 mutex_exit(&pio->io_lock);
410 mutex_exit(&pio->io_lock);
411 mutex_exit(&cio->io_lock);
412
413 kmem_cache_free(zio_link_cache, zl);
349}
350
351static boolean_t
352zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
353{
354 uint64_t *countp = &zio->io_children[child][wait];
355 boolean_t waiting = B_FALSE;
356

--- 58 unchanged lines hidden (view full) ---

415 ASSERT(vd || stage == ZIO_STAGE_OPEN);
416
417 zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
418 bzero(zio, sizeof (zio_t));
419
420 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
421 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
422
414}
415
416static boolean_t
417zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
418{
419 uint64_t *countp = &zio->io_children[child][wait];
420 boolean_t waiting = B_FALSE;
421

--- 58 unchanged lines hidden (view full) ---

480 ASSERT(vd || stage == ZIO_STAGE_OPEN);
481
482 zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
483 bzero(zio, sizeof (zio_t));
484
485 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
486 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
487
488 list_create(&zio->io_parent_list, sizeof (zio_link_t),
489 offsetof(zio_link_t, zl_parent_node));
490 list_create(&zio->io_child_list, sizeof (zio_link_t),
491 offsetof(zio_link_t, zl_child_node));
492
423 if (vd != NULL)
424 zio->io_child_type = ZIO_CHILD_VDEV;
425 else if (flags & ZIO_FLAG_GANG_CHILD)
426 zio->io_child_type = ZIO_CHILD_GANG;
427 else
428 zio->io_child_type = ZIO_CHILD_LOGICAL;
429
430 if (bp != NULL) {
431 zio->io_bp = bp;
432 zio->io_bp_copy = *bp;
433 zio->io_bp_orig = *bp;
434 if (type != ZIO_TYPE_WRITE)
435 zio->io_bp = &zio->io_bp_copy; /* so caller can free */
493 if (vd != NULL)
494 zio->io_child_type = ZIO_CHILD_VDEV;
495 else if (flags & ZIO_FLAG_GANG_CHILD)
496 zio->io_child_type = ZIO_CHILD_GANG;
497 else
498 zio->io_child_type = ZIO_CHILD_LOGICAL;
499
500 if (bp != NULL) {
501 zio->io_bp = bp;
502 zio->io_bp_copy = *bp;
503 zio->io_bp_orig = *bp;
504 if (type != ZIO_TYPE_WRITE)
505 zio->io_bp = &zio->io_bp_copy; /* so caller can free */
436 if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
437 if (BP_IS_GANG(bp))
438 pipeline |= ZIO_GANG_STAGES;
506 if (zio->io_child_type == ZIO_CHILD_LOGICAL)
439 zio->io_logical = zio;
507 zio->io_logical = zio;
440 }
508 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
509 pipeline |= ZIO_GANG_STAGES;
441 }
442
443 zio->io_spa = spa;
444 zio->io_txg = txg;
445 zio->io_data = data;
446 zio->io_size = size;
447 zio->io_done = done;
448 zio->io_private = private;
449 zio->io_type = type;
450 zio->io_priority = priority;
451 zio->io_vd = vd;
452 zio->io_offset = offset;
453 zio->io_orig_flags = zio->io_flags = flags;
454 zio->io_orig_stage = zio->io_stage = stage;
455 zio->io_orig_pipeline = zio->io_pipeline = pipeline;
456
510 }
511
512 zio->io_spa = spa;
513 zio->io_txg = txg;
514 zio->io_data = data;
515 zio->io_size = size;
516 zio->io_done = done;
517 zio->io_private = private;
518 zio->io_type = type;
519 zio->io_priority = priority;
520 zio->io_vd = vd;
521 zio->io_offset = offset;
522 zio->io_orig_flags = zio->io_flags = flags;
523 zio->io_orig_stage = zio->io_stage = stage;
524 zio->io_orig_pipeline = zio->io_pipeline = pipeline;
525
526 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
527 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
528
457 if (zb != NULL)
458 zio->io_bookmark = *zb;
459
460 if (pio != NULL) {
529 if (zb != NULL)
530 zio->io_bookmark = *zb;
531
532 if (pio != NULL) {
461 /*
462 * Logical I/Os can have logical, gang, or vdev children.
463 * Gang I/Os can have gang or vdev children.
464 * Vdev I/Os can only have vdev children.
465 * The following ASSERT captures all of these constraints.
466 */
467 ASSERT(zio->io_child_type <= pio->io_child_type);
468 if (zio->io_logical == NULL)
469 zio->io_logical = pio->io_logical;
533 if (zio->io_logical == NULL)
534 zio->io_logical = pio->io_logical;
535 if (zio->io_child_type == ZIO_CHILD_GANG)
536 zio->io_gang_leader = pio->io_gang_leader;
470 zio_add_child(pio, zio);
471 }
472
473 return (zio);
474}
475
476static void
477zio_destroy(zio_t *zio)
478{
537 zio_add_child(pio, zio);
538 }
539
540 return (zio);
541}
542
543static void
544zio_destroy(zio_t *zio)
545{
479 spa_t *spa = zio->io_spa;
480 uint8_t async_root = zio->io_async_root;
481
546 list_destroy(&zio->io_parent_list);
547 list_destroy(&zio->io_child_list);
482 mutex_destroy(&zio->io_lock);
483 cv_destroy(&zio->io_cv);
484 kmem_cache_free(zio_cache, zio);
548 mutex_destroy(&zio->io_lock);
549 cv_destroy(&zio->io_cv);
550 kmem_cache_free(zio_cache, zio);
485
486 if (async_root) {
487 mutex_enter(&spa->spa_async_root_lock);
488 if (--spa->spa_async_root_count == 0)
489 cv_broadcast(&spa->spa_async_root_cv);
490 mutex_exit(&spa->spa_async_root_lock);
491 }
492}
493
494zio_t *
551}
552
553zio_t *
495zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
496 int flags)
554zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
555 void *private, int flags)
497{
498 zio_t *zio;
499
500 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
556{
557 zio_t *zio;
558
559 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
501 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
560 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
502 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
503
504 return (zio);
505}
506
507zio_t *
508zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
509{
561 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
562
563 return (zio);
564}
565
566zio_t *
567zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
568{
510 return (zio_null(NULL, spa, done, private, flags));
569 return (zio_null(NULL, spa, NULL, done, private, flags));
511}
512
513zio_t *
514zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
515 void *data, uint64_t size, zio_done_func_t *done, void *private,
516 int priority, int flags, const zbookmark_t *zb)
517{
518 zio_t *zio;

--- 52 unchanged lines hidden (view full) ---

571zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
572 zio_done_func_t *done, void *private, int flags)
573{
574 zio_t *zio;
575
576 ASSERT(!BP_IS_HOLE(bp));
577
578 if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
570}
571
572zio_t *
573zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
574 void *data, uint64_t size, zio_done_func_t *done, void *private,
575 int priority, int flags, const zbookmark_t *zb)
576{
577 zio_t *zio;

--- 52 unchanged lines hidden (view full) ---

630zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
631 zio_done_func_t *done, void *private, int flags)
632{
633 zio_t *zio;
634
635 ASSERT(!BP_IS_HOLE(bp));
636
637 if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
579 return (zio_null(pio, spa, NULL, NULL, flags));
638 return (zio_null(pio, spa, NULL, NULL, NULL, flags));
580
581 if (txg == spa->spa_syncing_txg &&
582 spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
583 bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
639
640 if (txg == spa->spa_syncing_txg &&
641 spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
642 bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
584 return (zio_null(pio, spa, NULL, NULL, flags));
643 return (zio_null(pio, spa, NULL, NULL, NULL, flags));
585 }
586
587 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
588 done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
589 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
590
591 return (zio);
592}

--- 34 unchanged lines hidden (view full) ---

627
628 if (vd->vdev_children == 0) {
629 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
630 ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
631 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
632
633 zio->io_cmd = cmd;
634 } else {
644 }
645
646 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
647 done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
648 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
649
650 return (zio);
651}

--- 34 unchanged lines hidden (view full) ---

686
687 if (vd->vdev_children == 0) {
688 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
689 ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
690 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
691
692 zio->io_cmd = cmd;
693 } else {
635 zio = zio_null(pio, spa, NULL, NULL, flags);
694 zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
636
637 for (c = 0; c < vd->vdev_children; c++)
638 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
639 done, private, priority, flags));
640 }
641
642 return (zio);
643}

--- 121 unchanged lines hidden (view full) ---

765 * ==========================================================================
766 */
767
768static int
769zio_read_bp_init(zio_t *zio)
770{
771 blkptr_t *bp = zio->io_bp;
772
695
696 for (c = 0; c < vd->vdev_children; c++)
697 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
698 done, private, priority, flags));
699 }
700
701 return (zio);
702}

--- 121 unchanged lines hidden (view full) ---

824 * ==========================================================================
825 */
826
827static int
828zio_read_bp_init(zio_t *zio)
829{
830 blkptr_t *bp = zio->io_bp;
831
773 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) {
832 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
833 zio->io_child_type == ZIO_CHILD_LOGICAL &&
834 !(zio->io_flags & ZIO_FLAG_RAW)) {
774 uint64_t csize = BP_GET_PSIZE(bp);
775 void *cbuf = zio_buf_alloc(csize);
776
777 zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
778 }
779
780 if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
781 zio->io_flags |= ZIO_FLAG_DONT_CACHE;

--- 32 unchanged lines hidden (view full) ---

814 * working on behalf of spa_sync(). For spa_sync() to
815 * converge, it must eventually be the case that we don't
816 * have to allocate new blocks. But compression changes
817 * the blocksize, which forces a reallocate, and makes
818 * convergence take longer. Therefore, after the first
819 * few passes, stop compressing to ensure convergence.
820 */
821 pass = spa_sync_pass(zio->io_spa);
835 uint64_t csize = BP_GET_PSIZE(bp);
836 void *cbuf = zio_buf_alloc(csize);
837
838 zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
839 }
840
841 if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
842 zio->io_flags |= ZIO_FLAG_DONT_CACHE;

--- 32 unchanged lines hidden (view full) ---

875 * working on behalf of spa_sync(). For spa_sync() to
876 * converge, it must eventually be the case that we don't
877 * have to allocate new blocks. But compression changes
878 * the blocksize, which forces a reallocate, and makes
879 * convergence take longer. Therefore, after the first
880 * few passes, stop compressing to ensure convergence.
881 */
882 pass = spa_sync_pass(zio->io_spa);
822 ASSERT(pass > 1);
823
824 if (pass > SYNC_PASS_DONT_COMPRESS)
825 compress = ZIO_COMPRESS_OFF;
826
883
884 if (pass > SYNC_PASS_DONT_COMPRESS)
885 compress = ZIO_COMPRESS_OFF;
886
827 /*
828 * Only MOS (objset 0) data should need to be rewritten.
829 */
830 ASSERT(zio->io_logical->io_bookmark.zb_objset == 0);
831
832 /* Make sure someone doesn't change their mind on overwrites */
833 ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
834 spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
835 }
836
837 if (compress != ZIO_COMPRESS_OFF) {
838 if (!zio_compress_data(compress, zio->io_data, zio->io_size,
839 &cbuf, &csize, &cbufsize)) {

--- 177 unchanged lines hidden (view full) ---

1017 return (error);
1018}
1019
1020void
1021zio_nowait(zio_t *zio)
1022{
1023 ASSERT(zio->io_executor == NULL);
1024
887 /* Make sure someone doesn't change their mind on overwrites */
888 ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
889 spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
890 }
891
892 if (compress != ZIO_COMPRESS_OFF) {
893 if (!zio_compress_data(compress, zio->io_data, zio->io_size,
894 &cbuf, &csize, &cbufsize)) {

--- 177 unchanged lines hidden (view full) ---

1072 return (error);
1073}
1074
1075void
1076zio_nowait(zio_t *zio)
1077{
1078 ASSERT(zio->io_executor == NULL);
1079
1025 if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) {
1080 if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1081 zio_unique_parent(zio) == NULL) {
1026 /*
1027 * This is a logical async I/O with no parent to wait for it.
1082 /*
1083 * This is a logical async I/O with no parent to wait for it.
1028 * Attach it to the pool's global async root zio so that
1029 * spa_unload() has a way of waiting for async I/O to finish.
1084 * We add it to the spa_async_zio_root "Godfather" I/O which
1085 * will ensure it completes prior to unloading the pool.
1030 */
1031 spa_t *spa = zio->io_spa;
1086 */
1087 spa_t *spa = zio->io_spa;
1032 zio->io_async_root = B_TRUE;
1033 mutex_enter(&spa->spa_async_root_lock);
1034 spa->spa_async_root_count++;
1035 mutex_exit(&spa->spa_async_root_lock);
1088
1089 zio_add_child(spa->spa_async_zio_root, zio);
1036 }
1037
1038 zio_execute(zio);
1039}
1040
1041/*
1042 * ==========================================================================
1043 * Reexecute or suspend/resume failed I/O
1044 * ==========================================================================
1045 */
1046
1047static void
1048zio_reexecute(zio_t *pio)
1049{
1090 }
1091
1092 zio_execute(zio);
1093}
1094
1095/*
1096 * ==========================================================================
1097 * Reexecute or suspend/resume failed I/O
1098 * ==========================================================================
1099 */
1100
1101static void
1102zio_reexecute(zio_t *pio)
1103{
1050 zio_t *zio, *zio_next;
1104 zio_t *cio, *cio_next;
1051
1105
1106 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1107 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1108 ASSERT(pio->io_gang_leader == NULL);
1109 ASSERT(pio->io_gang_tree == NULL);
1110
1052 pio->io_flags = pio->io_orig_flags;
1053 pio->io_stage = pio->io_orig_stage;
1054 pio->io_pipeline = pio->io_orig_pipeline;
1055 pio->io_reexecute = 0;
1056 pio->io_error = 0;
1111 pio->io_flags = pio->io_orig_flags;
1112 pio->io_stage = pio->io_orig_stage;
1113 pio->io_pipeline = pio->io_orig_pipeline;
1114 pio->io_reexecute = 0;
1115 pio->io_error = 0;
1116 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1117 pio->io_state[w] = 0;
1057 for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1058 pio->io_child_error[c] = 0;
1059
1060 if (IO_IS_ALLOCATING(pio)) {
1061 /*
1062 * Remember the failed bp so that the io_ready() callback
1063 * can update its accounting upon reexecution. The block
1064 * was already freed in zio_done(); we indicate this with
1065 * a fill count of -1 so that zio_free() knows to skip it.
1066 */
1067 blkptr_t *bp = pio->io_bp;
1068 ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
1069 bp->blk_fill = BLK_FILL_ALREADY_FREED;
1070 pio->io_bp_orig = *bp;
1071 BP_ZERO(bp);
1072 }
1073
1074 /*
1075 * As we reexecute pio's children, new children could be created.
1118 for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1119 pio->io_child_error[c] = 0;
1120
1121 if (IO_IS_ALLOCATING(pio)) {
1122 /*
1123 * Remember the failed bp so that the io_ready() callback
1124 * can update its accounting upon reexecution. The block
1125 * was already freed in zio_done(); we indicate this with
1126 * a fill count of -1 so that zio_free() knows to skip it.
1127 */
1128 blkptr_t *bp = pio->io_bp;
1129 ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
1130 bp->blk_fill = BLK_FILL_ALREADY_FREED;
1131 pio->io_bp_orig = *bp;
1132 BP_ZERO(bp);
1133 }
1134
1135 /*
1136 * As we reexecute pio's children, new children could be created.
1076 * New children go to the head of the io_child list, however,
1137 * New children go to the head of pio's io_child_list, however,
1077 * so we will (correctly) not reexecute them. The key is that
1138 * so we will (correctly) not reexecute them. The key is that
1078 * the remainder of the io_child list, from 'zio_next' onward,
1079 * cannot be affected by any side effects of reexecuting 'zio'.
1139 * the remainder of pio's io_child_list, from 'cio_next' onward,
1140 * cannot be affected by any side effects of reexecuting 'cio'.
1080 */
1141 */
1081 for (zio = pio->io_child; zio != NULL; zio = zio_next) {
1082 zio_next = zio->io_sibling_next;
1142 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1143 cio_next = zio_walk_children(pio);
1083 mutex_enter(&pio->io_lock);
1144 mutex_enter(&pio->io_lock);
1084 pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
1085 pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
1145 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1146 pio->io_children[cio->io_child_type][w]++;
1086 mutex_exit(&pio->io_lock);
1147 mutex_exit(&pio->io_lock);
1087 zio_reexecute(zio);
1148 zio_reexecute(cio);
1088 }
1089
1090 /*
1091 * Now that all children have been reexecuted, execute the parent.
1149 }
1150
1151 /*
1152 * Now that all children have been reexecuted, execute the parent.
1153 * We don't reexecute "The Godfather" I/O here as it's the
1154 * responsibility of the caller to wait on him.
1092 */
1155 */
1093 zio_execute(pio);
1156 if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1157 zio_execute(pio);
1094}
1095
1096void
1097zio_suspend(spa_t *spa, zio_t *zio)
1098{
1099 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1100 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1101 "failure and the failure mode property for this pool "
1102 "is set to panic.", spa_name(spa));
1103
1104 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1105
1106 mutex_enter(&spa->spa_suspend_lock);
1107
1108 if (spa->spa_suspend_zio_root == NULL)
1158}
1159
1160void
1161zio_suspend(spa_t *spa, zio_t *zio)
1162{
1163 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1164 fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1165 "failure and the failure mode property for this pool "
1166 "is set to panic.", spa_name(spa));
1167
1168 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1169
1170 mutex_enter(&spa->spa_suspend_lock);
1171
1172 if (spa->spa_suspend_zio_root == NULL)
1109 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0);
1173 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1174 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1175 ZIO_FLAG_GODFATHER);
1110
1111 spa->spa_suspended = B_TRUE;
1112
1113 if (zio != NULL) {
1176
1177 spa->spa_suspended = B_TRUE;
1178
1179 if (zio != NULL) {
1180 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1114 ASSERT(zio != spa->spa_suspend_zio_root);
1115 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1181 ASSERT(zio != spa->spa_suspend_zio_root);
1182 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1116 ASSERT(zio->io_parent == NULL);
1183 ASSERT(zio_unique_parent(zio) == NULL);
1117 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1118 zio_add_child(spa->spa_suspend_zio_root, zio);
1119 }
1120
1121 mutex_exit(&spa->spa_suspend_lock);
1122}
1123
1184 ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1185 zio_add_child(spa->spa_suspend_zio_root, zio);
1186 }
1187
1188 mutex_exit(&spa->spa_suspend_lock);
1189}
1190
1124void
1191int
1125zio_resume(spa_t *spa)
1126{
1192zio_resume(spa_t *spa)
1193{
1127 zio_t *pio, *zio;
1194 zio_t *pio;
1128
1129 /*
1130 * Reexecute all previously suspended i/o.
1131 */
1132 mutex_enter(&spa->spa_suspend_lock);
1133 spa->spa_suspended = B_FALSE;
1134 cv_broadcast(&spa->spa_suspend_cv);
1135 pio = spa->spa_suspend_zio_root;
1136 spa->spa_suspend_zio_root = NULL;
1137 mutex_exit(&spa->spa_suspend_lock);
1138
1139 if (pio == NULL)
1195
1196 /*
1197 * Reexecute all previously suspended i/o.
1198 */
1199 mutex_enter(&spa->spa_suspend_lock);
1200 spa->spa_suspended = B_FALSE;
1201 cv_broadcast(&spa->spa_suspend_cv);
1202 pio = spa->spa_suspend_zio_root;
1203 spa->spa_suspend_zio_root = NULL;
1204 mutex_exit(&spa->spa_suspend_lock);
1205
1206 if (pio == NULL)
1140 return;
1207 return (0);
1141
1208
1142 while ((zio = pio->io_child) != NULL) {
1143 zio_remove_child(pio, zio);
1144 zio->io_parent = NULL;
1145 zio_reexecute(zio);
1146 }
1147
1148 ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0);
1149
1150 (void) zio_wait(pio);
1209 zio_reexecute(pio);
1210 return (zio_wait(pio));
1151}
1152
1153void
1154zio_resume_wait(spa_t *spa)
1155{
1156 mutex_enter(&spa->spa_suspend_lock);
1157 while (spa_suspended(spa))
1158 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);

--- 90 unchanged lines hidden (view full) ---

1249 * As we rewrite each gang header, the pipeline will compute
1250 * a new gang block header checksum for it; but no one will
1251 * compute a new data checksum, so we do that here. The one
1252 * exception is the gang leader: the pipeline already computed
1253 * its data checksum because that stage precedes gang assembly.
1254 * (Presently, nothing actually uses interior data checksums;
1255 * this is just good hygiene.)
1256 */
1211}
1212
1213void
1214zio_resume_wait(spa_t *spa)
1215{
1216 mutex_enter(&spa->spa_suspend_lock);
1217 while (spa_suspended(spa))
1218 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);

--- 90 unchanged lines hidden (view full) ---

1309 * As we rewrite each gang header, the pipeline will compute
1310 * a new gang block header checksum for it; but no one will
1311 * compute a new data checksum, so we do that here. The one
1312 * exception is the gang leader: the pipeline already computed
1313 * its data checksum because that stage precedes gang assembly.
1314 * (Presently, nothing actually uses interior data checksums;
1315 * this is just good hygiene.)
1316 */
1257 if (gn != pio->io_logical->io_gang_tree) {
1317 if (gn != pio->io_gang_leader->io_gang_tree) {
1258 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1259 data, BP_GET_PSIZE(bp));
1260 }
1261 } else {
1262 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1263 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1264 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1265 }

--- 65 unchanged lines hidden (view full) ---

1331
1332 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1333 zio_gang_tree_free(&gn->gn_child[g]);
1334
1335 zio_gang_node_free(gnpp);
1336}
1337
1338static void
1318 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1319 data, BP_GET_PSIZE(bp));
1320 }
1321 } else {
1322 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1323 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1324 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1325 }

--- 65 unchanged lines hidden (view full) ---

1391
1392 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1393 zio_gang_tree_free(&gn->gn_child[g]);
1394
1395 zio_gang_node_free(gnpp);
1396}
1397
1398static void
1339zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp)
1399zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1340{
1341 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1342
1400{
1401 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1402
1343 ASSERT(lio->io_logical == lio);
1403 ASSERT(gio->io_gang_leader == gio);
1344 ASSERT(BP_IS_GANG(bp));
1345
1404 ASSERT(BP_IS_GANG(bp));
1405
1346 zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh,
1406 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1347 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1407 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1348 lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark));
1408 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1349}
1350
1351static void
1352zio_gang_tree_assemble_done(zio_t *zio)
1353{
1409}
1410
1411static void
1412zio_gang_tree_assemble_done(zio_t *zio)
1413{
1354 zio_t *lio = zio->io_logical;
1414 zio_t *gio = zio->io_gang_leader;
1355 zio_gang_node_t *gn = zio->io_private;
1356 blkptr_t *bp = zio->io_bp;
1357
1415 zio_gang_node_t *gn = zio->io_private;
1416 blkptr_t *bp = zio->io_bp;
1417
1358 ASSERT(zio->io_parent == lio);
1359 ASSERT(zio->io_child == NULL);
1418 ASSERT(gio == zio_unique_parent(zio));
1419 ASSERT(zio_walk_children(zio) == NULL);
1360
1361 if (zio->io_error)
1362 return;
1363
1364 if (BP_SHOULD_BYTESWAP(bp))
1365 byteswap_uint64_array(zio->io_data, zio->io_size);
1366
1367 ASSERT(zio->io_data == gn->gn_gbh);
1368 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1369 ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
1370
1371 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1372 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1373 if (!BP_IS_GANG(gbp))
1374 continue;
1420
1421 if (zio->io_error)
1422 return;
1423
1424 if (BP_SHOULD_BYTESWAP(bp))
1425 byteswap_uint64_array(zio->io_data, zio->io_size);
1426
1427 ASSERT(zio->io_data == gn->gn_gbh);
1428 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1429 ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
1430
1431 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1432 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1433 if (!BP_IS_GANG(gbp))
1434 continue;
1375 zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]);
1435 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1376 }
1377}
1378
1379static void
1380zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1381{
1436 }
1437}
1438
1439static void
1440zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1441{
1382 zio_t *lio = pio->io_logical;
1442 zio_t *gio = pio->io_gang_leader;
1383 zio_t *zio;
1384
1385 ASSERT(BP_IS_GANG(bp) == !!gn);
1443 zio_t *zio;
1444
1445 ASSERT(BP_IS_GANG(bp) == !!gn);
1386 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp));
1387 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree);
1446 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1447 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1388
1389 /*
1390 * If you're a gang header, your data is in gn->gn_gbh.
1391 * If you're a gang member, your data is in 'data' and gn == NULL.
1392 */
1448
1449 /*
1450 * If you're a gang header, your data is in gn->gn_gbh.
1451 * If you're a gang member, your data is in 'data' and gn == NULL.
1452 */
1393 zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data);
1453 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1394
1395 if (gn != NULL) {
1396 ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
1397
1398 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1399 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1400 if (BP_IS_HOLE(gbp))
1401 continue;
1402 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1403 data = (char *)data + BP_GET_PSIZE(gbp);
1404 }
1405 }
1406
1454
1455 if (gn != NULL) {
1456 ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
1457
1458 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1459 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1460 if (BP_IS_HOLE(gbp))
1461 continue;
1462 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1463 data = (char *)data + BP_GET_PSIZE(gbp);
1464 }
1465 }
1466
1407 if (gn == lio->io_gang_tree)
1408 ASSERT3P((char *)lio->io_data + lio->io_size, ==, data);
1467 if (gn == gio->io_gang_tree)
1468 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1409
1410 if (zio != pio)
1411 zio_nowait(zio);
1412}
1413
1414static int
1415zio_gang_assemble(zio_t *zio)
1416{
1417 blkptr_t *bp = zio->io_bp;
1418
1469
1470 if (zio != pio)
1471 zio_nowait(zio);
1472}
1473
1474static int
1475zio_gang_assemble(zio_t *zio)
1476{
1477 blkptr_t *bp = zio->io_bp;
1478
1419 ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical);
1479 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1480 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1420
1481
1482 zio->io_gang_leader = zio;
1483
1421 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1422
1423 return (ZIO_PIPELINE_CONTINUE);
1424}
1425
1426static int
1427zio_gang_issue(zio_t *zio)
1428{
1484 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1485
1486 return (ZIO_PIPELINE_CONTINUE);
1487}
1488
1489static int
1490zio_gang_issue(zio_t *zio)
1491{
1429 zio_t *lio = zio->io_logical;
1430 blkptr_t *bp = zio->io_bp;
1431
1432 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1433 return (ZIO_PIPELINE_STOP);
1434
1492 blkptr_t *bp = zio->io_bp;
1493
1494 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1495 return (ZIO_PIPELINE_STOP);
1496
1435 ASSERT(BP_IS_GANG(bp) && zio == lio);
1497 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1498 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1436
1437 if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1499
1500 if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1438 zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data);
1501 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1439 else
1502 else
1440 zio_gang_tree_free(&lio->io_gang_tree);
1503 zio_gang_tree_free(&zio->io_gang_tree);
1441
1442 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1443
1444 return (ZIO_PIPELINE_CONTINUE);
1445}
1446
1447static void
1448zio_write_gang_member_ready(zio_t *zio)
1449{
1504
1505 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1506
1507 return (ZIO_PIPELINE_CONTINUE);
1508}
1509
1510static void
1511zio_write_gang_member_ready(zio_t *zio)
1512{
1450 zio_t *pio = zio->io_parent;
1451 zio_t *lio = zio->io_logical;
1513 zio_t *pio = zio_unique_parent(zio);
1514 zio_t *gio = zio->io_gang_leader;
1452 dva_t *cdva = zio->io_bp->blk_dva;
1453 dva_t *pdva = pio->io_bp->blk_dva;
1454 uint64_t asize;
1455
1456 if (BP_IS_HOLE(zio->io_bp))
1457 return;
1458
1459 ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1460
1461 ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1515 dva_t *cdva = zio->io_bp->blk_dva;
1516 dva_t *pdva = pio->io_bp->blk_dva;
1517 uint64_t asize;
1518
1519 if (BP_IS_HOLE(zio->io_bp))
1520 return;
1521
1522 ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1523
1524 ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1462 ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas);
1525 ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas);
1463 ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
1464 ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
1465 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1466
1467 mutex_enter(&pio->io_lock);
1468 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1469 ASSERT(DVA_GET_GANG(&pdva[d]));
1470 asize = DVA_GET_ASIZE(&pdva[d]);
1471 asize += DVA_GET_ASIZE(&cdva[d]);
1472 DVA_SET_ASIZE(&pdva[d], asize);
1473 }
1474 mutex_exit(&pio->io_lock);
1475}
1476
1477static int
1478zio_write_gang_block(zio_t *pio)
1479{
1480 spa_t *spa = pio->io_spa;
1481 blkptr_t *bp = pio->io_bp;
1526 ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
1527 ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
1528 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1529
1530 mutex_enter(&pio->io_lock);
1531 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1532 ASSERT(DVA_GET_GANG(&pdva[d]));
1533 asize = DVA_GET_ASIZE(&pdva[d]);
1534 asize += DVA_GET_ASIZE(&cdva[d]);
1535 DVA_SET_ASIZE(&pdva[d], asize);
1536 }
1537 mutex_exit(&pio->io_lock);
1538}
1539
1540static int
1541zio_write_gang_block(zio_t *pio)
1542{
1543 spa_t *spa = pio->io_spa;
1544 blkptr_t *bp = pio->io_bp;
1482 zio_t *lio = pio->io_logical;
1545 zio_t *gio = pio->io_gang_leader;
1483 zio_t *zio;
1484 zio_gang_node_t *gn, **gnpp;
1485 zio_gbh_phys_t *gbh;
1486 uint64_t txg = pio->io_txg;
1487 uint64_t resid = pio->io_size;
1488 uint64_t lsize;
1546 zio_t *zio;
1547 zio_gang_node_t *gn, **gnpp;
1548 zio_gbh_phys_t *gbh;
1549 uint64_t txg = pio->io_txg;
1550 uint64_t resid = pio->io_size;
1551 uint64_t lsize;
1489 int ndvas = lio->io_prop.zp_ndvas;
1552 int ndvas = gio->io_prop.zp_ndvas;
1490 int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
1491 zio_prop_t zp;
1492 int error;
1493
1494 error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
1553 int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
1554 zio_prop_t zp;
1555 int error;
1556
1557 error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
1495 bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp,
1558 bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
1496 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1497 if (error) {
1498 pio->io_error = error;
1499 return (ZIO_PIPELINE_CONTINUE);
1500 }
1501
1559 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1560 if (error) {
1561 pio->io_error = error;
1562 return (ZIO_PIPELINE_CONTINUE);
1563 }
1564
1502 if (pio == lio) {
1503 gnpp = &lio->io_gang_tree;
1565 if (pio == gio) {
1566 gnpp = &gio->io_gang_tree;
1504 } else {
1505 gnpp = pio->io_private;
1506 ASSERT(pio->io_ready == zio_write_gang_member_ready);
1507 }
1508
1509 gn = zio_gang_node_alloc(gnpp);
1510 gbh = gn->gn_gbh;
1511 bzero(gbh, SPA_GANGBLOCKSIZE);

--- 7 unchanged lines hidden (view full) ---

1519 /*
1520 * Create and nowait the gang children.
1521 */
1522 for (int g = 0; resid != 0; resid -= lsize, g++) {
1523 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1524 SPA_MINBLOCKSIZE);
1525 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1526
1567 } else {
1568 gnpp = pio->io_private;
1569 ASSERT(pio->io_ready == zio_write_gang_member_ready);
1570 }
1571
1572 gn = zio_gang_node_alloc(gnpp);
1573 gbh = gn->gn_gbh;
1574 bzero(gbh, SPA_GANGBLOCKSIZE);

--- 7 unchanged lines hidden (view full) ---

1582 /*
1583 * Create and nowait the gang children.
1584 */
1585 for (int g = 0; resid != 0; resid -= lsize, g++) {
1586 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1587 SPA_MINBLOCKSIZE);
1588 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1589
1527 zp.zp_checksum = lio->io_prop.zp_checksum;
1590 zp.zp_checksum = gio->io_prop.zp_checksum;
1528 zp.zp_compress = ZIO_COMPRESS_OFF;
1529 zp.zp_type = DMU_OT_NONE;
1530 zp.zp_level = 0;
1591 zp.zp_compress = ZIO_COMPRESS_OFF;
1592 zp.zp_type = DMU_OT_NONE;
1593 zp.zp_level = 0;
1531 zp.zp_ndvas = lio->io_prop.zp_ndvas;
1594 zp.zp_ndvas = gio->io_prop.zp_ndvas;
1532
1533 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1534 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1535 zio_write_gang_member_ready, NULL, &gn->gn_child[g],
1536 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1537 &pio->io_bookmark));
1538 }
1539

--- 16 unchanged lines hidden (view full) ---

1556static int
1557zio_dva_allocate(zio_t *zio)
1558{
1559 spa_t *spa = zio->io_spa;
1560 metaslab_class_t *mc = spa->spa_normal_class;
1561 blkptr_t *bp = zio->io_bp;
1562 int error;
1563
1595
1596 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1597 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1598 zio_write_gang_member_ready, NULL, &gn->gn_child[g],
1599 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1600 &pio->io_bookmark));
1601 }
1602

--- 16 unchanged lines hidden (view full) ---

1619static int
1620zio_dva_allocate(zio_t *zio)
1621{
1622 spa_t *spa = zio->io_spa;
1623 metaslab_class_t *mc = spa->spa_normal_class;
1624 blkptr_t *bp = zio->io_bp;
1625 int error;
1626
1627 if (zio->io_gang_leader == NULL) {
1628 ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1629 zio->io_gang_leader = zio;
1630 }
1631
1564 ASSERT(BP_IS_HOLE(bp));
1565 ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
1566 ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
1567 ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
1568 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1569
1570 error = metaslab_alloc(spa, mc, zio->io_size, bp,
1571 zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);

--- 115 unchanged lines hidden (view full) ---

1687 metaslab_free(spa, bp, txg, B_FALSE);
1688}
1689
1690/*
1691 * ==========================================================================
1692 * Read and write to physical devices
1693 * ==========================================================================
1694 */
1632 ASSERT(BP_IS_HOLE(bp));
1633 ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
1634 ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
1635 ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
1636 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1637
1638 error = metaslab_alloc(spa, mc, zio->io_size, bp,
1639 zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);

--- 115 unchanged lines hidden (view full) ---

1755 metaslab_free(spa, bp, txg, B_FALSE);
1756}
1757
1758/*
1759 * ==========================================================================
1760 * Read and write to physical devices
1761 * ==========================================================================
1762 */
1695
1696static void
1697zio_vdev_io_probe_done(zio_t *zio)
1698{
1699 zio_t *dio;
1700 vdev_t *vd = zio->io_private;
1701
1702 mutex_enter(&vd->vdev_probe_lock);
1703 ASSERT(vd->vdev_probe_zio == zio);
1704 vd->vdev_probe_zio = NULL;
1705 mutex_exit(&vd->vdev_probe_lock);
1706
1707 while ((dio = zio->io_delegate_list) != NULL) {
1708 zio->io_delegate_list = dio->io_delegate_next;
1709 dio->io_delegate_next = NULL;
1710 if (!vdev_accessible(vd, dio))
1711 dio->io_error = ENXIO;
1712 zio_execute(dio);
1713 }
1714}
1715
1716/*
1717 * Probe the device to determine whether I/O failure is specific to this
1718 * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged).
1719 */
1720static int
1763static int
1721zio_vdev_io_probe(zio_t *zio)
1722{
1723 vdev_t *vd = zio->io_vd;
1724 zio_t *pio = NULL;
1725 boolean_t created_pio = B_FALSE;
1726
1727 /*
1728 * Don't probe the probe.
1729 */
1730 if (zio->io_flags & ZIO_FLAG_PROBE)
1731 return (ZIO_PIPELINE_CONTINUE);
1732
1733 /*
1734 * To prevent 'probe storms' when a device fails, we create
1735 * just one probe i/o at a time. All zios that want to probe
1736 * this vdev will join the probe zio's io_delegate_list.
1737 */
1738 mutex_enter(&vd->vdev_probe_lock);
1739
1740 if ((pio = vd->vdev_probe_zio) == NULL) {
1741 vd->vdev_probe_zio = pio = zio_root(zio->io_spa,
1742 zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL);
1743 created_pio = B_TRUE;
1744 vd->vdev_probe_wanted = B_TRUE;
1745 spa_async_request(zio->io_spa, SPA_ASYNC_PROBE);
1746 }
1747
1748 zio->io_delegate_next = pio->io_delegate_list;
1749 pio->io_delegate_list = zio;
1750
1751 mutex_exit(&vd->vdev_probe_lock);
1752
1753 if (created_pio) {
1754 zio_nowait(vdev_probe(vd, pio));
1755 zio_nowait(pio);
1756 }
1757
1758 return (ZIO_PIPELINE_STOP);
1759}
1760
1761static int
1762zio_vdev_io_start(zio_t *zio)
1763{
1764 vdev_t *vd = zio->io_vd;
1765 uint64_t align;
1766 spa_t *spa = zio->io_spa;
1767
1768 ASSERT(zio->io_error == 0);
1769 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

--- 18 unchanged lines hidden (view full) ---

1788 bcopy(zio->io_data, abuf, zio->io_size);
1789 bzero(abuf + zio->io_size, asize - zio->io_size);
1790 }
1791 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
1792 }
1793
1794 ASSERT(P2PHASE(zio->io_offset, align) == 0);
1795 ASSERT(P2PHASE(zio->io_size, align) == 0);
1764zio_vdev_io_start(zio_t *zio)
1765{
1766 vdev_t *vd = zio->io_vd;
1767 uint64_t align;
1768 spa_t *spa = zio->io_spa;
1769
1770 ASSERT(zio->io_error == 0);
1771 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

--- 18 unchanged lines hidden (view full) ---

1790 bcopy(zio->io_data, abuf, zio->io_size);
1791 bzero(abuf + zio->io_size, asize - zio->io_size);
1792 }
1793 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
1794 }
1795
1796 ASSERT(P2PHASE(zio->io_offset, align) == 0);
1797 ASSERT(P2PHASE(zio->io_size, align) == 0);
1796 ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
1798 ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
1797
1799
1800 /*
1801 * If this is a repair I/O, and there's no self-healing involved --
1802 * that is, we're just resilvering what we expect to resilver --
1803 * then don't do the I/O unless zio's txg is actually in vd's DTL.
1804 * This prevents spurious resilvering with nested replication.
1805 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
1806 * A is out of date, we'll read from C+D, then use the data to
1807 * resilver A+B -- but we don't actually want to resilver B, just A.
1808 * The top-level mirror has no way to know this, so instead we just
1809 * discard unnecessary repairs as we work our way down the vdev tree.
1810 * The same logic applies to any form of nested replication:
1811 * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
1812 */
1813 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
1814 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
1815 zio->io_txg != 0 && /* not a delegated i/o */
1816 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
1817 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
1818 zio_vdev_io_bypass(zio);
1819 return (ZIO_PIPELINE_CONTINUE);
1820 }
1821
1798 if (vd->vdev_ops->vdev_op_leaf &&
1799 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
1800
1801 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
1822 if (vd->vdev_ops->vdev_op_leaf &&
1823 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
1824
1825 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
1802 return (ZIO_PIPELINE_STOP);
1826 return (ZIO_PIPELINE_CONTINUE);
1803
1804 if ((zio = vdev_queue_io(zio)) == NULL)
1805 return (ZIO_PIPELINE_STOP);
1806
1807 if (!vdev_accessible(vd, zio)) {
1808 zio->io_error = ENXIO;
1809 zio_interrupt(zio);
1810 return (ZIO_PIPELINE_STOP);
1811 }
1827
1828 if ((zio = vdev_queue_io(zio)) == NULL)
1829 return (ZIO_PIPELINE_STOP);
1830
1831 if (!vdev_accessible(vd, zio)) {
1832 zio->io_error = ENXIO;
1833 zio_interrupt(zio);
1834 return (ZIO_PIPELINE_STOP);
1835 }
1812
1813 }
1814
1815 return (vd->vdev_ops->vdev_op_io_start(zio));
1816}
1817
1818static int
1819zio_vdev_io_done(zio_t *zio)
1820{

--- 26 unchanged lines hidden (view full) ---

1847 unexpected_error = B_TRUE;
1848 }
1849 }
1850 }
1851
1852 ops->vdev_op_io_done(zio);
1853
1854 if (unexpected_error)
1836 }
1837
1838 return (vd->vdev_ops->vdev_op_io_start(zio));
1839}
1840
1841static int
1842zio_vdev_io_done(zio_t *zio)
1843{

--- 26 unchanged lines hidden (view full) ---

1870 unexpected_error = B_TRUE;
1871 }
1872 }
1873 }
1874
1875 ops->vdev_op_io_done(zio);
1876
1877 if (unexpected_error)
1855 return (zio_vdev_io_probe(zio));
1878 VERIFY(vdev_probe(vd, zio) == NULL);
1856
1857 return (ZIO_PIPELINE_CONTINUE);
1858}
1859
1860static int
1861zio_vdev_io_assess(zio_t *zio)
1862{
1863 vdev_t *vd = zio->io_vd;

--- 179 unchanged lines hidden (view full) ---

2043 * ==========================================================================
2044 * I/O completion
2045 * ==========================================================================
2046 */
2047static int
2048zio_ready(zio_t *zio)
2049{
2050 blkptr_t *bp = zio->io_bp;
1879
1880 return (ZIO_PIPELINE_CONTINUE);
1881}
1882
1883static int
1884zio_vdev_io_assess(zio_t *zio)
1885{
1886 vdev_t *vd = zio->io_vd;

--- 179 unchanged lines hidden (view full) ---

2066 * ==========================================================================
2067 * I/O completion
2068 * ==========================================================================
2069 */
2070static int
2071zio_ready(zio_t *zio)
2072{
2073 blkptr_t *bp = zio->io_bp;
2051 zio_t *pio = zio->io_parent;
2074 zio_t *pio, *pio_next;
2052
2075
2053 if (zio->io_ready) {
2054 if (BP_IS_GANG(bp) &&
2055 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
2056 return (ZIO_PIPELINE_STOP);
2076 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
2077 return (ZIO_PIPELINE_STOP);
2057
2078
2079 if (zio->io_ready) {
2058 ASSERT(IO_IS_ALLOCATING(zio));
2059 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2060 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2061
2062 zio->io_ready(zio);
2063 }
2064
2065 if (bp != NULL && bp != &zio->io_bp_copy)
2066 zio->io_bp_copy = *bp;
2067
2068 if (zio->io_error)
2069 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2070
2080 ASSERT(IO_IS_ALLOCATING(zio));
2081 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2082 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2083
2084 zio->io_ready(zio);
2085 }
2086
2087 if (bp != NULL && bp != &zio->io_bp_copy)
2088 zio->io_bp_copy = *bp;
2089
2090 if (zio->io_error)
2091 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2092
2071 if (pio != NULL)
2093 mutex_enter(&zio->io_lock);
2094 zio->io_state[ZIO_WAIT_READY] = 1;
2095 pio = zio_walk_parents(zio);
2096 mutex_exit(&zio->io_lock);
2097
2098 /*
2099 * As we notify zio's parents, new parents could be added.
2100 * New parents go to the head of zio's io_parent_list, however,
2101 * so we will (correctly) not notify them. The remainder of zio's
2102 * io_parent_list, from 'pio_next' onward, cannot change because
2103 * all parents must wait for us to be done before they can be done.
2104 */
2105 for (; pio != NULL; pio = pio_next) {
2106 pio_next = zio_walk_parents(zio);
2072 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2107 zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2108 }
2073
2074 return (ZIO_PIPELINE_CONTINUE);
2075}
2076
2077static int
2078zio_done(zio_t *zio)
2079{
2080 spa_t *spa = zio->io_spa;
2109
2110 return (ZIO_PIPELINE_CONTINUE);
2111}
2112
2113static int
2114zio_done(zio_t *zio)
2115{
2116 spa_t *spa = zio->io_spa;
2081 zio_t *pio = zio->io_parent;
2082 zio_t *lio = zio->io_logical;
2083 blkptr_t *bp = zio->io_bp;
2084 vdev_t *vd = zio->io_vd;
2085 uint64_t psize = zio->io_size;
2117 zio_t *lio = zio->io_logical;
2118 blkptr_t *bp = zio->io_bp;
2119 vdev_t *vd = zio->io_vd;
2120 uint64_t psize = zio->io_size;
2121 zio_t *pio, *pio_next;
2086
2087 /*
2122
2123 /*
2088 * If our of children haven't all completed,
2124 * If our children haven't all completed,
2089 * wait for them and then repeat this pipeline stage.
2090 */
2091 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
2092 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
2093 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
2094 return (ZIO_PIPELINE_STOP);
2095
2096 for (int c = 0; c < ZIO_CHILD_TYPES; c++)
2097 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
2098 ASSERT(zio->io_children[c][w] == 0);
2099
2100 if (bp != NULL) {
2101 ASSERT(bp->blk_pad[0] == 0);
2102 ASSERT(bp->blk_pad[1] == 0);
2103 ASSERT(bp->blk_pad[2] == 0);
2104 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
2125 * wait for them and then repeat this pipeline stage.
2126 */
2127 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
2128 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
2129 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
2130 return (ZIO_PIPELINE_STOP);
2131
2132 for (int c = 0; c < ZIO_CHILD_TYPES; c++)
2133 for (int w = 0; w < ZIO_WAIT_TYPES; w++)
2134 ASSERT(zio->io_children[c][w] == 0);
2135
2136 if (bp != NULL) {
2137 ASSERT(bp->blk_pad[0] == 0);
2138 ASSERT(bp->blk_pad[1] == 0);
2139 ASSERT(bp->blk_pad[2] == 0);
2140 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
2105 (pio != NULL && bp == pio->io_bp));
2141 (bp == zio_unique_parent(zio)->io_bp));
2106 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
2107 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
2108 ASSERT(!BP_SHOULD_BYTESWAP(bp));
2109 ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
2110 ASSERT(BP_COUNT_GANG(bp) == 0 ||
2111 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
2112 }
2113 }

--- 41 unchanged lines hidden (view full) ---

2155 if (zio->io_error != ENOSPC)
2156 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
2157 else
2158 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2159
2160 if ((zio->io_type == ZIO_TYPE_READ ||
2161 zio->io_type == ZIO_TYPE_FREE) &&
2162 zio->io_error == ENXIO &&
2142 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
2143 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
2144 ASSERT(!BP_SHOULD_BYTESWAP(bp));
2145 ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
2146 ASSERT(BP_COUNT_GANG(bp) == 0 ||
2147 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
2148 }
2149 }

--- 41 unchanged lines hidden (view full) ---

2191 if (zio->io_error != ENOSPC)
2192 zio->io_reexecute |= ZIO_REEXECUTE_NOW;
2193 else
2194 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2195
2196 if ((zio->io_type == ZIO_TYPE_READ ||
2197 zio->io_type == ZIO_TYPE_FREE) &&
2198 zio->io_error == ENXIO &&
2199 spa->spa_load_state == SPA_LOAD_NONE &&
2163 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
2164 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2165
2166 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
2167 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2168 }
2169
2170 /*
2171 * If there were logical child errors, they apply to us now.
2172 * We defer this until now to avoid conflating logical child
2173 * errors with errors that happened to the zio itself when
2174 * updating vdev stats and reporting FMA events above.
2175 */
2176 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
2177
2200 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
2201 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2202
2203 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
2204 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2205 }
2206
2207 /*
2208 * If there were logical child errors, they apply to us now.
2209 * We defer this until now to avoid conflating logical child
2210 * errors with errors that happened to the zio itself when
2211 * updating vdev stats and reporting FMA events above.
2212 */
2213 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
2214
2215 if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) &&
2216 zio->io_child_type == ZIO_CHILD_LOGICAL) {
2217 ASSERT(zio->io_child_type != ZIO_CHILD_GANG);
2218 zio_dva_unallocate(zio, zio->io_gang_tree, bp);
2219 }
2220
2221 zio_gang_tree_free(&zio->io_gang_tree);
2222
2223 /*
2224 * Godfather I/Os should never suspend.
2225 */
2226 if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
2227 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
2228 zio->io_reexecute = 0;
2229
2178 if (zio->io_reexecute) {
2179 /*
2180 * This is a logical I/O that wants to reexecute.
2181 *
2182 * Reexecute is top-down. When an i/o fails, if it's not
2183 * the root, it simply notifies its parent and sticks around.
2184 * The parent, seeing that it still has children in zio_done(),
2185 * does the same. This percolates all the way up to the root.
2186 * The root i/o will reexecute or suspend the entire tree.
2187 *
2188 * This approach ensures that zio_reexecute() honors
2189 * all the original i/o dependency relationships, e.g.
2190 * parents not executing until children are ready.
2191 */
2192 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2193
2230 if (zio->io_reexecute) {
2231 /*
2232 * This is a logical I/O that wants to reexecute.
2233 *
2234 * Reexecute is top-down. When an i/o fails, if it's not
2235 * the root, it simply notifies its parent and sticks around.
2236 * The parent, seeing that it still has children in zio_done(),
2237 * does the same. This percolates all the way up to the root.
2238 * The root i/o will reexecute or suspend the entire tree.
2239 *
2240 * This approach ensures that zio_reexecute() honors
2241 * all the original i/o dependency relationships, e.g.
2242 * parents not executing until children are ready.
2243 */
2244 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2245
2194 if (IO_IS_ALLOCATING(zio))
2195 zio_dva_unallocate(zio, zio->io_gang_tree, bp);
2246 zio->io_gang_leader = NULL;
2196
2247
2197 zio_gang_tree_free(&zio->io_gang_tree);
2248 mutex_enter(&zio->io_lock);
2249 zio->io_state[ZIO_WAIT_DONE] = 1;
2250 mutex_exit(&zio->io_lock);
2198
2251
2199 if (pio != NULL) {
2252 /*
2253 * "The Godfather" I/O monitors its children but is
2254 * not a true parent to them. It will track them through
2255 * the pipeline but severs its ties whenever they get into
2256 * trouble (e.g. suspended). This allows "The Godfather"
2257 * I/O to return status without blocking.
2258 */
2259 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
2260 zio_link_t *zl = zio->io_walk_link;
2261 pio_next = zio_walk_parents(zio);
2262
2263 if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
2264 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
2265 zio_remove_child(pio, zio, zl);
2266 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2267 }
2268 }
2269
2270 if ((pio = zio_unique_parent(zio)) != NULL) {
2200 /*
2201 * We're not a root i/o, so there's nothing to do
2202 * but notify our parent. Don't propagate errors
2203 * upward since we haven't permanently failed yet.
2204 */
2271 /*
2272 * We're not a root i/o, so there's nothing to do
2273 * but notify our parent. Don't propagate errors
2274 * upward since we haven't permanently failed yet.
2275 */
2276 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
2205 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
2206 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2207 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
2208 /*
2209 * We'd fail again if we reexecuted now, so suspend
2210 * until conditions improve (e.g. device comes online).
2211 */
2212 zio_suspend(spa, zio);

--- 4 unchanged lines hidden (view full) ---

2217 */
2218 (void) taskq_dispatch_safe(
2219 spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
2220 (task_func_t *)zio_reexecute, zio, &zio->io_task);
2221 }
2222 return (ZIO_PIPELINE_STOP);
2223 }
2224
2277 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
2278 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2279 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
2280 /*
2281 * We'd fail again if we reexecuted now, so suspend
2282 * until conditions improve (e.g. device comes online).
2283 */
2284 zio_suspend(spa, zio);

--- 4 unchanged lines hidden (view full) ---

2289 */
2290 (void) taskq_dispatch_safe(
2291 spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
2292 (task_func_t *)zio_reexecute, zio, &zio->io_task);
2293 }
2294 return (ZIO_PIPELINE_STOP);
2295 }
2296
2225 ASSERT(zio->io_child == NULL);
2297 ASSERT(zio_walk_children(zio) == NULL);
2226 ASSERT(zio->io_reexecute == 0);
2227 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
2228
2298 ASSERT(zio->io_reexecute == 0);
2299 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
2300
2301 /*
2302 * It is the responsibility of the done callback to ensure that this
2303 * particular zio is no longer discoverable for adoption, and as
2304 * such, cannot acquire any new parents.
2305 */
2229 if (zio->io_done)
2230 zio->io_done(zio);
2231
2306 if (zio->io_done)
2307 zio->io_done(zio);
2308
2232 zio_gang_tree_free(&zio->io_gang_tree);
2309 mutex_enter(&zio->io_lock);
2310 zio->io_state[ZIO_WAIT_DONE] = 1;
2311 mutex_exit(&zio->io_lock);
2233
2312
2234 ASSERT(zio->io_delegate_list == NULL);
2235 ASSERT(zio->io_delegate_next == NULL);
2236
2237 if (pio != NULL) {
2238 zio_remove_child(pio, zio);
2313 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
2314 zio_link_t *zl = zio->io_walk_link;
2315 pio_next = zio_walk_parents(zio);
2316 zio_remove_child(pio, zio, zl);
2239 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2240 }
2241
2242 if (zio->io_waiter != NULL) {
2243 mutex_enter(&zio->io_lock);
2244 zio->io_executor = NULL;
2245 cv_broadcast(&zio->io_cv);
2246 mutex_exit(&zio->io_lock);

--- 30 unchanged lines hidden ---
2317 zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2318 }
2319
2320 if (zio->io_waiter != NULL) {
2321 mutex_enter(&zio->io_lock);
2322 zio->io_executor = NULL;
2323 cv_broadcast(&zio->io_cv);
2324 mutex_exit(&zio->io_lock);

--- 30 unchanged lines hidden ---