zio.c revision 10685:931790026ac6
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW		*/
	0,	/* ZIO_PRIORITY_SYNC_READ	*/
	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
	4,	/* ZIO_PRIORITY_FREE		*/
	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
	10,	/* ZIO_PRIORITY_RESILVER	*/
	20,	/* ZIO_PRIORITY_SCRUB		*/
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"null", "read", "write", "free", "claim", "ioctl" };

#define	SYNC_PASS_DEFERRED_FREE	1	/* defer frees after this pass */
#define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
#define	SYNC_PASS_REWRITE	1	/* rewrite new bps after this pass */

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) \
	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
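	/*
	 * Illustrative note (assuming 4 KB pages): a 2K buffer is aligned
	 * on SPA_MINBLOCKSIZE (512 bytes), a 12K buffer on PAGESIZE, and
	 * a 40K buffer on a quarter of its nearest power of 2 (8K).
	 */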
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (P2PHASE(size, PAGESIZE) == 0) {
			align = PAGESIZE;
		} else if (P2PHASE(size, p2 >> 2) == 0) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);

			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
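	/*
	 * Round the request up to the next SPA_MINBLOCKSIZE multiple and
	 * index the matching cache; sizes without a dedicated cache use
	 * the next larger one, as arranged by the fill-in loop in zio_init().
	 */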
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the amount
 * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 * of kernel heap dumped to disk when the kernel panics)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
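/*
 * zio_push_transform() records the zio's current data buffer, size, and an
 * optional callback, then points the zio at the new buffer.  When the
 * transforms are popped, each callback is invoked in LIFO order, the pushed
 * buffer is freed, and the original data and size are restored.
 */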
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
	zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, zio->io_size, data, size) != 0)
		zio->io_error = EIO;
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
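		/*
		 * Back up one stage so that this stage re-executes once
		 * the last outstanding child completes and
		 * zio_notify_parent() clears the stall.
		 */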
		zio->io_stage--;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);
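	/*
	 * If this was the last child the parent was stalled on, clear the
	 * stall and resume the parent's pipeline in this thread.
	 */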
	if (--*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset,
    const zbookmark_t *zb, uint8_t stage, uint32_t pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_data = data;
	zio->io_size = size;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, int flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    int priority, int flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);

	return (zio);
}

void
zio_skip_write(zio_t *zio)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_stage == ZIO_STAGE_READY);
	ASSERT(!BP_IS_GANG(zio->io_bp));

	zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, int flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    zp->zp_type < DMU_OT_NUMTYPES &&
	    zp->zp_level < 32 &&
	    zp->zp_ndvas > 0 &&
	    zp->zp_ndvas <= spa_max_replication(spa));
	ASSERT(ready != NULL);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_prop = *zp;

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    int flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

zio_t *
zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, int flags)
{
	zio_t *zio;

	ASSERT(!BP_IS_HOLE(bp));

	if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
		return (zio_null(pio, spa, NULL, NULL, NULL, flags));

	if (txg == spa->spa_syncing_txg &&
	    spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
		return (zio_null(pio, spa, NULL, NULL, NULL, flags));
	}

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, int flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT3U(spa_first_txg(spa), <=, txg);

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
	void *data, uint64_t size, int type, int priority, int flags,
	zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority,
	    (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags,
	    vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
	int type, int priority, int flags, zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t csize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(csize);

		zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
	}

	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t *zio)
{
	zio_prop_t *zp = &zio->io_prop;
	int compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(compress != ZIO_COMPRESS_INHERIT);

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);

		if (pass > SYNC_PASS_DONT_COMPRESS)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
		    spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize)) {
			compress = ZIO_COMPRESS_OFF;
		} else if (csize != 0) {
			zio_push_transform(zio, cbuf, csize, cbufsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > SYNC_PASS_REWRITE) {
		ASSERT(csize != 0);
		uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (csize == 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, csize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
{
	zio_type_t t = zio->io_type;

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	(void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
}

static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
			return (B_TRUE);

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		uint32_t pipeline = zio->io_pipeline;
		zio_stage_t stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));

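		/*
		 * Advance to the next stage whose bit is set in the pipeline
		 * mask, skipping stages this I/O does not need.
		 */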
1016		while (((1U << ++stage) & pipeline) == 0)
1017			continue;
1018
1019		ASSERT(stage <= ZIO_STAGE_DONE);
1020		ASSERT(zio->io_stall == NULL);
1021
1022		/*
1023		 * If we are in interrupt context and this pipeline stage
1024		 * will grab a config lock that is held across I/O,
1025		 * issue async to avoid deadlock.
1026		 */
1027		if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) &&
1028		    zio->io_vd == NULL &&
1029		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1030			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
1031			return;
1032		}
1033
1034		zio->io_stage = stage;
1035		rv = zio_pipeline[stage](zio);
1036
1037		if (rv == ZIO_PIPELINE_STOP)
1038			return;
1039
1040		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1041	}
1042}
1043
1044/*
1045 * ==========================================================================
1046 * Initiate I/O, either sync or async
1047 * ==========================================================================
1048 */
1049int
1050zio_wait(zio_t *zio)
1051{
1052	int error;
1053
1054	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1055	ASSERT(zio->io_executor == NULL);
1056
1057	zio->io_waiter = curthread;
1058
1059	zio_execute(zio);
1060
1061	mutex_enter(&zio->io_lock);
1062	while (zio->io_executor != NULL)
1063		cv_wait(&zio->io_cv, &zio->io_lock);
1064	mutex_exit(&zio->io_lock);
1065
1066	error = zio->io_error;
1067	zio_destroy(zio);
1068
1069	return (error);
1070}
1071
1072void
1073zio_nowait(zio_t *zio)
1074{
1075	ASSERT(zio->io_executor == NULL);
1076
1077	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1078	    zio_unique_parent(zio) == NULL) {
1079		/*
1080		 * This is a logical async I/O with no parent to wait for it.
1081		 * We add it to the spa_async_root_zio "Godfather" I/O which
1082		 * will ensure they complete prior to unloading the pool.
1083		 */
1084		spa_t *spa = zio->io_spa;
1085
1086		zio_add_child(spa->spa_async_zio_root, zio);
1087	}
1088
1089	zio_execute(zio);
1090}
1091
1092/*
1093 * ==========================================================================
1094 * Reexecute or suspend/resume failed I/O
1095 * ==========================================================================
1096 */
1097
1098static void
1099zio_reexecute(zio_t *pio)
1100{
1101	zio_t *cio, *cio_next;
1102
1103	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1104	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1105	ASSERT(pio->io_gang_leader == NULL);
1106	ASSERT(pio->io_gang_tree == NULL);
1107
1108	pio->io_flags = pio->io_orig_flags;
1109	pio->io_stage = pio->io_orig_stage;
1110	pio->io_pipeline = pio->io_orig_pipeline;
1111	pio->io_reexecute = 0;
1112	pio->io_error = 0;
1113	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1114		pio->io_state[w] = 0;
1115	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1116		pio->io_child_error[c] = 0;
1117
1118	if (IO_IS_ALLOCATING(pio)) {
1119		/*
1120		 * Remember the failed bp so that the io_ready() callback
1121		 * can update its accounting upon reexecution.  The block
1122		 * was already freed in zio_done(); we indicate this with
1123		 * a fill count of -1 so that zio_free() knows to skip it.
1124		 */
1125		blkptr_t *bp = pio->io_bp;
1126		ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
1127		bp->blk_fill = BLK_FILL_ALREADY_FREED;
1128		pio->io_bp_orig = *bp;
1129		BP_ZERO(bp);
1130	}
1131
1132	/*
1133	 * As we reexecute pio's children, new children could be created.
1134	 * New children go to the head of pio's io_child_list, however,
1135	 * so we will (correctly) not reexecute them.  The key is that
1136	 * the remainder of pio's io_child_list, from 'cio_next' onward,
1137	 * cannot be affected by any side effects of reexecuting 'cio'.
1138	 */
1139	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1140		cio_next = zio_walk_children(pio);
1141		mutex_enter(&pio->io_lock);
1142		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1143			pio->io_children[cio->io_child_type][w]++;
1144		mutex_exit(&pio->io_lock);
1145		zio_reexecute(cio);
1146	}
1147
1148	/*
1149	 * Now that all children have been reexecuted, execute the parent.
1150	 * We don't reexecute "The Godfather" I/O here as it's the
1151	 * responsibility of the caller to wait on him.
1152	 */
1153	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1154		zio_execute(pio);
1155}
1156
1157void
1158zio_suspend(spa_t *spa, zio_t *zio)
1159{
1160	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1161		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1162		    "failure and the failure mode property for this pool "
1163		    "is set to panic.", spa_name(spa));
1164
1165	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1166
1167	mutex_enter(&spa->spa_suspend_lock);
1168
1169	if (spa->spa_suspend_zio_root == NULL)
1170		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1171		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1172		    ZIO_FLAG_GODFATHER);
1173
1174	spa->spa_suspended = B_TRUE;
1175
1176	if (zio != NULL) {
1177		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1178		ASSERT(zio != spa->spa_suspend_zio_root);
1179		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1180		ASSERT(zio_unique_parent(zio) == NULL);
1181		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1182		zio_add_child(spa->spa_suspend_zio_root, zio);
1183	}
1184
1185	mutex_exit(&spa->spa_suspend_lock);
1186}
1187
1188int
1189zio_resume(spa_t *spa)
1190{
1191	zio_t *pio;
1192
1193	/*
1194	 * Reexecute all previously suspended i/o.
1195	 */
1196	mutex_enter(&spa->spa_suspend_lock);
1197	spa->spa_suspended = B_FALSE;
1198	cv_broadcast(&spa->spa_suspend_cv);
1199	pio = spa->spa_suspend_zio_root;
1200	spa->spa_suspend_zio_root = NULL;
1201	mutex_exit(&spa->spa_suspend_lock);
1202
1203	if (pio == NULL)
1204		return (0);
1205
1206	zio_reexecute(pio);
1207	return (zio_wait(pio));
1208}
1209
1210void
1211zio_resume_wait(spa_t *spa)
1212{
1213	mutex_enter(&spa->spa_suspend_lock);
1214	while (spa_suspended(spa))
1215		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1216	mutex_exit(&spa->spa_suspend_lock);
1217}
1218
1219/*
1220 * ==========================================================================
1221 * Gang blocks.
1222 *
1223 * A gang block is a collection of small blocks that looks to the DMU
1224 * like one large block.  When zio_dva_allocate() cannot find a block
1225 * of the requested size, due to either severe fragmentation or the pool
1226 * being nearly full, it calls zio_write_gang_block() to construct the
1227 * block from smaller fragments.
1228 *
1229 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1230 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1231 * an indirect block: it's an array of block pointers.  It consumes
1232 * only one sector and hence is allocatable regardless of fragmentation.
1233 * The gang header's bps point to its gang members, which hold the data.
1234 *
1235 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1236 * as the verifier to ensure uniqueness of the SHA256 checksum.
1237 * Critically, the gang block bp's blk_cksum is the checksum of the data,
1238 * not the gang header.  This ensures that data block signatures (needed for
1239 * deduplication) are independent of how the block is physically stored.
1240 *
1241 * Gang blocks can be nested: a gang member may itself be a gang block.
1242 * Thus every gang block is a tree in which root and all interior nodes are
1243 * gang headers, and the leaves are normal blocks that contain user data.
1244 * The root of the gang tree is called the gang leader.
1245 *
1246 * To perform any operation (read, rewrite, free, claim) on a gang block,
1247 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1248 * in the io_gang_tree field of the original logical i/o by recursively
1249 * reading the gang leader and all gang headers below it.  This yields
1250 * an in-core tree containing the contents of every gang header and the
1251 * bps for every constituent of the gang block.
1252 *
1253 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1254 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1255 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1256 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1257 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1258 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1259 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1260 * of the gang header plus zio_checksum_compute() of the data to update the
1261 * gang header's blk_cksum as described above.
1262 *
1263 * The two-phase assemble/issue model solves the problem of partial failure --
1264 * what if you'd freed part of a gang block but then couldn't read the
1265 * gang header for another part?  Assembling the entire gang tree first
1266 * ensures that all the necessary gang header I/O has succeeded before
1267 * starting the actual work of free, claim, or write.  Once the gang tree
1268 * is assembled, free and claim are in-memory operations that cannot fail.
1269 *
1270 * In the event that a gang write fails, zio_dva_unallocate() walks the
1271 * gang tree to immediately free (i.e. insert back into the space map)
1272 * everything we've allocated.  This ensures that we don't get ENOSPC
1273 * errors during repeated suspend/resume cycles due to a flaky device.
1274 *
1275 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1276 * the gang tree, we won't modify the block, so we can safely defer the free
1277 * (knowing that the block is still intact).  If we *can* assemble the gang
1278 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1279 * each constituent bp and we can allocate a new block on the next sync pass.
1280 *
1281 * In all cases, the gang tree allows complete recovery from partial failure.
1282 * ==========================================================================
1283 */
1284
1285static zio_t *
1286zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1287{
1288	if (gn != NULL)
1289		return (pio);
1290
1291	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1292	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1293	    &pio->io_bookmark));
1294}
1295
1296zio_t *
1297zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1298{
1299	zio_t *zio;
1300
1301	if (gn != NULL) {
1302		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1303		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1304		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1305		/*
1306		 * As we rewrite each gang header, the pipeline will compute
1307		 * a new gang block header checksum for it; but no one will
1308		 * compute a new data checksum, so we do that here.  The one
1309		 * exception is the gang leader: the pipeline already computed
1310		 * its data checksum because that stage precedes gang assembly.
1311		 * (Presently, nothing actually uses interior data checksums;
1312		 * this is just good hygiene.)
1313		 */
1314		if (gn != pio->io_gang_leader->io_gang_tree) {
1315			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1316			    data, BP_GET_PSIZE(bp));
1317		}
1318	} else {
1319		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1320		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1321		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1322	}
1323
1324	return (zio);
1325}
1326
1327/* ARGSUSED */
1328zio_t *
1329zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1330{
1331	return (zio_free(pio, pio->io_spa, pio->io_txg, bp,
1332	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1333}
1334
1335/* ARGSUSED */
1336zio_t *
1337zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1338{
1339	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1340	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1341}
1342
1343static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1344	NULL,
1345	zio_read_gang,
1346	zio_rewrite_gang,
1347	zio_free_gang,
1348	zio_claim_gang,
1349	NULL
1350};
1351
1352static void zio_gang_tree_assemble_done(zio_t *zio);
1353
1354static zio_gang_node_t *
1355zio_gang_node_alloc(zio_gang_node_t **gnpp)
1356{
1357	zio_gang_node_t *gn;
1358
1359	ASSERT(*gnpp == NULL);
1360
1361	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1362	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1363	*gnpp = gn;
1364
1365	return (gn);
1366}
1367
1368static void
1369zio_gang_node_free(zio_gang_node_t **gnpp)
1370{
1371	zio_gang_node_t *gn = *gnpp;
1372
1373	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1374		ASSERT(gn->gn_child[g] == NULL);
1375
1376	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1377	kmem_free(gn, sizeof (*gn));
1378	*gnpp = NULL;
1379}
1380
1381static void
1382zio_gang_tree_free(zio_gang_node_t **gnpp)
1383{
1384	zio_gang_node_t *gn = *gnpp;
1385
1386	if (gn == NULL)
1387		return;
1388
1389	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1390		zio_gang_tree_free(&gn->gn_child[g]);
1391
1392	zio_gang_node_free(gnpp);
1393}
1394
1395static void
1396zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1397{
1398	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1399
1400	ASSERT(gio->io_gang_leader == gio);
1401	ASSERT(BP_IS_GANG(bp));
1402
1403	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1404	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1405	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1406}
1407
1408static void
1409zio_gang_tree_assemble_done(zio_t *zio)
1410{
1411	zio_t *gio = zio->io_gang_leader;
1412	zio_gang_node_t *gn = zio->io_private;
1413	blkptr_t *bp = zio->io_bp;
1414
1415	ASSERT(gio == zio_unique_parent(zio));
1416	ASSERT(zio_walk_children(zio) == NULL);
1417
1418	if (zio->io_error)
1419		return;
1420
1421	if (BP_SHOULD_BYTESWAP(bp))
1422		byteswap_uint64_array(zio->io_data, zio->io_size);
1423
1424	ASSERT(zio->io_data == gn->gn_gbh);
1425	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1426	ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
1427
1428	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1429		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1430		if (!BP_IS_GANG(gbp))
1431			continue;
1432		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1433	}
1434}
1435
1436static void
1437zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1438{
1439	zio_t *gio = pio->io_gang_leader;
1440	zio_t *zio;
1441
1442	ASSERT(BP_IS_GANG(bp) == !!gn);
1443	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1444	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1445
1446	/*
1447	 * If you're a gang header, your data is in gn->gn_gbh.
1448	 * If you're a gang member, your data is in 'data' and gn == NULL.
1449	 */
1450	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1451
1452	if (gn != NULL) {
1453		ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
1454
1455		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1456			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1457			if (BP_IS_HOLE(gbp))
1458				continue;
1459			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1460			data = (char *)data + BP_GET_PSIZE(gbp);
1461		}
1462	}
1463
1464	if (gn == gio->io_gang_tree)
1465		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1466
1467	if (zio != pio)
1468		zio_nowait(zio);
1469}
1470
1471static int
1472zio_gang_assemble(zio_t *zio)
1473{
1474	blkptr_t *bp = zio->io_bp;
1475
1476	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1477	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1478
1479	zio->io_gang_leader = zio;
1480
1481	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1482
1483	return (ZIO_PIPELINE_CONTINUE);
1484}
1485
1486static int
1487zio_gang_issue(zio_t *zio)
1488{
1489	blkptr_t *bp = zio->io_bp;
1490
1491	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1492		return (ZIO_PIPELINE_STOP);
1493
1494	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1495	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1496
1497	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1498		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1499	else
1500		zio_gang_tree_free(&zio->io_gang_tree);
1501
1502	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1503
1504	return (ZIO_PIPELINE_CONTINUE);
1505}
1506
1507static void
1508zio_write_gang_member_ready(zio_t *zio)
1509{
1510	zio_t *pio = zio_unique_parent(zio);
1511	zio_t *gio = zio->io_gang_leader;
1512	dva_t *cdva = zio->io_bp->blk_dva;
1513	dva_t *pdva = pio->io_bp->blk_dva;
1514	uint64_t asize;
1515
1516	if (BP_IS_HOLE(zio->io_bp))
1517		return;
1518
1519	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1520
1521	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1522	ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas);
1523	ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
1524	ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
1525	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1526
1527	mutex_enter(&pio->io_lock);
1528	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1529		ASSERT(DVA_GET_GANG(&pdva[d]));
1530		asize = DVA_GET_ASIZE(&pdva[d]);
1531		asize += DVA_GET_ASIZE(&cdva[d]);
1532		DVA_SET_ASIZE(&pdva[d], asize);
1533	}
1534	mutex_exit(&pio->io_lock);
1535}
1536
1537static int
1538zio_write_gang_block(zio_t *pio)
1539{
1540	spa_t *spa = pio->io_spa;
1541	blkptr_t *bp = pio->io_bp;
1542	zio_t *gio = pio->io_gang_leader;
1543	zio_t *zio;
1544	zio_gang_node_t *gn, **gnpp;
1545	zio_gbh_phys_t *gbh;
1546	uint64_t txg = pio->io_txg;
1547	uint64_t resid = pio->io_size;
1548	uint64_t lsize;
1549	int ndvas = gio->io_prop.zp_ndvas;
1550	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
1551	zio_prop_t zp;
1552	int error;
1553
1554	error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
1555	    bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
1556	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1557	if (error) {
1558		pio->io_error = error;
1559		return (ZIO_PIPELINE_CONTINUE);
1560	}
1561
1562	if (pio == gio) {
1563		gnpp = &gio->io_gang_tree;
1564	} else {
1565		gnpp = pio->io_private;
1566		ASSERT(pio->io_ready == zio_write_gang_member_ready);
1567	}
1568
1569	gn = zio_gang_node_alloc(gnpp);
1570	gbh = gn->gn_gbh;
1571	bzero(gbh, SPA_GANGBLOCKSIZE);
1572
1573	/*
1574	 * Create the gang header.
1575	 */
1576	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1577	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1578
1579	/*
1580	 * Create and nowait the gang children.
1581	 */
1582	for (int g = 0; resid != 0; resid -= lsize, g++) {
1583		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1584		    SPA_MINBLOCKSIZE);
1585		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1586
1587		zp.zp_checksum = gio->io_prop.zp_checksum;
1588		zp.zp_compress = ZIO_COMPRESS_OFF;
1589		zp.zp_type = DMU_OT_NONE;
1590		zp.zp_level = 0;
1591		zp.zp_ndvas = gio->io_prop.zp_ndvas;
1592
1593		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1594		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1595		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
1596		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1597		    &pio->io_bookmark));
1598	}
1599
1600	/*
1601	 * Set pio's pipeline to just wait for zio to finish.
1602	 */
1603	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1604
1605	zio_nowait(zio);
1606
1607	return (ZIO_PIPELINE_CONTINUE);
1608}
1609
1610/*
1611 * ==========================================================================
1612 * Allocate and free blocks
1613 * ==========================================================================
1614 */
1615
1616static int
1617zio_dva_allocate(zio_t *zio)
1618{
1619	spa_t *spa = zio->io_spa;
1620	metaslab_class_t *mc = spa->spa_normal_class;
1621	blkptr_t *bp = zio->io_bp;
1622	int error;
1623
1624	if (zio->io_gang_leader == NULL) {
1625		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1626		zio->io_gang_leader = zio;
1627	}
1628
1629	ASSERT(BP_IS_HOLE(bp));
1630	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
1631	ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
1632	ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
1633	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1634
1635	error = metaslab_alloc(spa, mc, zio->io_size, bp,
1636	    zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);
1637
1638	if (error) {
1639		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
1640			return (zio_write_gang_block(zio));
1641		zio->io_error = error;
1642	}
1643
1644	return (ZIO_PIPELINE_CONTINUE);
1645}
1646
1647static int
1648zio_dva_free(zio_t *zio)
1649{
1650	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
1651
1652	return (ZIO_PIPELINE_CONTINUE);
1653}
1654
1655static int
1656zio_dva_claim(zio_t *zio)
1657{
1658	int error;
1659
1660	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
1661	if (error)
1662		zio->io_error = error;
1663
1664	return (ZIO_PIPELINE_CONTINUE);
1665}
1666
1667/*
1668 * Undo an allocation.  This is used by zio_done() when an I/O fails
1669 * and we want to give back the block we just allocated.
1670 * This handles both normal blocks and gang blocks.
1671 */
1672static void
1673zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
1674{
1675	spa_t *spa = zio->io_spa;
1676	boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);
1677
1678	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
1679
1680	if (zio->io_bp == bp && !now) {
1681		/*
1682		 * This is a rewrite for sync-to-convergence.
1683		 * We can't do a metaslab_free(NOW) because bp wasn't allocated
1684		 * during this sync pass, which means that metaslab_sync()
1685		 * already committed the allocation.
1686		 */
1687		ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
1688		    BP_IDENTITY(&zio->io_bp_orig)));
1689		ASSERT(spa_sync_pass(spa) > 1);
1690
1691		if (BP_IS_GANG(bp) && gn == NULL) {
1692			/*
1693			 * This is a gang leader whose gang header(s) we
1694			 * couldn't read now, so defer the free until later.
1695			 * The block should still be intact because without
1696			 * the headers, we'd never even start the rewrite.
1697			 */
1698			bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
1699			return;
1700		}
1701	}
1702
1703	if (!BP_IS_HOLE(bp))
1704		metaslab_free(spa, bp, bp->blk_birth, now);
1705
1706	if (gn != NULL) {
1707		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1708			zio_dva_unallocate(zio, gn->gn_child[g],
1709			    &gn->gn_gbh->zg_blkptr[g]);
1710		}
1711	}
1712}
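
/*
 * Illustrative trace (one common path, not an exhaustive description):
 * a brand-new gang write that fails has ZIO_FLAG_IO_REWRITE clear, so
 * now == B_TRUE and the header's own DVAs are unwound immediately with
 * metaslab_free(..., B_TRUE); the recursion then does the same for each
 * of the SPA_GBH_NBLKPTRS children, skipping any that were never
 * allocated (still holes).  The now == B_FALSE rewrite case is handled
 * separately above because those blocks were committed by an earlier
 * sync pass.
 */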
1713
1714/*
1715 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
1716 */
1717int
1718zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
1719    uint64_t txg, boolean_t bypass_slog)
1720{
1721	int error = 1;
1722
1723	if (!bypass_slog)
1724		error = metaslab_alloc(spa, spa->spa_log_class, size,
1725		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
1726
1727	if (error)
1728		error = metaslab_alloc(spa, spa->spa_normal_class, size,
1729		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
1730
1731	if (error == 0) {
1732		BP_SET_LSIZE(new_bp, size);
1733		BP_SET_PSIZE(new_bp, size);
1734		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
1735		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
1736		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
1737		BP_SET_LEVEL(new_bp, 0);
1738		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
1739	}
1740
1741	return (error);
1742}
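
/*
 * A minimal usage sketch (hypothetical caller, patterned after the way
 * the ZIL allocates its chain of log blocks; the variable names are
 * illustrative, not an existing interface):
 *
 *	blkptr_t new_blk;
 *
 *	BP_ZERO(&new_blk);
 *	error = zio_alloc_blk(spa, lwb_size, &new_blk, &prev_blk, txg,
 *	    B_FALSE);
 *	if (error != 0)
 *		(fall back, e.g. force the data out with txg_wait_synced())
 *
 * Passing bypass_slog == B_TRUE skips the log class entirely; otherwise
 * the slog is tried first and the normal class is the fallback.
 */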
1743
1744/*
1745 * Free an intent log block.  We know it can't be a gang block, so there's
1746 * nothing to do except metaslab_free() it.
1747 */
1748void
1749zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
1750{
1751	ASSERT(!BP_IS_GANG(bp));
1752
1753	metaslab_free(spa, bp, txg, B_FALSE);
1754}
1755
1756/*
1757 * ==========================================================================
1758 * Read and write to physical devices
1759 * ==========================================================================
1760 */
1761static int
1762zio_vdev_io_start(zio_t *zio)
1763{
1764	vdev_t *vd = zio->io_vd;
1765	uint64_t align;
1766	spa_t *spa = zio->io_spa;
1767
1768	ASSERT(zio->io_error == 0);
1769	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
1770
1771	if (vd == NULL) {
1772		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
1773			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
1774
1775		/*
1776		 * The mirror_ops handle multiple DVAs in a single BP.
1777		 */
1778		return (vdev_mirror_ops.vdev_op_io_start(zio));
1779	}
1780
1781	align = 1ULL << vd->vdev_top->vdev_ashift;
1782
1783	if (P2PHASE(zio->io_size, align) != 0) {
1784		uint64_t asize = P2ROUNDUP(zio->io_size, align);
1785		char *abuf = zio_buf_alloc(asize);
1786		ASSERT(vd == vd->vdev_top);
1787		if (zio->io_type == ZIO_TYPE_WRITE) {
1788			bcopy(zio->io_data, abuf, zio->io_size);
1789			bzero(abuf + zio->io_size, asize - zio->io_size);
1790		}
1791		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
1792	}
1793
1794	ASSERT(P2PHASE(zio->io_offset, align) == 0);
1795	ASSERT(P2PHASE(zio->io_size, align) == 0);
1796	ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
1797
1798	/*
1799	 * If this is a repair I/O, and there's no self-healing involved --
1800	 * that is, we're just resilvering what we expect to resilver --
1801	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
1802	 * This prevents spurious resilvering with nested replication.
1803	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
1804	 * A is out of date, we'll read from C+D, then use the data to
1805	 * resilver A+B -- but we don't actually want to resilver B, just A.
1806	 * The top-level mirror has no way to know this, so instead we just
1807	 * discard unnecessary repairs as we work our way down the vdev tree.
1808	 * The same logic applies to any form of nested replication:
1809	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
1810	 */
1811	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
1812	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
1813	    zio->io_txg != 0 &&	/* not a delegated i/o */
1814	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
1815		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
1816		zio_vdev_io_bypass(zio);
1817		return (ZIO_PIPELINE_CONTINUE);
1818	}
1819
1820	if (vd->vdev_ops->vdev_op_leaf &&
1821	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
1822
1823		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
1824			return (ZIO_PIPELINE_CONTINUE);
1825
1826		if ((zio = vdev_queue_io(zio)) == NULL)
1827			return (ZIO_PIPELINE_STOP);
1828
1829		if (!vdev_accessible(vd, zio)) {
1830			zio->io_error = ENXIO;
1831			zio_interrupt(zio);
1832			return (ZIO_PIPELINE_STOP);
1833		}
1834	}
1835
1836	return (vd->vdev_ops->vdev_op_io_start(zio));
1837}
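
/*
 * Worked example of the alignment handling above (illustrative numbers;
 * assume a top-level vdev with vdev_ashift == 12, i.e. 4K sectors): a
 * 1536-byte write has P2PHASE(1536, 4096) != 0, so a 4096-byte bounce
 * buffer is allocated, the 1536 bytes of payload are copied in, the
 * remaining 2560 bytes are zeroed, and the device sees one aligned 4K
 * write.  For reads, the same transform is pushed here and then popped
 * in zio_pop_transforms(), so the caller still sees only its original
 * 1536 bytes.
 */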
1838
1839static int
1840zio_vdev_io_done(zio_t *zio)
1841{
1842	vdev_t *vd = zio->io_vd;
1843	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
1844	boolean_t unexpected_error = B_FALSE;
1845
1846	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
1847		return (ZIO_PIPELINE_STOP);
1848
1849	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
1850
1851	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
1852
1853		vdev_queue_io_done(zio);
1854
1855		if (zio->io_type == ZIO_TYPE_WRITE)
1856			vdev_cache_write(zio);
1857
1858		if (zio_injection_enabled && zio->io_error == 0)
1859			zio->io_error = zio_handle_device_injection(vd,
1860			    zio, EIO);
1861
1862		if (zio_injection_enabled && zio->io_error == 0)
1863			zio->io_error = zio_handle_label_injection(zio, EIO);
1864
1865		if (zio->io_error) {
1866			if (!vdev_accessible(vd, zio)) {
1867				zio->io_error = ENXIO;
1868			} else {
1869				unexpected_error = B_TRUE;
1870			}
1871		}
1872	}
1873
1874	ops->vdev_op_io_done(zio);
1875
1876	if (unexpected_error)
1877		VERIFY(vdev_probe(vd, zio) == NULL);
1878
1879	return (ZIO_PIPELINE_CONTINUE);
1880}
1881
1882/*
1883 * For non-raidz ZIOs, we can just copy aside the bad data read from the
1884 * disk, and use that to finish the checksum ereport later.
1885 */
1886static void
1887zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
1888    const void *good_buf)
1889{
1890	/* no processing needed */
1891	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
1892}
1893
1894/*ARGSUSED*/
1895void
1896zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
1897{
1898	void *buf = zio_buf_alloc(zio->io_size);
1899
1900	bcopy(zio->io_data, buf, zio->io_size);
1901
1902	zcr->zcr_cbinfo = zio->io_size;
1903	zcr->zcr_cbdata = buf;
1904	zcr->zcr_finish = zio_vsd_default_cksum_finish;
1905	zcr->zcr_free = zio_buf_free;
1906}
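
/*
 * How the default report above plays out (a sketch of the flow visible
 * in this file, not of every consumer): when a checksum ereport is
 * started for this zio, the routine above stashes a private copy of the
 * suspect data in zcr_cbdata.  Once the zio completes, zio_done() walks
 * io_cksum_report and finishes each entry:
 *
 *	rpt->zcr_finish(rpt, (zio->io_error == 0) ? zio->io_data : NULL);
 *
 * which for these defaults lands in zio_vsd_default_cksum_finish() and
 * hands the good copy (if the I/O ultimately succeeded) plus the saved
 * bad copy to zfs_ereport_finish_checksum().  A vdev with more context,
 * such as RAID-Z, can install its own io_vsd_ops and refine that work.
 */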
1907
1908static int
1909zio_vdev_io_assess(zio_t *zio)
1910{
1911	vdev_t *vd = zio->io_vd;
1912
1913	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
1914		return (ZIO_PIPELINE_STOP);
1915
1916	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
1917		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
1918
1919	if (zio->io_vsd != NULL) {
1920		zio->io_vsd_ops->vsd_free(zio);
1921		zio->io_vsd = NULL;
1922	}
1923
1924	if (zio_injection_enabled && zio->io_error == 0)
1925		zio->io_error = zio_handle_fault_injection(zio, EIO);
1926
1927	/*
1928	 * If the I/O failed, determine whether we should attempt to retry it.
1929	 */
1930	if (zio->io_error && vd == NULL &&
1931	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
1932		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
1933		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
1934		zio->io_error = 0;
1935		zio->io_flags |= ZIO_FLAG_IO_RETRY |
1936		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
1937		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
1938		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
1939		return (ZIO_PIPELINE_STOP);
1940	}
1941
1942	/*
1943	 * If we got an error on a leaf device, convert it to ENXIO
1944	 * if the device is not accessible at all.
1945	 */
1946	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
1947	    !vdev_accessible(vd, zio))
1948		zio->io_error = ENXIO;
1949
1950	/*
1951	 * If we can't write to an interior vdev (mirror or RAID-Z),
1952	 * set vdev_cant_write so that we stop trying to allocate from it.
1953	 */
1954	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
1955	    vd != NULL && !vd->vdev_ops->vdev_op_leaf)
1956		vd->vdev_cant_write = B_TRUE;
1957
1958	if (zio->io_error)
1959		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1960
1961	return (ZIO_PIPELINE_CONTINUE);
1962}
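
/*
 * Illustrative retry walk-through (one possible path, not the only one):
 * a logical read (vd == NULL) whose children all failed arrives here
 * with io_error != 0.  Neither ZIO_FLAG_DONT_RETRY nor ZIO_FLAG_IO_RETRY
 * is set, so the error is cleared, IO_RETRY | DONT_CACHE |
 * DONT_AGGREGATE are set, io_stage is rewound to just before
 * VDEV_IO_START, and the zio is handed back to the issue taskq for one
 * more pass through the vdev stages.  If that pass fails too, the
 * IO_RETRY flag keeps us from looping and the error propagates normally.
 */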
1963
1964void
1965zio_vdev_io_reissue(zio_t *zio)
1966{
1967	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1968	ASSERT(zio->io_error == 0);
1969
1970	zio->io_stage--;
1971}
1972
1973void
1974zio_vdev_io_redone(zio_t *zio)
1975{
1976	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
1977
1978	zio->io_stage--;
1979}
1980
1981void
1982zio_vdev_io_bypass(zio_t *zio)
1983{
1984	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1985	ASSERT(zio->io_error == 0);
1986
1987	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
1988	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
1989}
1990
1991/*
1992 * ==========================================================================
1993 * Generate and verify checksums
1994 * ==========================================================================
1995 */
1996static int
1997zio_checksum_generate(zio_t *zio)
1998{
1999	blkptr_t *bp = zio->io_bp;
2000	enum zio_checksum checksum;
2001
2002	if (bp == NULL) {
2003		/*
2004		 * This is zio_write_phys().
2005		 * We're either generating a label checksum, or none at all.
2006		 */
2007		checksum = zio->io_prop.zp_checksum;
2008
2009		if (checksum == ZIO_CHECKSUM_OFF)
2010			return (ZIO_PIPELINE_CONTINUE);
2011
2012		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2013	} else {
2014		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2015			ASSERT(!IO_IS_ALLOCATING(zio));
2016			checksum = ZIO_CHECKSUM_GANG_HEADER;
2017		} else {
2018			checksum = BP_GET_CHECKSUM(bp);
2019		}
2020	}
2021
2022	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2023
2024	return (ZIO_PIPELINE_CONTINUE);
2025}
2026
2027static int
2028zio_checksum_verify(zio_t *zio)
2029{
2030	zio_bad_cksum_t info;
2031
2032	blkptr_t *bp = zio->io_bp;
2033	int error;
2034
2035	if (bp == NULL) {
2036		/*
2037		 * This is zio_read_phys().
2038		 * We're either verifying a label checksum, or nothing at all.
2039		 */
2040		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2041			return (ZIO_PIPELINE_CONTINUE);
2042
2043		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2044	}
2045
2046	if ((error = zio_checksum_error(zio, &info)) != 0) {
2047		zio->io_error = error;
2048		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2049			zfs_ereport_start_checksum(zio->io_spa,
2050			    zio->io_vd, zio, zio->io_offset,
2051			    zio->io_size, NULL, &info);
2052		}
2053	}
2054
2055	return (ZIO_PIPELINE_CONTINUE);
2056}
2057
2058/*
2059 * Called by RAID-Z to ensure we don't compute the checksum twice.
2060 */
2061void
2062zio_checksum_verified(zio_t *zio)
2063{
2064	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
2065}
2066
2067/*
2068 * ==========================================================================
2069 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2070 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
2071 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
2072 * indicate errors that are specific to one I/O, and most likely permanent.
2073 * Any other error is presumed to be worse because we weren't expecting it.
2074 * ==========================================================================
2075 */
2076int
2077zio_worst_error(int e1, int e2)
2078{
2079	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2080	int r1, r2;
2081
2082	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2083		if (e1 == zio_error_rank[r1])
2084			break;
2085
2086	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2087		if (e2 == zio_error_rank[r2])
2088			break;
2089
2090	return (r1 > r2 ? e1 : e2);
2091}
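
/*
 * Examples: zio_worst_error(ENXIO, ECKSUM) returns ECKSUM, and
 * zio_worst_error(EIO, 0) returns EIO.  An errno that is not in the
 * table at all (EINVAL, say) never matches, keeps the end-of-array
 * rank, and therefore beats any ranked error.  A typical caller folds
 * a child's status into a running error, along the lines of
 * (hypothetical variable names):
 *
 *	error = zio_worst_error(error, cio->io_error);
 */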
2092
2093/*
2094 * ==========================================================================
2095 * I/O completion
2096 * ==========================================================================
2097 */
2098static int
2099zio_ready(zio_t *zio)
2100{
2101	blkptr_t *bp = zio->io_bp;
2102	zio_t *pio, *pio_next;
2103
2104	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
2105		return (ZIO_PIPELINE_STOP);
2106
2107	if (zio->io_ready) {
2108		ASSERT(IO_IS_ALLOCATING(zio));
2109		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2110		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2111
2112		zio->io_ready(zio);
2113	}
2114
2115	if (bp != NULL && bp != &zio->io_bp_copy)
2116		zio->io_bp_copy = *bp;
2117
2118	if (zio->io_error)
2119		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2120
2121	mutex_enter(&zio->io_lock);
2122	zio->io_state[ZIO_WAIT_READY] = 1;
2123	pio = zio_walk_parents(zio);
2124	mutex_exit(&zio->io_lock);
2125
2126	/*
2127	 * As we notify zio's parents, new parents could be added.
2128	 * New parents go to the head of zio's io_parent_list, however,
2129	 * so we will (correctly) not notify them.  The remainder of zio's
2130	 * io_parent_list, from 'pio_next' onward, cannot change because
2131	 * all parents must wait for us to be done before they can be done.
2132	 */
2133	for (; pio != NULL; pio = pio_next) {
2134		pio_next = zio_walk_parents(zio);
2135		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2136	}
2137
2138	return (ZIO_PIPELINE_CONTINUE);
2139}
2140
2141static int
2142zio_done(zio_t *zio)
2143{
2144	spa_t *spa = zio->io_spa;
2145	zio_t *lio = zio->io_logical;
2146	blkptr_t *bp = zio->io_bp;
2147	vdev_t *vd = zio->io_vd;
2148	uint64_t psize = zio->io_size;
2149	zio_t *pio, *pio_next;
2150
2151	/*
2152	 * If our children haven't all completed,
2153	 * wait for them and then repeat this pipeline stage.
2154	 */
2155	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
2156	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
2157	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
2158		return (ZIO_PIPELINE_STOP);
2159
2160	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
2161		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
2162			ASSERT(zio->io_children[c][w] == 0);
2163
2164	if (bp != NULL) {
2165		ASSERT(bp->blk_pad[0] == 0);
2166		ASSERT(bp->blk_pad[1] == 0);
2167		ASSERT(bp->blk_pad[2] == 0);
2168		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
2169		    (bp == zio_unique_parent(zio)->io_bp));
2170		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
2171		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
2172			ASSERT(!BP_SHOULD_BYTESWAP(bp));
2173			ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
2174			ASSERT(BP_COUNT_GANG(bp) == 0 ||
2175			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
2176		}
2177	}
2178
2179	/*
2180	 * If there were child vdev or gang errors, they apply to us now.
2181	 */
2182	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
2183	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
2184
2185	zio_pop_transforms(zio);	/* note: may set zio->io_error */
2186
2187	vdev_stat_update(zio, psize);
2188
2189	if (zio->io_error) {
2190		/*
2191		 * If this I/O is attached to a particular vdev,
2192		 * generate an error message describing the I/O failure
2193		 * at the block level.  We ignore these errors if the
2194		 * device is currently unavailable.
2195		 */
2196		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
2197			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
2198
2199		if ((zio->io_error == EIO || !(zio->io_flags &
2200		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
2201		    zio == lio) {
2202			/*
2203			 * For logical I/O requests, tell the SPA to log the
2204			 * error and generate a logical data ereport.
2205			 */
2206			spa_log_error(spa, zio);
2207			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
2208			    0, 0);
2209		}
2210	}
2211
2212	if (zio->io_error && zio == lio) {
2213		/*
2214		 * Determine whether zio should be reexecuted.  This will
2215		 * propagate all the way to the root via zio_notify_parent().
2216		 */
2217		ASSERT(vd == NULL && bp != NULL);
2218
2219		if (IO_IS_ALLOCATING(zio))
2220			if (zio->io_error != ENOSPC)
2221				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
2222			else
2223				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2224
2225		if ((zio->io_type == ZIO_TYPE_READ ||
2226		    zio->io_type == ZIO_TYPE_FREE) &&
2227		    zio->io_error == ENXIO &&
2228		    spa->spa_load_state == SPA_LOAD_NONE &&
2229		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
2230			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2231
2232		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
2233			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2234
2235		/*
2236		 * This is a possible place to attempt either
2237		 * combinatorial reconstruction or error correction
2238		 * based on checksums.  It might also be a good place
2239		 * to send out preliminary ereports before we suspend
2240		 * processing.
2241		 */
2242	}
2243
2244	/*
2245	 * If there were logical child errors, they apply to us now.
2246	 * We defer this until now to avoid conflating logical child
2247	 * errors with errors that happened to the zio itself when
2248	 * updating vdev stats and reporting FMA events above.
2249	 */
2250	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
2251
2252	if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) &&
2253	    zio->io_child_type == ZIO_CHILD_LOGICAL) {
2254		ASSERT(zio->io_child_type != ZIO_CHILD_GANG);
2255		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
2256	}
2257
2258	zio_gang_tree_free(&zio->io_gang_tree);
2259
2260	/*
2261	 * Godfather I/Os should never suspend.
2262	 */
2263	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
2264	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
2265		zio->io_reexecute = 0;
2266
2267	if (zio->io_reexecute) {
2268		/*
2269		 * This is a logical I/O that wants to reexecute.
2270		 *
2271		 * Reexecute is top-down.  When an i/o fails, if it's not
2272		 * the root, it simply notifies its parent and sticks around.
2273		 * The parent, seeing that it still has children in zio_done(),
2274		 * does the same.  This percolates all the way up to the root.
2275		 * The root i/o will reexecute or suspend the entire tree.
2276		 *
2277		 * This approach ensures that zio_reexecute() honors
2278		 * all the original i/o dependency relationships, e.g.
2279		 * parents not executing until children are ready.
2280		 */
2281		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2282
2283		zio->io_gang_leader = NULL;
2284
2285		mutex_enter(&zio->io_lock);
2286		zio->io_state[ZIO_WAIT_DONE] = 1;
2287		mutex_exit(&zio->io_lock);
2288
2289		/*
2290		 * "The Godfather" I/O monitors its children but is
2291		 * not a true parent to them. It will track them through
2292		 * the pipeline but severs its ties whenever they get into
2293		 * trouble (e.g. suspended). This allows "The Godfather"
2294		 * I/O to return status without blocking.
2295		 */
2296		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
2297			zio_link_t *zl = zio->io_walk_link;
2298			pio_next = zio_walk_parents(zio);
2299
2300			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
2301			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
2302				zio_remove_child(pio, zio, zl);
2303				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2304			}
2305		}
2306
2307		if ((pio = zio_unique_parent(zio)) != NULL) {
2308			/*
2309			 * We're not a root i/o, so there's nothing to do
2310			 * but notify our parent.  Don't propagate errors
2311			 * upward since we haven't permanently failed yet.
2312			 */
2313			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
2314			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
2315			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2316		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
2317			/*
2318			 * We'd fail again if we reexecuted now, so suspend
2319			 * until conditions improve (e.g. device comes online).
2320			 */
2321			zio_suspend(spa, zio);
2322		} else {
2323			/*
2324			 * Reexecution is potentially a huge amount of work.
2325			 * Hand it off to the otherwise-unused claim taskq.
2326			 */
2327			(void) taskq_dispatch(
2328			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
2329			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
2330		}
2331		return (ZIO_PIPELINE_STOP);
2332	}
2333
2334	ASSERT(zio_walk_children(zio) == NULL);
2335	ASSERT(zio->io_reexecute == 0);
2336	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
2337
2338	/* Report any checksum errors, since the I/O is complete */
2339	while (zio->io_cksum_report != NULL) {
2340		zio_cksum_report_t *rpt = zio->io_cksum_report;
2341
2342		zio->io_cksum_report = rpt->zcr_next;
2343		rpt->zcr_next = NULL;
2344
2345		/* only pass in our data buffer if we've succeeded. */
2346		rpt->zcr_finish(rpt,
2347		    (zio->io_error == 0) ? zio->io_data : NULL);
2348
2349		zfs_ereport_free_checksum(rpt);
2350	}
2351
2352	/*
2353	 * It is the responsibility of the done callback to ensure that this
2354	 * particular zio is no longer discoverable for adoption, and as
2355	 * such, cannot acquire any new parents.
2356	 */
2357	if (zio->io_done)
2358		zio->io_done(zio);
2359
2360	mutex_enter(&zio->io_lock);
2361	zio->io_state[ZIO_WAIT_DONE] = 1;
2362	mutex_exit(&zio->io_lock);
2363
2364	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
2365		zio_link_t *zl = zio->io_walk_link;
2366		pio_next = zio_walk_parents(zio);
2367		zio_remove_child(pio, zio, zl);
2368		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2369	}
2370
2371	if (zio->io_waiter != NULL) {
2372		mutex_enter(&zio->io_lock);
2373		zio->io_executor = NULL;
2374		cv_broadcast(&zio->io_cv);
2375		mutex_exit(&zio->io_lock);
2376	} else {
2377		zio_destroy(zio);
2378	}
2379
2380	return (ZIO_PIPELINE_STOP);
2381}
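
/*
 * A minimal sketch of how a godfather I/O is typically used (the call
 * sequence is hypothetical; only the flag handling above is
 * authoritative):
 *
 *	zio_t *gf = zio_root(spa, NULL, NULL,
 *	    ZIO_FLAG_GODFATHER | ZIO_FLAG_CANFAIL);
 *	...
 *	zio_nowait(zio_write(gf, spa, txg, ...));	(gf adopts the child)
 *	...
 *	(void) zio_wait(gf);
 *
 * If a child must suspend, the reexecute path above severs its link to
 * gf and notifies it, so the zio_wait() can return instead of blocking
 * until the pool resumes.
 */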
2382
2383/*
2384 * ==========================================================================
2385 * I/O pipeline definition
2386 * ==========================================================================
2387 */
2388static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = {
2389	NULL,
2390	zio_issue_async,
2391	zio_read_bp_init,
2392	zio_write_bp_init,
2393	zio_checksum_generate,
2394	zio_gang_assemble,
2395	zio_gang_issue,
2396	zio_dva_allocate,
2397	zio_dva_free,
2398	zio_dva_claim,
2399	zio_ready,
2400	zio_vdev_io_start,
2401	zio_vdev_io_done,
2402	zio_vdev_io_assess,
2403	zio_checksum_verify,
2404	zio_done
2405};
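
/*
 * Conceptually, bit n of io_pipeline gates zio_pipeline[n]: advancing a
 * zio means finding the next stage whose bit is still set and calling
 * that stage's function.  A sketch of the dispatch (not the literal
 * zio_execute() loop):
 *
 *	while (((1U << ++stage) & zio->io_pipeline) == 0)
 *		continue;			(stage is masked off; skip)
 *	rv = zio_pipeline[stage](zio);
 *
 * This is why error paths can collapse io_pipeline to
 * ZIO_INTERLOCK_PIPELINE and why zio_checksum_verified() can clear a
 * single stage bit to skip just the checksum verification.
 */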
2406