/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
#if defined(__amd64__)
static int zio_use_uma = 1;
#else
static int zio_use_uma = 0;
#endif
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
    "Use uma(9) for ZIO allocations");
static int zio_exclude_metadata = 0;
TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
    "Exclude metadata buffers from dumps as well");

zio_trim_stats_t zio_trim_stats = {
	{ "bytes",		KSTAT_DATA_UINT64,
	  "Number of bytes successfully TRIMmed" },
	{ "success",		KSTAT_DATA_UINT64,
	  "Number of successful TRIM requests" },
	{ "unsupported",	KSTAT_DATA_UINT64,
	  "Number of TRIM requests that failed because TRIM is not supported" },
	{ "failed",		KSTAT_DATA_UINT64,
	  "Number of TRIM requests that failed for reasons other than not supported" },
};

static kstat_t *zio_trim_ksp;

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
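
/*
 * For example, with the defaults above: pass 1 of spa_sync() compresses
 * and allocates normally; from pass 2 on, new frees are deferred to the
 * next txg and blocks whose physical size is unchanged are rewritten in
 * place rather than reallocated; from pass 5 on, compression is disabled
 * so block sizes stop changing and the pass can converge.
 */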

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (!zio_use_uma)
		goto out;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
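	/*
	 * For example, assuming 4KB pages: a 2.5KB buffer is aligned to
	 * p2 >> 2 = 512 bytes, a 12KB buffer is page-aligned, and any
	 * size matching none of the cases below is left NULL here and
	 * pointed at the next larger cache by the fix-up loop that
	 * follows the main loop.
	 */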
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifdef illumos
#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
#endif /* illumos */
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
		}
	}

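	/*
	 * Fill in the gaps: any size class skipped above still has a NULL
	 * cache pointer, so walk downward and let it share the next larger
	 * cache.
	 */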
	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}
out:

	zio_inject_init();

	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zio_trim_ksp != NULL) {
		zio_trim_ksp->ks_data = &zio_trim_stats;
		kstat_install(zio_trim_ksp);
	}
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	if (zio_trim_ksp != NULL) {
		kstat_delete(zio_trim_ksp);
		zio_trim_ksp = NULL;
	}
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;

	ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP|flags));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the amount
 * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 * of kernel heap dumped to disk when the kernel panics)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
	else
		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	if (zio_use_uma)
		kmem_cache_free(zio_data_buf_cache[c], buf);
	else
		kmem_free(buf, size);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
	zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}
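
/*
 * For example, zio_read_bp_init() below pushes a zio_decompress transform
 * for compressed blocks: the raw bytes land in the pushed buffer, and when
 * the transform stack is popped at I/O completion, the callback
 * decompresses them into the original buffer.  Transforms pop in LIFO
 * order, undoing each substitution in turn.
 */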

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (The alternative would be for
 *        every caller to pass in a cookie to track the state held in
 *        io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}
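
/*
 * A typical walk (a sketch) runs the iterator to completion, which leaves
 * the io_walk_link cursor reset for the next caller:
 *
 *	zio_t *cio;
 *	while ((cio = zio_walk_children(pio)) != NULL)
 *		...
 *
 * zio_unique_parent() below follows the same discipline: it reads the
 * first parent, then walks once more to verify there is no second one.
 */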

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
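		/*
		 * Back io_stage up by one bit: zio_execute() advances
		 * io_stage with a left shift before dispatching, so the
		 * stalled zio re-enters this same stage when it is
		 * re-executed.
		 */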
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	/*
	 * Data can be NULL if we are going to call zio_write_override() to
	 * provide the already-allocated BP.  But we may need the data to
	 * verify a dedup hit (if requested).  In this case, don't try to
	 * dedup (just take the already-allocated BP verbatim).
	 */
	if (data == NULL && zio->io_prop.zp_dedup_verify) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync() keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{

	/*
	 * The check for EMBEDDED is a performance optimization.  We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
		    BP_GET_PSIZE(bp), 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    uint64_t size, enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

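	/*
	 * When TRIM is enabled, route frees through the vdev I/O stages,
	 * issued asynchronously, so the freed range reaches the vdev layer
	 * (where the trim map can pick it up) instead of completing in the
	 * interlock pipeline alone.
	 */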
	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
		    ZIO_STAGE_VDEV_IO_ASSESS;
	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	flags |= ZIO_FLAG_DONT_QUEUE;

	zio = zio_create(pio, spa, txg, bp, NULL, size,
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "claiming in txg %llu", txg);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    offset, size, done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
	void *data, uint64_t size, int type, zio_priority_t priority,
	enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	/* Not all I/O types require the vdev-io-done stage, e.g. frees. */
	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

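	/*
	 * Start the child at ZIO_STAGE_VDEV_IO_START >> 1: zio_execute()
	 * shifts the stage left before dispatching, so the first stage
	 * the child actually runs is VDEV_IO_START.
	 */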
973185029Spjd	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
974219089Spjd	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
975219089Spjd	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
976168404Spjd
977260763Savg	zio->io_physdone = pio->io_physdone;
978260763Savg	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
979260763Savg		zio->io_logical->io_phys_children++;
980260763Savg
981185029Spjd	return (zio);
982168404Spjd}
983168404Spjd
984185029Spjdzio_t *
985185029Spjdzio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
986260763Savg	int type, zio_priority_t priority, enum zio_flag flags,
987219089Spjd	zio_done_func_t *done, void *private)
988168404Spjd{
989185029Spjd	zio_t *zio;
990168404Spjd
991185029Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
992168404Spjd
993185029Spjd	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
994185029Spjd	    data, size, done, private, type, priority,
995260763Savg	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
996185029Spjd	    vd, offset, NULL,
997219089Spjd	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
998168404Spjd
999185029Spjd	return (zio);
1000168404Spjd}
1001168404Spjd
1002168404Spjdvoid
1003185029Spjdzio_flush(zio_t *zio, vdev_t *vd)
1004168404Spjd{
1005240868Spjd	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
1006270312Ssmh	    NULL, NULL, ZIO_PRIORITY_NOW,
1007185029Spjd	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
1008168404Spjd}
1009168404Spjd
1010240868Spjdzio_t *
1011240868Spjdzio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
1012240868Spjd{
1013240868Spjd
1014240868Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
1015240868Spjd
1016270312Ssmh	return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
1017270312Ssmh	    ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
1018270312Ssmh	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
1019270312Ssmh	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
1020240868Spjd}
1021240868Spjd
1022219089Spjdvoid
1023219089Spjdzio_shrink(zio_t *zio, uint64_t size)
1024219089Spjd{
1025219089Spjd	ASSERT(zio->io_executor == NULL);
1026219089Spjd	ASSERT(zio->io_orig_size == zio->io_size);
1027219089Spjd	ASSERT(size <= zio->io_size);
1028219089Spjd
1029219089Spjd	/*
1030219089Spjd	 * We don't shrink for raidz because of problems with the
1031219089Spjd	 * reconstruction when reading back less than the block size.
1032219089Spjd	 * Note, BP_IS_RAIDZ() assumes no compression.
1033219089Spjd	 */
1034219089Spjd	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1035219089Spjd	if (!BP_IS_RAIDZ(zio->io_bp))
1036219089Spjd		zio->io_orig_size = zio->io_size = size;
1037219089Spjd}
1038219089Spjd
1039168404Spjd/*
1040168404Spjd * ==========================================================================
1041185029Spjd * Prepare to read and write logical blocks
1042168404Spjd * ==========================================================================
1043168404Spjd */
1044185029Spjd
1045185029Spjdstatic int
1046270312Ssmhzio_read_bp_init(zio_t *zio)
1047168404Spjd{
1048185029Spjd	blkptr_t *bp = zio->io_bp;
1049185029Spjd
1050209962Smm	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1051209962Smm	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
1052209962Smm	    !(zio->io_flags & ZIO_FLAG_RAW)) {
1053268649Sdelphij		uint64_t psize =
1054268649Sdelphij		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
1055219089Spjd		void *cbuf = zio_buf_alloc(psize);
1056185029Spjd
1057219089Spjd		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1058168404Spjd	}
1059185029Spjd
1060268649Sdelphij	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
1061268649Sdelphij		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1062268649Sdelphij		decode_embedded_bp_compressed(bp, zio->io_data);
1063268649Sdelphij	} else {
1064268649Sdelphij		ASSERT(!BP_IS_EMBEDDED(bp));
1065268649Sdelphij	}
1066268649Sdelphij
1067236884Smm	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1068185029Spjd		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1069185029Spjd
1070219089Spjd	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1071219089Spjd		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1072219089Spjd
1073219089Spjd	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1074219089Spjd		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1075219089Spjd
1076185029Spjd	return (ZIO_PIPELINE_CONTINUE);
1077168404Spjd}
1078168404Spjd
1079185029Spjdstatic int
1080270312Ssmhzio_write_bp_init(zio_t *zio)
1081168404Spjd{
1082219089Spjd	spa_t *spa = zio->io_spa;
1083185029Spjd	zio_prop_t *zp = &zio->io_prop;
1084219089Spjd	enum zio_compress compress = zp->zp_compress;
1085185029Spjd	blkptr_t *bp = zio->io_bp;
1086185029Spjd	uint64_t lsize = zio->io_size;
1087219089Spjd	uint64_t psize = lsize;
1088185029Spjd	int pass = 1;
1089168404Spjd
1090185029Spjd	/*
1091185029Spjd	 * If our children haven't all reached the ready stage,
1092185029Spjd	 * wait for them and then repeat this pipeline stage.
1093185029Spjd	 */
1094185029Spjd	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1095185029Spjd	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1096185029Spjd		return (ZIO_PIPELINE_STOP);
1097185029Spjd
1098185029Spjd	if (!IO_IS_ALLOCATING(zio))
1099185029Spjd		return (ZIO_PIPELINE_CONTINUE);
1100185029Spjd
1101219089Spjd	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1102185029Spjd
1103219089Spjd	if (zio->io_bp_override) {
1104219089Spjd		ASSERT(bp->blk_birth != zio->io_txg);
1105219089Spjd		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1106219089Spjd
1107219089Spjd		*bp = *zio->io_bp_override;
1108219089Spjd		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1109219089Spjd
1110268649Sdelphij		if (BP_IS_EMBEDDED(bp))
1111268649Sdelphij			return (ZIO_PIPELINE_CONTINUE);
1112268649Sdelphij
1113243524Smm		/*
1114243524Smm		 * If we've been overridden and nopwrite is set then
1115243524Smm		 * set the flag accordingly to indicate that a nopwrite
1116243524Smm		 * has already occurred.
1117243524Smm		 */
1118243524Smm		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1119243524Smm			ASSERT(!zp->zp_dedup);
1120243524Smm			zio->io_flags |= ZIO_FLAG_NOPWRITE;
1121243524Smm			return (ZIO_PIPELINE_CONTINUE);
1122243524Smm		}
1123243524Smm
1124243524Smm		ASSERT(!zp->zp_nopwrite);
1125243524Smm
1126219089Spjd		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1127219089Spjd			return (ZIO_PIPELINE_CONTINUE);
1128219089Spjd
1129219089Spjd		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1130219089Spjd		    zp->zp_dedup_verify);
1131219089Spjd
1132219089Spjd		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1133219089Spjd			BP_SET_DEDUP(bp, 1);
1134219089Spjd			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1135219089Spjd			return (ZIO_PIPELINE_CONTINUE);
1136219089Spjd		}
1137219089Spjd		zio->io_bp_override = NULL;
1138219089Spjd		BP_ZERO(bp);
1139219089Spjd	}
1140219089Spjd
1141263397Sdelphij	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
1142185029Spjd		/*
1143185029Spjd		 * We're rewriting an existing block, which means we're
1144185029Spjd		 * working on behalf of spa_sync().  For spa_sync() to
1145185029Spjd		 * converge, it must eventually be the case that we don't
1146185029Spjd		 * have to allocate new blocks.  But compression changes
1147185029Spjd		 * the blocksize, which forces a reallocate, and makes
1148185029Spjd		 * convergence take longer.  Therefore, after the first
1149185029Spjd		 * few passes, stop compressing to ensure convergence.
1150185029Spjd		 */
1151219089Spjd		pass = spa_sync_pass(spa);
1152185029Spjd
1153219089Spjd		ASSERT(zio->io_txg == spa_syncing_txg(spa));
1154219089Spjd		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1155219089Spjd		ASSERT(!BP_GET_DEDUP(bp));
1156219089Spjd
1157243503Smm		if (pass >= zfs_sync_pass_dont_compress)
1158185029Spjd			compress = ZIO_COMPRESS_OFF;
1159185029Spjd
1160185029Spjd		/* Make sure someone doesn't change their mind on overwrites */
1161268649Sdelphij		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
1162219089Spjd		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1163185029Spjd	}
1164185029Spjd
1165185029Spjd	if (compress != ZIO_COMPRESS_OFF) {
1166219089Spjd		void *cbuf = zio_buf_alloc(lsize);
1167269732Sdelphij		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1168219089Spjd		if (psize == 0 || psize == lsize) {
1169185029Spjd			compress = ZIO_COMPRESS_OFF;
1170219089Spjd			zio_buf_free(cbuf, lsize);
1171268649Sdelphij		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
1172268649Sdelphij		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1173268649Sdelphij		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
1174268649Sdelphij			encode_embedded_bp_compressed(bp,
1175268649Sdelphij			    cbuf, compress, lsize, psize);
1176268649Sdelphij			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1177268649Sdelphij			BP_SET_TYPE(bp, zio->io_prop.zp_type);
1178268649Sdelphij			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1179268649Sdelphij			zio_buf_free(cbuf, lsize);
1180268649Sdelphij			bp->blk_birth = zio->io_txg;
1181268649Sdelphij			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1182268649Sdelphij			ASSERT(spa_feature_is_active(spa,
1183268649Sdelphij			    SPA_FEATURE_EMBEDDED_DATA));
1184268649Sdelphij			return (ZIO_PIPELINE_CONTINUE);
1185219089Spjd		} else {
1186268649Sdelphij			/*
1187268649Sdelphij			 * Round up compressed size to MINBLOCKSIZE and
1188268649Sdelphij			 * zero the tail.
1189268649Sdelphij			 */
1190268649Sdelphij			size_t rounded =
1191268649Sdelphij			    P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
1192268649Sdelphij			if (rounded > psize) {
1193268649Sdelphij				bzero((char *)cbuf + psize, rounded - psize);
1194268649Sdelphij				psize = rounded;
1195268649Sdelphij			}
1196268649Sdelphij			if (psize == lsize) {
1197268649Sdelphij				compress = ZIO_COMPRESS_OFF;
1198268649Sdelphij				zio_buf_free(cbuf, lsize);
1199268649Sdelphij			} else {
1200268649Sdelphij				zio_push_transform(zio, cbuf,
1201268649Sdelphij				    psize, lsize, NULL);
1202268649Sdelphij			}
1203185029Spjd		}
1204185029Spjd	}
1205185029Spjd
1206185029Spjd	/*
1207185029Spjd	 * The final pass of spa_sync() must be all rewrites, but the first
1208185029Spjd	 * few passes offer a trade-off: allocating blocks defers convergence,
1209185029Spjd	 * but newly allocated blocks are sequential, so they can be written
1210185029Spjd	 * to disk faster.  Therefore, we allow the first few passes of
1211185029Spjd	 * spa_sync() to allocate new blocks, but force rewrites after that.
1212185029Spjd	 * There should only be a handful of blocks after pass 1 in any case.
1213185029Spjd	 */
1214263397Sdelphij	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1215263397Sdelphij	    BP_GET_PSIZE(bp) == psize &&
1216243503Smm	    pass >= zfs_sync_pass_rewrite) {
1217219089Spjd		ASSERT(psize != 0);
1218219089Spjd		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1219185029Spjd		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1220185029Spjd		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1221168404Spjd	} else {
1222185029Spjd		BP_ZERO(bp);
1223185029Spjd		zio->io_pipeline = ZIO_WRITE_PIPELINE;
1224168404Spjd	}
1225185029Spjd
1226219089Spjd	if (psize == 0) {
1227263397Sdelphij		if (zio->io_bp_orig.blk_birth != 0 &&
1228263397Sdelphij		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1229263397Sdelphij			BP_SET_LSIZE(bp, lsize);
1230263397Sdelphij			BP_SET_TYPE(bp, zp->zp_type);
1231263397Sdelphij			BP_SET_LEVEL(bp, zp->zp_level);
1232263397Sdelphij			BP_SET_BIRTH(bp, zio->io_txg, 0);
1233263397Sdelphij		}
1234185029Spjd		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1235185029Spjd	} else {
1236185029Spjd		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1237185029Spjd		BP_SET_LSIZE(bp, lsize);
1238263397Sdelphij		BP_SET_TYPE(bp, zp->zp_type);
1239263397Sdelphij		BP_SET_LEVEL(bp, zp->zp_level);
1240219089Spjd		BP_SET_PSIZE(bp, psize);
1241185029Spjd		BP_SET_COMPRESS(bp, compress);
1242185029Spjd		BP_SET_CHECKSUM(bp, zp->zp_checksum);
1243219089Spjd		BP_SET_DEDUP(bp, zp->zp_dedup);
1244185029Spjd		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1245219089Spjd		if (zp->zp_dedup) {
1246219089Spjd			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1247219089Spjd			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1248219089Spjd			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1249219089Spjd		}
1250243524Smm		if (zp->zp_nopwrite) {
1251243524Smm			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1252243524Smm			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1253243524Smm			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1254243524Smm		}
1255185029Spjd	}
1256185029Spjd
1257185029Spjd	return (ZIO_PIPELINE_CONTINUE);
1258168404Spjd}
1259168404Spjd
1260219089Spjdstatic int
1261270312Ssmhzio_free_bp_init(zio_t *zio)
1262219089Spjd{
1263219089Spjd	blkptr_t *bp = zio->io_bp;
1264219089Spjd
1265219089Spjd	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1266219089Spjd		if (BP_GET_DEDUP(bp))
1267219089Spjd			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1268219089Spjd	}
1269219089Spjd
1270219089Spjd	return (ZIO_PIPELINE_CONTINUE);
1271219089Spjd}
1272219089Spjd
1273185029Spjd/*
1274185029Spjd * ==========================================================================
1275185029Spjd * Execute the I/O pipeline
1276185029Spjd * ==========================================================================
1277185029Spjd */
1278185029Spjd
1279168404Spjdstatic void
1280260750Savgzio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1281168404Spjd{
1282211931Smm	spa_t *spa = zio->io_spa;
1283185029Spjd	zio_type_t t = zio->io_type;
1284260742Savg	int flags = (cutinline ? TQ_FRONT : 0);
1285168404Spjd
1286216919Smm	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
1287216919Smm
1288185029Spjd	/*
1289209096Smm	 * If we're a config writer or a probe, the normal issue and
1290209096Smm	 * interrupt threads may all be blocked waiting for the config lock.
1291209096Smm	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1292185029Spjd	 */
1293209096Smm	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1294185029Spjd		t = ZIO_TYPE_NULL;
1295185029Spjd
1296185029Spjd	/*
1297185029Spjd	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1298185029Spjd	 */
1299185029Spjd	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1300185029Spjd		t = ZIO_TYPE_NULL;
1301185029Spjd
1302211931Smm	/*
1303260750Savg	 * If this is a high priority I/O, then use the high priority taskq if
1304260750Savg	 * available.
1305211931Smm	 */
1306211931Smm	if (zio->io_priority == ZIO_PRIORITY_NOW &&
1307260750Savg	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1308211931Smm		q++;
1309211931Smm
1310211931Smm	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1311260742Savg
1312260742Savg	/*
1313260742Savg	 * NB: We are assuming that the zio can only be dispatched
1314260742Savg	 * to a single taskq at a time.  It would be a grievous error
1315260742Savg	 * to dispatch the zio to another taskq at the same time.
1316260742Savg	 */
1317260742Savg#if defined(illumos) || !defined(_KERNEL)
1318260742Savg	ASSERT(zio->io_tqent.tqent_next == NULL);
1319216919Smm#else
1320260742Savg	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
1321216919Smm#endif
1322260750Savg	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1323260750Savg	    flags, &zio->io_tqent);
1324168404Spjd}
1325168404Spjd
1326185029Spjdstatic boolean_t
1327260750Savgzio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1328168404Spjd{
1329185029Spjd	kthread_t *executor = zio->io_executor;
1330185029Spjd	spa_t *spa = zio->io_spa;
1331168404Spjd
1332260750Savg	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
1333260750Savg		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1334260750Savg		uint_t i;
1335260750Savg		for (i = 0; i < tqs->stqs_count; i++) {
1336260750Savg			if (taskq_member(tqs->stqs_taskq[i], executor))
1337260750Savg				return (B_TRUE);
1338260750Savg		}
1339260750Savg	}
1340168404Spjd
1341185029Spjd	return (B_FALSE);
1342185029Spjd}
1343168404Spjd
1344185029Spjdstatic int
1345270312Ssmhzio_issue_async(zio_t *zio)
1346185029Spjd{
1347219089Spjd	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1348168404Spjd
1349185029Spjd	return (ZIO_PIPELINE_STOP);
1350168404Spjd}
1351168404Spjd
1352185029Spjdvoid
1353185029Spjdzio_interrupt(zio_t *zio)
1354168404Spjd{
1355219089Spjd	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1356185029Spjd}
1357168404Spjd
1358185029Spjd/*
1359185029Spjd * Execute the I/O pipeline until one of the following occurs:
1360185029Spjd *
1361251631Sdelphij *	(1) the I/O completes
1362251631Sdelphij *	(2) the pipeline stalls waiting for dependent child I/Os
1363251631Sdelphij *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
1364251631Sdelphij *	(4) the I/O is delegated by vdev-level caching or aggregation
1365251631Sdelphij *	(5) the I/O is deferred due to vdev-level queueing
1366251631Sdelphij *	(6) the I/O is handed off to another thread.
1367251631Sdelphij *
1368251631Sdelphij * In all cases, the pipeline stops whenever there's no CPU work; it never
1369251631Sdelphij * burns a thread in cv_wait().
1370251631Sdelphij *
1371185029Spjd * There's no locking on io_stage because there's no legitimate way
1372185029Spjd * for multiple threads to be attempting to process the same I/O.
1373185029Spjd */
1374219089Spjdstatic zio_pipe_stage_t *zio_pipeline[];
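
/*
 * A sketch of the stage-advance step in zio_execute() below, using
 * made-up mask values (the real one-hot stage bits live in
 * zio_impl.h): with io_stage == 0x004 and io_pipeline == 0x031,
 *
 *	do {
 *		stage <<= 1;			0x008, then 0x010
 *	} while ((stage & pipeline) == 0);	stops at 0x010
 *
 * the loop walks the one-hot stage bit upward until it lands on the
 * next bit that is actually set in this zio's pipeline; stages absent
 * from io_pipeline are skipped entirely.
 */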
1375168404Spjd
1376185029Spjdvoid
1377185029Spjdzio_execute(zio_t *zio)
1378185029Spjd{
1379185029Spjd	zio->io_executor = curthread;
1380168404Spjd
1381185029Spjd	while (zio->io_stage < ZIO_STAGE_DONE) {
1382219089Spjd		enum zio_stage pipeline = zio->io_pipeline;
1383219089Spjd		enum zio_stage stage = zio->io_stage;
1384185029Spjd		int rv;
1385168404Spjd
1386185029Spjd		ASSERT(!MUTEX_HELD(&zio->io_lock));
1387219089Spjd		ASSERT(ISP2(stage));
1388219089Spjd		ASSERT(zio->io_stall == NULL);
1389168404Spjd
1390219089Spjd		do {
1391219089Spjd			stage <<= 1;
1392219089Spjd		} while ((stage & pipeline) == 0);
1393168404Spjd
1394185029Spjd		ASSERT(stage <= ZIO_STAGE_DONE);
1395168404Spjd
1396168404Spjd		/*
1397185029Spjd		 * If we are in interrupt context and this pipeline stage
1398185029Spjd		 * will grab a config lock that is held across I/O,
1399219089Spjd		 * or may wait for an I/O that needs an interrupt thread
1400219089Spjd		 * to complete, issue async to avoid deadlock.
1401219089Spjd		 *
1402219089Spjd		 * For VDEV_IO_START, we cut in line so that the io will
1403219089Spjd		 * be sent to disk promptly.
1404168404Spjd		 */
1405219089Spjd		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1406185029Spjd		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1407219089Spjd			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1408219089Spjd			    zio_requeue_io_start_cut_in_line : B_FALSE;
1409219089Spjd			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1410185029Spjd			return;
1411185029Spjd		}
1412168404Spjd
1413185029Spjd		zio->io_stage = stage;
1414270312Ssmh		rv = zio_pipeline[highbit64(stage) - 1](zio);
1415185029Spjd
1416185029Spjd		if (rv == ZIO_PIPELINE_STOP)
1417185029Spjd			return;
1418185029Spjd
1419185029Spjd		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1420168404Spjd	}
1421185029Spjd}
1422168404Spjd
1423185029Spjd/*
1424185029Spjd * ==========================================================================
1425185029Spjd * Initiate I/O, either sync or async
1426185029Spjd * ==========================================================================
1427185029Spjd */
1428185029Spjdint
1429185029Spjdzio_wait(zio_t *zio)
1430185029Spjd{
1431185029Spjd	int error;
1432168404Spjd
1433185029Spjd	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1434185029Spjd	ASSERT(zio->io_executor == NULL);
1435168404Spjd
1436185029Spjd	zio->io_waiter = curthread;
1437168404Spjd
1438185029Spjd	zio_execute(zio);
1439168404Spjd
1440185029Spjd	mutex_enter(&zio->io_lock);
1441185029Spjd	while (zio->io_executor != NULL)
1442185029Spjd		cv_wait(&zio->io_cv, &zio->io_lock);
1443185029Spjd	mutex_exit(&zio->io_lock);
1444168404Spjd
1445185029Spjd	error = zio->io_error;
1446185029Spjd	zio_destroy(zio);
1447168404Spjd
1448185029Spjd	return (error);
1449185029Spjd}
1450185029Spjd
1451185029Spjdvoid
1452185029Spjdzio_nowait(zio_t *zio)
1453185029Spjd{
1454185029Spjd	ASSERT(zio->io_executor == NULL);
1455185029Spjd
1456209962Smm	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1457209962Smm	    zio_unique_parent(zio) == NULL) {
1458185029Spjd		/*
1459185029Spjd		 * This is a logical async I/O with no parent waiting for it.
1460209962Smm		 * We add it to the spa_async_zio_root "Godfather" I/O, which
1461209962Smm		 * will ensure it completes prior to unloading the pool.
1462185029Spjd		 */
1463185029Spjd		spa_t *spa = zio->io_spa;
1464209962Smm
1465209962Smm		zio_add_child(spa->spa_async_zio_root, zio);
1466168404Spjd	}
1467185029Spjd
1468185029Spjd	zio_execute(zio);
1469168404Spjd}
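
/*
 * Typical usage, as a hedged sketch (arguments abbreviated; "zb", "zp"
 * and the callbacks are caller-supplied): a synchronous caller blocks
 * on the result,
 *
 *	error = zio_wait(zio_read(NULL, spa, bp, buf,
 *	    BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 *	    ZIO_FLAG_CANFAIL, &zb));
 *
 * while an asynchronous caller fires and forgets,
 *
 *	zio_nowait(zio_write(pio, spa, txg, bp, buf, size, &zp,
 *	    ready_cb, NULL, done_cb, private, priority, flags, &zb));
 *
 * and either waits on the parent "pio" or relies on the godfather
 * I/O described above to reap the child.
 */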
1470168404Spjd
1471168404Spjd/*
1472168404Spjd * ==========================================================================
1473185029Spjd * Reexecute or suspend/resume failed I/O
1474168404Spjd * ==========================================================================
1475168404Spjd */
1476185029Spjd
1477168404Spjdstatic void
1478185029Spjdzio_reexecute(zio_t *pio)
1479168404Spjd{
1480209962Smm	zio_t *cio, *cio_next;
1481168404Spjd
1482209962Smm	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1483209962Smm	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1484209962Smm	ASSERT(pio->io_gang_leader == NULL);
1485209962Smm	ASSERT(pio->io_gang_tree == NULL);
1486209962Smm
1487185029Spjd	pio->io_flags = pio->io_orig_flags;
1488185029Spjd	pio->io_stage = pio->io_orig_stage;
1489185029Spjd	pio->io_pipeline = pio->io_orig_pipeline;
1490185029Spjd	pio->io_reexecute = 0;
1491243524Smm	pio->io_flags |= ZIO_FLAG_REEXECUTED;
1492185029Spjd	pio->io_error = 0;
1493209962Smm	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1494209962Smm		pio->io_state[w] = 0;
1495185029Spjd	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1496185029Spjd		pio->io_child_error[c] = 0;
1497185029Spjd
1498219089Spjd	if (IO_IS_ALLOCATING(pio))
1499219089Spjd		BP_ZERO(pio->io_bp);
1500168404Spjd
1501185029Spjd	/*
1502185029Spjd	 * As we reexecute pio's children, new children could be created.
1503209962Smm	 * New children go to the head of pio's io_child_list, however,
1504185029Spjd	 * so we will (correctly) not reexecute them.  The key is that
1505209962Smm	 * the remainder of pio's io_child_list, from 'cio_next' onward,
1506209962Smm	 * cannot be affected by any side effects of reexecuting 'cio'.
1507185029Spjd	 */
1508209962Smm	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1509209962Smm		cio_next = zio_walk_children(pio);
1510185029Spjd		mutex_enter(&pio->io_lock);
1511209962Smm		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1512209962Smm			pio->io_children[cio->io_child_type][w]++;
1513185029Spjd		mutex_exit(&pio->io_lock);
1514209962Smm		zio_reexecute(cio);
1515185029Spjd	}
1516168404Spjd
1517168404Spjd	/*
1518185029Spjd	 * Now that all children have been reexecuted, execute the parent.
1519209962Smm	 * We don't reexecute "The Godfather" I/O here as it's the
1520209962Smm	 * responsibility of the caller to wait on him.
1521168404Spjd	 */
1522209962Smm	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1523209962Smm		zio_execute(pio);
1524185029Spjd}
1525185029Spjd
1526185029Spjdvoid
1527185029Spjdzio_suspend(spa_t *spa, zio_t *zio)
1528185029Spjd{
1529185029Spjd	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1530185029Spjd		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1531185029Spjd		    "failure and the failure mode property for this pool "
1532185029Spjd		    "is set to panic.", spa_name(spa));
1533185029Spjd
1534185029Spjd	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1535185029Spjd
1536185029Spjd	mutex_enter(&spa->spa_suspend_lock);
1537185029Spjd
1538185029Spjd	if (spa->spa_suspend_zio_root == NULL)
1539209962Smm		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1540209962Smm		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1541209962Smm		    ZIO_FLAG_GODFATHER);
1542185029Spjd
1543185029Spjd	spa->spa_suspended = B_TRUE;
1544185029Spjd
1545185029Spjd	if (zio != NULL) {
1546209962Smm		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1547185029Spjd		ASSERT(zio != spa->spa_suspend_zio_root);
1548185029Spjd		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1549209962Smm		ASSERT(zio_unique_parent(zio) == NULL);
1550185029Spjd		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1551185029Spjd		zio_add_child(spa->spa_suspend_zio_root, zio);
1552168404Spjd	}
1553168404Spjd
1554185029Spjd	mutex_exit(&spa->spa_suspend_lock);
1555168404Spjd}
1556168404Spjd
1557209962Smmint
1558185029Spjdzio_resume(spa_t *spa)
1559168404Spjd{
1560209962Smm	zio_t *pio;
1561168404Spjd
1562185029Spjd	/*
1563185029Spjd	 * Reexecute all previously suspended i/o.
1564185029Spjd	 */
1565185029Spjd	mutex_enter(&spa->spa_suspend_lock);
1566185029Spjd	spa->spa_suspended = B_FALSE;
1567185029Spjd	cv_broadcast(&spa->spa_suspend_cv);
1568185029Spjd	pio = spa->spa_suspend_zio_root;
1569185029Spjd	spa->spa_suspend_zio_root = NULL;
1570185029Spjd	mutex_exit(&spa->spa_suspend_lock);
1571168404Spjd
1572185029Spjd	if (pio == NULL)
1573209962Smm		return (0);
1574168404Spjd
1575209962Smm	zio_reexecute(pio);
1576209962Smm	return (zio_wait(pio));
1577168404Spjd}
1578168404Spjd
1579185029Spjdvoid
1580185029Spjdzio_resume_wait(spa_t *spa)
1581185029Spjd{
1582185029Spjd	mutex_enter(&spa->spa_suspend_lock);
1583185029Spjd	while (spa_suspended(spa))
1584185029Spjd		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1585185029Spjd	mutex_exit(&spa->spa_suspend_lock);
1586185029Spjd}
1587185029Spjd
1588168404Spjd/*
1589168404Spjd * ==========================================================================
1590185029Spjd * Gang blocks.
1591185029Spjd *
1592185029Spjd * A gang block is a collection of small blocks that looks to the DMU
1593185029Spjd * like one large block.  When zio_dva_allocate() cannot find a block
1594185029Spjd * of the requested size, due to either severe fragmentation or the pool
1595185029Spjd * being nearly full, it calls zio_write_gang_block() to construct the
1596185029Spjd * block from smaller fragments.
1597185029Spjd *
1598185029Spjd * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1599185029Spjd * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1600185029Spjd * an indirect block: it's an array of block pointers.  It consumes
1601185029Spjd * only one sector and hence is allocatable regardless of fragmentation.
1602185029Spjd * The gang header's bps point to its gang members, which hold the data.
1603185029Spjd *
1604185029Spjd * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1605185029Spjd * as the verifier to ensure uniqueness of the SHA256 checksum.
1606185029Spjd * Critically, the gang block bp's blk_cksum is the checksum of the data,
1607185029Spjd * not the gang header.  This ensures that data block signatures (needed for
1608185029Spjd * deduplication) are independent of how the block is physically stored.
1609185029Spjd *
1610185029Spjd * Gang blocks can be nested: a gang member may itself be a gang block.
1611185029Spjd * Thus every gang block is a tree in which root and all interior nodes are
1612185029Spjd * gang headers, and the leaves are normal blocks that contain user data.
1613185029Spjd * The root of the gang tree is called the gang leader.
1614185029Spjd *
1615185029Spjd * To perform any operation (read, rewrite, free, claim) on a gang block,
1616185029Spjd * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1617185029Spjd * in the io_gang_tree field of the original logical i/o by recursively
1618185029Spjd * reading the gang leader and all gang headers below it.  This yields
1619185029Spjd * an in-core tree containing the contents of every gang header and the
1620185029Spjd * bps for every constituent of the gang block.
1621185029Spjd *
1622185029Spjd * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1623185029Spjd * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1624185029Spjd * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1625185029Spjd * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1626185029Spjd * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1627185029Spjd * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1628185029Spjd * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1629185029Spjd * of the gang header plus zio_checksum_compute() of the data to update the
1630185029Spjd * gang header's blk_cksum as described above.
1631185029Spjd *
1632185029Spjd * The two-phase assemble/issue model solves the problem of partial failure --
1633185029Spjd * what if you'd freed part of a gang block but then couldn't read the
1634185029Spjd * gang header for another part?  Assembling the entire gang tree first
1635185029Spjd * ensures that all the necessary gang header I/O has succeeded before
1636185029Spjd * starting the actual work of free, claim, or write.  Once the gang tree
1637185029Spjd * is assembled, free and claim are in-memory operations that cannot fail.
1638185029Spjd *
1639185029Spjd * In the event that a gang write fails, zio_dva_unallocate() walks the
1640185029Spjd * gang tree to immediately free (i.e. insert back into the space map)
1641185029Spjd * everything we've allocated.  This ensures that we don't get ENOSPC
1642185029Spjd * errors during repeated suspend/resume cycles due to a flaky device.
1643185029Spjd *
1644185029Spjd * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1645185029Spjd * the gang tree, we won't modify the block, so we can safely defer the free
1646185029Spjd * (knowing that the block is still intact).  If we *can* assemble the gang
1647185029Spjd * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1648185029Spjd * each constituent bp and we can allocate a new block on the next sync pass.
1649185029Spjd *
1650185029Spjd * In all cases, the gang tree allows complete recovery from partial failure.
1651168404Spjd * ==========================================================================
1652168404Spjd */
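
/*
 * A small illustration of the shape described above (layout
 * hypothetical): a two-level gang tree storing data blocks D1..D4,
 *
 *	    GH0 (gang leader)
 *	   /  |  \
 *	 D1  GH1  D4
 *	     / \
 *	   D2   D3
 *
 * GH0 and GH1 are one-sector gang headers; D1..D4 are ordinary data
 * blocks.  zio_gang_assemble() reads GH0 and GH1 into io_gang_tree;
 * zio_gang_issue() then walks that tree, applying the per-type
 * callback below to every bp.
 */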
1653185029Spjd
1654185029Spjdstatic zio_t *
1655185029Spjdzio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1656168404Spjd{
1657185029Spjd	if (gn != NULL)
1658185029Spjd		return (pio);
1659168404Spjd
1660185029Spjd	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1661185029Spjd	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1662185029Spjd	    &pio->io_bookmark));
1663168404Spjd}
1664168404Spjd
1665185029Spjdzio_t *
1666185029Spjdzio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1667168404Spjd{
1668185029Spjd	zio_t *zio;
1669168404Spjd
1670185029Spjd	if (gn != NULL) {
1671185029Spjd		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1672185029Spjd		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1673185029Spjd		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1674185029Spjd		/*
1675185029Spjd		 * As we rewrite each gang header, the pipeline will compute
1676185029Spjd		 * a new gang block header checksum for it; but no one will
1677185029Spjd		 * compute a new data checksum, so we do that here.  The one
1678185029Spjd		 * exception is the gang leader: the pipeline already computed
1679185029Spjd		 * its data checksum because that stage precedes gang assembly.
1680185029Spjd		 * (Presently, nothing actually uses interior data checksums;
1681185029Spjd		 * this is just good hygiene.)
1682185029Spjd		 */
1683209962Smm		if (gn != pio->io_gang_leader->io_gang_tree) {
1684185029Spjd			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1685185029Spjd			    data, BP_GET_PSIZE(bp));
1686185029Spjd		}
1687219089Spjd		/*
1688219089Spjd		 * If we are here to damage data for testing purposes,
1689219089Spjd		 * leave the GBH alone so that we can detect the damage.
1690219089Spjd		 */
1691219089Spjd		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1692219089Spjd			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1693185029Spjd	} else {
1694185029Spjd		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1695185029Spjd		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1696185029Spjd		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1697185029Spjd	}
1698185029Spjd
1699185029Spjd	return (zio);
1700168404Spjd}
1701168404Spjd
1702185029Spjd/* ARGSUSED */
1703185029Spjdzio_t *
1704185029Spjdzio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1705168404Spjd{
1706219089Spjd	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1707240868Spjd	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
1708219089Spjd	    ZIO_GANG_CHILD_FLAGS(pio)));
1709185029Spjd}
1710168404Spjd
1711185029Spjd/* ARGSUSED */
1712185029Spjdzio_t *
1713185029Spjdzio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1714185029Spjd{
1715185029Spjd	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1716185029Spjd	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1717185029Spjd}
1718168404Spjd
1719185029Spjdstatic zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1720185029Spjd	NULL,
1721185029Spjd	zio_read_gang,
1722185029Spjd	zio_rewrite_gang,
1723185029Spjd	zio_free_gang,
1724185029Spjd	zio_claim_gang,
1725185029Spjd	NULL
1726185029Spjd};
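
/*
 * The table above is indexed by zio_type_t; the NULL slots correspond
 * to ZIO_TYPE_NULL and ZIO_TYPE_IOCTL, which never operate on gang
 * blocks, so zio_gang_tree_issue() below only ever dispatches through
 * the read/rewrite/free/claim entries.
 */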
1727168404Spjd
1728185029Spjdstatic void zio_gang_tree_assemble_done(zio_t *zio);
1729168404Spjd
1730185029Spjdstatic zio_gang_node_t *
1731185029Spjdzio_gang_node_alloc(zio_gang_node_t **gnpp)
1732185029Spjd{
1733185029Spjd	zio_gang_node_t *gn;
1734185029Spjd
1735185029Spjd	ASSERT(*gnpp == NULL);
1736185029Spjd
1737185029Spjd	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1738185029Spjd	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1739185029Spjd	*gnpp = gn;
1740185029Spjd
1741185029Spjd	return (gn);
1742168404Spjd}
1743168404Spjd
1744168404Spjdstatic void
1745185029Spjdzio_gang_node_free(zio_gang_node_t **gnpp)
1746168404Spjd{
1747185029Spjd	zio_gang_node_t *gn = *gnpp;
1748168404Spjd
1749185029Spjd	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1750185029Spjd		ASSERT(gn->gn_child[g] == NULL);
1751168404Spjd
1752185029Spjd	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1753185029Spjd	kmem_free(gn, sizeof (*gn));
1754185029Spjd	*gnpp = NULL;
1755185029Spjd}
1756168404Spjd
1757185029Spjdstatic void
1758185029Spjdzio_gang_tree_free(zio_gang_node_t **gnpp)
1759185029Spjd{
1760185029Spjd	zio_gang_node_t *gn = *gnpp;
1761168404Spjd
1762185029Spjd	if (gn == NULL)
1763185029Spjd		return;
1764168404Spjd
1765185029Spjd	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1766185029Spjd		zio_gang_tree_free(&gn->gn_child[g]);
1767168404Spjd
1768185029Spjd	zio_gang_node_free(gnpp);
1769168404Spjd}
1770168404Spjd
1771168404Spjdstatic void
1772209962Smmzio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1773168404Spjd{
1774185029Spjd	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1775168404Spjd
1776209962Smm	ASSERT(gio->io_gang_leader == gio);
1777185029Spjd	ASSERT(BP_IS_GANG(bp));
1778168404Spjd
1779209962Smm	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1780185029Spjd	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1781209962Smm	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1782185029Spjd}
1783168404Spjd
1784185029Spjdstatic void
1785185029Spjdzio_gang_tree_assemble_done(zio_t *zio)
1786185029Spjd{
1787209962Smm	zio_t *gio = zio->io_gang_leader;
1788185029Spjd	zio_gang_node_t *gn = zio->io_private;
1789185029Spjd	blkptr_t *bp = zio->io_bp;
1790168404Spjd
1791209962Smm	ASSERT(gio == zio_unique_parent(zio));
1792219089Spjd	ASSERT(zio->io_child_count == 0);
1793168404Spjd
1794185029Spjd	if (zio->io_error)
1795185029Spjd		return;
1796168404Spjd
1797185029Spjd	if (BP_SHOULD_BYTESWAP(bp))
1798185029Spjd		byteswap_uint64_array(zio->io_data, zio->io_size);
1799185029Spjd
1800185029Spjd	ASSERT(zio->io_data == gn->gn_gbh);
1801185029Spjd	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1802219089Spjd	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1803185029Spjd
1804185029Spjd	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1805185029Spjd		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1806185029Spjd		if (!BP_IS_GANG(gbp))
1807185029Spjd			continue;
1808209962Smm		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1809168404Spjd	}
1810168404Spjd}
1811168404Spjd
1812168404Spjdstatic void
1813185029Spjdzio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1814168404Spjd{
1815209962Smm	zio_t *gio = pio->io_gang_leader;
1816185029Spjd	zio_t *zio;
1817168404Spjd
1818185029Spjd	ASSERT(BP_IS_GANG(bp) == !!gn);
1819209962Smm	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1820209962Smm	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1821168404Spjd
1822185029Spjd	/*
1823185029Spjd	 * If you're a gang header, your data is in gn->gn_gbh.
1824185029Spjd	 * If you're a gang member, your data is in 'data' and gn == NULL.
1825185029Spjd	 */
1826209962Smm	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1827168404Spjd
1828185029Spjd	if (gn != NULL) {
1829219089Spjd		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1830168404Spjd
1831185029Spjd		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1832185029Spjd			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1833185029Spjd			if (BP_IS_HOLE(gbp))
1834185029Spjd				continue;
1835185029Spjd			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1836185029Spjd			data = (char *)data + BP_GET_PSIZE(gbp);
1837185029Spjd		}
1838168404Spjd	}
1839168404Spjd
1840240868Spjd	if (gn == gio->io_gang_tree && gio->io_data != NULL)
1841209962Smm		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1842185029Spjd
1843185029Spjd	if (zio != pio)
1844185029Spjd		zio_nowait(zio);
1845168404Spjd}
1846168404Spjd
1847185029Spjdstatic int
1848270312Ssmhzio_gang_assemble(zio_t *zio)
1849168404Spjd{
1850185029Spjd	blkptr_t *bp = zio->io_bp;
1851168404Spjd
1852209962Smm	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1853209962Smm	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1854168404Spjd
1855209962Smm	zio->io_gang_leader = zio;
1856209962Smm
1857185029Spjd	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1858168404Spjd
1859185029Spjd	return (ZIO_PIPELINE_CONTINUE);
1860185029Spjd}
1861168404Spjd
1862185029Spjdstatic int
1863270312Ssmhzio_gang_issue(zio_t *zio)
1864185029Spjd{
1865185029Spjd	blkptr_t *bp = zio->io_bp;
1866185029Spjd
1867185029Spjd	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1868185029Spjd		return (ZIO_PIPELINE_STOP);
1869185029Spjd
1870209962Smm	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1871209962Smm	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1872185029Spjd
1873185029Spjd	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1874209962Smm		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1875185029Spjd	else
1876209962Smm		zio_gang_tree_free(&zio->io_gang_tree);
1877185029Spjd
1878185029Spjd	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1879185029Spjd
1880185029Spjd	return (ZIO_PIPELINE_CONTINUE);
1881168404Spjd}
1882168404Spjd
1883168404Spjdstatic void
1884185029Spjdzio_write_gang_member_ready(zio_t *zio)
1885168404Spjd{
1886209962Smm	zio_t *pio = zio_unique_parent(zio);
1887209962Smm	zio_t *gio = zio->io_gang_leader;
1888168404Spjd	dva_t *cdva = zio->io_bp->blk_dva;
1889168404Spjd	dva_t *pdva = pio->io_bp->blk_dva;
1890168404Spjd	uint64_t asize;
1891168404Spjd
1892185029Spjd	if (BP_IS_HOLE(zio->io_bp))
1893185029Spjd		return;
1894185029Spjd
1895185029Spjd	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1896185029Spjd
1897185029Spjd	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1898219089Spjd	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1899219089Spjd	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1900219089Spjd	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1901168404Spjd	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1902168404Spjd
1903168404Spjd	mutex_enter(&pio->io_lock);
1904185029Spjd	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1905168404Spjd		ASSERT(DVA_GET_GANG(&pdva[d]));
1906168404Spjd		asize = DVA_GET_ASIZE(&pdva[d]);
1907168404Spjd		asize += DVA_GET_ASIZE(&cdva[d]);
1908168404Spjd		DVA_SET_ASIZE(&pdva[d], asize);
1909168404Spjd	}
1910168404Spjd	mutex_exit(&pio->io_lock);
1911168404Spjd}
1912168404Spjd
1913185029Spjdstatic int
1914185029Spjdzio_write_gang_block(zio_t *pio)
1915168404Spjd{
1916185029Spjd	spa_t *spa = pio->io_spa;
1917185029Spjd	blkptr_t *bp = pio->io_bp;
1918209962Smm	zio_t *gio = pio->io_gang_leader;
1919185029Spjd	zio_t *zio;
1920185029Spjd	zio_gang_node_t *gn, **gnpp;
1921168404Spjd	zio_gbh_phys_t *gbh;
1922185029Spjd	uint64_t txg = pio->io_txg;
1923185029Spjd	uint64_t resid = pio->io_size;
1924185029Spjd	uint64_t lsize;
1925219089Spjd	int copies = gio->io_prop.zp_copies;
1926219089Spjd	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
1927185029Spjd	zio_prop_t zp;
1928168404Spjd	int error;
1929168404Spjd
1930219089Spjd	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
1931219089Spjd	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
1932185029Spjd	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1933185029Spjd	if (error) {
1934185029Spjd		pio->io_error = error;
1935185029Spjd		return (ZIO_PIPELINE_CONTINUE);
1936185029Spjd	}
1937168404Spjd
1938209962Smm	if (pio == gio) {
1939209962Smm		gnpp = &gio->io_gang_tree;
1940185029Spjd	} else {
1941185029Spjd		gnpp = pio->io_private;
1942185029Spjd		ASSERT(pio->io_ready == zio_write_gang_member_ready);
1943185029Spjd	}
1944168404Spjd
1945185029Spjd	gn = zio_gang_node_alloc(gnpp);
1946185029Spjd	gbh = gn->gn_gbh;
1947185029Spjd	bzero(gbh, SPA_GANGBLOCKSIZE);
1948168404Spjd
1949185029Spjd	/*
1950185029Spjd	 * Create the gang header.
1951185029Spjd	 */
1952185029Spjd	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1953185029Spjd	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1954168404Spjd
1955185029Spjd	/*
1956185029Spjd	 * Create and nowait the gang children.
1957185029Spjd	 */
1958185029Spjd	for (int g = 0; resid != 0; resid -= lsize, g++) {
1959185029Spjd		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1960185029Spjd		    SPA_MINBLOCKSIZE);
1961185029Spjd		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1962168404Spjd
1963209962Smm		zp.zp_checksum = gio->io_prop.zp_checksum;
1964185029Spjd		zp.zp_compress = ZIO_COMPRESS_OFF;
1965185029Spjd		zp.zp_type = DMU_OT_NONE;
1966185029Spjd		zp.zp_level = 0;
1967219089Spjd		zp.zp_copies = gio->io_prop.zp_copies;
1968243524Smm		zp.zp_dedup = B_FALSE;
1969243524Smm		zp.zp_dedup_verify = B_FALSE;
1970243524Smm		zp.zp_nopwrite = B_FALSE;
1971168404Spjd
1972185029Spjd		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1973185029Spjd		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1974260763Savg		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
1975185029Spjd		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1976185029Spjd		    &pio->io_bookmark));
1977168404Spjd	}
1978168404Spjd
1979185029Spjd	/*
1980185029Spjd	 * Set pio's pipeline to just wait for zio to finish.
1981185029Spjd	 */
1982185029Spjd	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1983168404Spjd
1984185029Spjd	zio_nowait(zio);
1985168404Spjd
1986185029Spjd	return (ZIO_PIPELINE_CONTINUE);
1987168404Spjd}
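
/*
 * Worked example of the member-size carve above (sizes illustrative):
 * for resid == 96K and SPA_GBH_NBLKPTRS == 3,
 *
 *	g == 0: lsize = P2ROUNDUP(96K / 3, 512) = 32K; resid -> 64K
 *	g == 1: lsize = P2ROUNDUP(64K / 2, 512) = 32K; resid -> 32K
 *	g == 2: lsize = P2ROUNDUP(32K / 1, 512) = 32K; resid -> 0
 *
 * Each pass gives the next member an even share of what remains,
 * rounded up to a 512-byte multiple; with sizes that don't divide
 * evenly, the earlier members round up and the last member takes
 * whatever is left.
 */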
1988168404Spjd
1989168404Spjd/*
1990243524Smm * The zio_nop_write stage in the pipeline determines if allocating
1991243524Smm * a new bp is necessary.  By leveraging a cryptographically secure checksum,
1992243524Smm * such as SHA256, we can compare the checksums of the new data and the old
1993243524Smm * to determine if allocating a new block is required.  The nopwrite
1994243524Smm * feature can handle writes in either syncing or open context (e.g. ZIL
1995243524Smm * writes) and as a result is mutually exclusive with dedup.
1996243524Smm */
1997243524Smmstatic int
1998270312Ssmhzio_nop_write(zio_t *zio)
1999243524Smm{
2000243524Smm	blkptr_t *bp = zio->io_bp;
2001243524Smm	blkptr_t *bp_orig = &zio->io_bp_orig;
2002243524Smm	zio_prop_t *zp = &zio->io_prop;
2003243524Smm
2004243524Smm	ASSERT(BP_GET_LEVEL(bp) == 0);
2005243524Smm	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
2006243524Smm	ASSERT(zp->zp_nopwrite);
2007243524Smm	ASSERT(!zp->zp_dedup);
2008243524Smm	ASSERT(zio->io_bp_override == NULL);
2009243524Smm	ASSERT(IO_IS_ALLOCATING(zio));
2010243524Smm
2011243524Smm	/*
2012243524Smm	 * Check to see if the original bp and the new bp have matching
2013243524Smm	 * characteristics (i.e. same checksum, compression algorithms, etc.).
2014243524Smm	 * If they don't, then just continue with the pipeline, which will
2015243524Smm	 * allocate a new bp.
2016243524Smm	 */
2017243524Smm	if (BP_IS_HOLE(bp_orig) ||
2018243524Smm	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
2019243524Smm	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2020243524Smm	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2021243524Smm	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2022243524Smm	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
2023243524Smm		return (ZIO_PIPELINE_CONTINUE);
2024243524Smm
2025243524Smm	/*
2026243524Smm	 * If the checksums match then reset the pipeline so that we
2027243524Smm	 * avoid allocating a new bp and issuing any I/O.
2028243524Smm	 */
2029243524Smm	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2030243524Smm		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
2031243524Smm		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2032243524Smm		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2033243524Smm		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2034243524Smm		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2035243524Smm		    sizeof (uint64_t)) == 0);
2036243524Smm
2037243524Smm		*bp = *bp_orig;
2038243524Smm		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2039243524Smm		zio->io_flags |= ZIO_FLAG_NOPWRITE;
2040243524Smm	}
2041243524Smm
2042243524Smm	return (ZIO_PIPELINE_CONTINUE);
2043243524Smm}
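
/*
 * Usage note (illustrative, not exhaustive): nopwrite typically fires
 * when an application rewrites a block with byte-identical contents on
 * a dataset whose checksum is dedup-strength and whose compression is
 * enabled, e.g. (hypothetical dataset settings)
 *
 *	zfs set checksum=sha256 compression=lz4 pool/fs
 *
 * in which case the matching blk_cksum above lets us keep bp_orig and
 * skip both the allocation and the device I/O.
 */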
2044243524Smm
2045243524Smm/*
2046168404Spjd * ==========================================================================
2047219089Spjd * Dedup
2048168404Spjd * ==========================================================================
2049168404Spjd */
2050219089Spjdstatic void
2051219089Spjdzio_ddt_child_read_done(zio_t *zio)
2052219089Spjd{
2053219089Spjd	blkptr_t *bp = zio->io_bp;
2054219089Spjd	ddt_entry_t *dde = zio->io_private;
2055219089Spjd	ddt_phys_t *ddp;
2056219089Spjd	zio_t *pio = zio_unique_parent(zio);
2057185029Spjd
2058219089Spjd	mutex_enter(&pio->io_lock);
2059219089Spjd	ddp = ddt_phys_select(dde, bp);
2060219089Spjd	if (zio->io_error == 0)
2061219089Spjd		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
2062219089Spjd	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2063219089Spjd		dde->dde_repair_data = zio->io_data;
2064219089Spjd	else
2065219089Spjd		zio_buf_free(zio->io_data, zio->io_size);
2066219089Spjd	mutex_exit(&pio->io_lock);
2067219089Spjd}
2068219089Spjd
2069185029Spjdstatic int
2070270312Ssmhzio_ddt_read_start(zio_t *zio)
2071219089Spjd{
2072219089Spjd	blkptr_t *bp = zio->io_bp;
2073219089Spjd
2074219089Spjd	ASSERT(BP_GET_DEDUP(bp));
2075219089Spjd	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2076219089Spjd	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2077219089Spjd
2078219089Spjd	if (zio->io_child_error[ZIO_CHILD_DDT]) {
2079219089Spjd		ddt_t *ddt = ddt_select(zio->io_spa, bp);
2080219089Spjd		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2081219089Spjd		ddt_phys_t *ddp = dde->dde_phys;
2082219089Spjd		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2083219089Spjd		blkptr_t blk;
2084219089Spjd
2085219089Spjd		ASSERT(zio->io_vsd == NULL);
2086219089Spjd		zio->io_vsd = dde;
2087219089Spjd
2088219089Spjd		if (ddp_self == NULL)
2089219089Spjd			return (ZIO_PIPELINE_CONTINUE);
2090219089Spjd
2091219089Spjd		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2092219089Spjd			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2093219089Spjd				continue;
2094219089Spjd			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2095219089Spjd			    &blk);
2096219089Spjd			zio_nowait(zio_read(zio, zio->io_spa, &blk,
2097219089Spjd			    zio_buf_alloc(zio->io_size), zio->io_size,
2098219089Spjd			    zio_ddt_child_read_done, dde, zio->io_priority,
2099219089Spjd			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2100219089Spjd			    &zio->io_bookmark));
2101219089Spjd		}
2102219089Spjd		return (ZIO_PIPELINE_CONTINUE);
2103219089Spjd	}
2104219089Spjd
2105219089Spjd	zio_nowait(zio_read(zio, zio->io_spa, bp,
2106219089Spjd	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2107219089Spjd	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2108219089Spjd
2109219089Spjd	return (ZIO_PIPELINE_CONTINUE);
2110219089Spjd}
2111219089Spjd
2112219089Spjdstatic int
2113270312Ssmhzio_ddt_read_done(zio_t *zio)
2114219089Spjd{
2115219089Spjd	blkptr_t *bp = zio->io_bp;
2116219089Spjd
2117219089Spjd	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2118219089Spjd		return (ZIO_PIPELINE_STOP);
2119219089Spjd
2120219089Spjd	ASSERT(BP_GET_DEDUP(bp));
2121219089Spjd	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2122219089Spjd	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2123219089Spjd
2124219089Spjd	if (zio->io_child_error[ZIO_CHILD_DDT]) {
2125219089Spjd		ddt_t *ddt = ddt_select(zio->io_spa, bp);
2126219089Spjd		ddt_entry_t *dde = zio->io_vsd;
2127219089Spjd		if (ddt == NULL) {
2128219089Spjd			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2129219089Spjd			return (ZIO_PIPELINE_CONTINUE);
2130219089Spjd		}
2131219089Spjd		if (dde == NULL) {
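			/*
			 * Retry mechanics (a sketch): io_stage is backed up
			 * to the bit just below ZIO_STAGE_DDT_READ_START so
			 * that the stage-advance loop in zio_execute() lands
			 * on DDT_READ_START again and the read is reissued
			 * from scratch.
			 */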
2132219089Spjd			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2133219089Spjd			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2134219089Spjd			return (ZIO_PIPELINE_STOP);
2135219089Spjd		}
2136219089Spjd		if (dde->dde_repair_data != NULL) {
2137219089Spjd			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2138219089Spjd			zio->io_child_error[ZIO_CHILD_DDT] = 0;
2139219089Spjd		}
2140219089Spjd		ddt_repair_done(ddt, dde);
2141219089Spjd		zio->io_vsd = NULL;
2142219089Spjd	}
2143219089Spjd
2144219089Spjd	ASSERT(zio->io_vsd == NULL);
2145219089Spjd
2146219089Spjd	return (ZIO_PIPELINE_CONTINUE);
2147219089Spjd}
2148219089Spjd
2149219089Spjdstatic boolean_t
2150219089Spjdzio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2151219089Spjd{
2152219089Spjd	spa_t *spa = zio->io_spa;
2153219089Spjd
2154219089Spjd	/*
2155219089Spjd	 * Note: we compare the original data, not the transformed data,
2156219089Spjd	 * because when zio->io_bp is an override bp, we will not have
2157219089Spjd	 * pushed the I/O transforms.  That's an important optimization
2158219089Spjd	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2159219089Spjd	 */
2160219089Spjd	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2161219089Spjd		zio_t *lio = dde->dde_lead_zio[p];
2162219089Spjd
2163219089Spjd		if (lio != NULL) {
2164219089Spjd			return (lio->io_orig_size != zio->io_orig_size ||
2165219089Spjd			    bcmp(zio->io_orig_data, lio->io_orig_data,
2166219089Spjd			    zio->io_orig_size) != 0);
2167219089Spjd		}
2168219089Spjd	}
2169219089Spjd
2170219089Spjd	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2171219089Spjd		ddt_phys_t *ddp = &dde->dde_phys[p];
2172219089Spjd
2173219089Spjd		if (ddp->ddp_phys_birth != 0) {
2174219089Spjd			arc_buf_t *abuf = NULL;
2175219089Spjd			uint32_t aflags = ARC_WAIT;
2176219089Spjd			blkptr_t blk = *zio->io_bp;
2177219089Spjd			int error;
2178219089Spjd
2179219089Spjd			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2180219089Spjd
2181219089Spjd			ddt_exit(ddt);
2182219089Spjd
2183246666Smm			error = arc_read(NULL, spa, &blk,
2184219089Spjd			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2185219089Spjd			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2186219089Spjd			    &aflags, &zio->io_bookmark);
2187219089Spjd
2188219089Spjd			if (error == 0) {
2189219089Spjd				if (arc_buf_size(abuf) != zio->io_orig_size ||
2190219089Spjd				    bcmp(abuf->b_data, zio->io_orig_data,
2191219089Spjd				    zio->io_orig_size) != 0)
2192249195Smm					error = SET_ERROR(EEXIST);
2193248571Smm				VERIFY(arc_buf_remove_ref(abuf, &abuf));
2194219089Spjd			}
2195219089Spjd
2196219089Spjd			ddt_enter(ddt);
2197219089Spjd			return (error != 0);
2198219089Spjd		}
2199219089Spjd	}
2200219089Spjd
2201219089Spjd	return (B_FALSE);
2202219089Spjd}
2203219089Spjd
2204219089Spjdstatic void
2205219089Spjdzio_ddt_child_write_ready(zio_t *zio)
2206219089Spjd{
2207219089Spjd	int p = zio->io_prop.zp_copies;
2208219089Spjd	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2209219089Spjd	ddt_entry_t *dde = zio->io_private;
2210219089Spjd	ddt_phys_t *ddp = &dde->dde_phys[p];
2211219089Spjd	zio_t *pio;
2212219089Spjd
2213219089Spjd	if (zio->io_error)
2214219089Spjd		return;
2215219089Spjd
2216219089Spjd	ddt_enter(ddt);
2217219089Spjd
2218219089Spjd	ASSERT(dde->dde_lead_zio[p] == zio);
2219219089Spjd
2220219089Spjd	ddt_phys_fill(ddp, zio->io_bp);
2221219089Spjd
2222219089Spjd	while ((pio = zio_walk_parents(zio)) != NULL)
2223219089Spjd		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2224219089Spjd
2225219089Spjd	ddt_exit(ddt);
2226219089Spjd}
2227219089Spjd
2228219089Spjdstatic void
2229219089Spjdzio_ddt_child_write_done(zio_t *zio)
2230219089Spjd{
2231219089Spjd	int p = zio->io_prop.zp_copies;
2232219089Spjd	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2233219089Spjd	ddt_entry_t *dde = zio->io_private;
2234219089Spjd	ddt_phys_t *ddp = &dde->dde_phys[p];
2235219089Spjd
2236219089Spjd	ddt_enter(ddt);
2237219089Spjd
2238219089Spjd	ASSERT(ddp->ddp_refcnt == 0);
2239219089Spjd	ASSERT(dde->dde_lead_zio[p] == zio);
2240219089Spjd	dde->dde_lead_zio[p] = NULL;
2241219089Spjd
2242219089Spjd	if (zio->io_error == 0) {
2243219089Spjd		while (zio_walk_parents(zio) != NULL)
2244219089Spjd			ddt_phys_addref(ddp);
2245219089Spjd	} else {
2246219089Spjd		ddt_phys_clear(ddp);
2247219089Spjd	}
2248219089Spjd
2249219089Spjd	ddt_exit(ddt);
2250219089Spjd}
2251219089Spjd
2252219089Spjdstatic void
2253219089Spjdzio_ddt_ditto_write_done(zio_t *zio)
2254219089Spjd{
2255219089Spjd	int p = DDT_PHYS_DITTO;
2256219089Spjd	zio_prop_t *zp = &zio->io_prop;
2257219089Spjd	blkptr_t *bp = zio->io_bp;
2258219089Spjd	ddt_t *ddt = ddt_select(zio->io_spa, bp);
2259219089Spjd	ddt_entry_t *dde = zio->io_private;
2260219089Spjd	ddt_phys_t *ddp = &dde->dde_phys[p];
2261219089Spjd	ddt_key_t *ddk = &dde->dde_key;
2262219089Spjd
2263219089Spjd	ddt_enter(ddt);
2264219089Spjd
2265219089Spjd	ASSERT(ddp->ddp_refcnt == 0);
2266219089Spjd	ASSERT(dde->dde_lead_zio[p] == zio);
2267219089Spjd	dde->dde_lead_zio[p] = NULL;
2268219089Spjd
2269219089Spjd	if (zio->io_error == 0) {
2270219089Spjd		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2271219089Spjd		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2272219089Spjd		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2273219089Spjd		if (ddp->ddp_phys_birth != 0)
2274219089Spjd			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2275219089Spjd		ddt_phys_fill(ddp, bp);
2276219089Spjd	}
2277219089Spjd
2278219089Spjd	ddt_exit(ddt);
2279219089Spjd}
2280219089Spjd
2281219089Spjdstatic int
2282270312Ssmhzio_ddt_write(zio_t *zio)
2283219089Spjd{
2284219089Spjd	spa_t *spa = zio->io_spa;
2285219089Spjd	blkptr_t *bp = zio->io_bp;
2286219089Spjd	uint64_t txg = zio->io_txg;
2287219089Spjd	zio_prop_t *zp = &zio->io_prop;
2288219089Spjd	int p = zp->zp_copies;
2289219089Spjd	int ditto_copies;
2290219089Spjd	zio_t *cio = NULL;
2291219089Spjd	zio_t *dio = NULL;
2292219089Spjd	ddt_t *ddt = ddt_select(spa, bp);
2293219089Spjd	ddt_entry_t *dde;
2294219089Spjd	ddt_phys_t *ddp;
2295219089Spjd
2296219089Spjd	ASSERT(BP_GET_DEDUP(bp));
2297219089Spjd	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2298219089Spjd	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2299219089Spjd
2300219089Spjd	ddt_enter(ddt);
2301219089Spjd	dde = ddt_lookup(ddt, bp, B_TRUE);
2302219089Spjd	ddp = &dde->dde_phys[p];
2303219089Spjd
2304219089Spjd	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2305219089Spjd		/*
2306219089Spjd		 * If we're using a weak checksum, upgrade to a strong checksum
2307219089Spjd		 * and try again.  If we're already using a strong checksum,
2308219089Spjd		 * we can't resolve it, so just convert to an ordinary write.
2309219089Spjd		 * (And automatically e-mail a paper to Nature?)
2310219089Spjd		 */
2311219089Spjd		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2312219089Spjd			zp->zp_checksum = spa_dedup_checksum(spa);
2313219089Spjd			zio_pop_transforms(zio);
2314219089Spjd			zio->io_stage = ZIO_STAGE_OPEN;
2315219089Spjd			BP_ZERO(bp);
2316219089Spjd		} else {
2317243524Smm			zp->zp_dedup = B_FALSE;
2318219089Spjd		}
2319219089Spjd		zio->io_pipeline = ZIO_WRITE_PIPELINE;
2320219089Spjd		ddt_exit(ddt);
2321219089Spjd		return (ZIO_PIPELINE_CONTINUE);
2322219089Spjd	}
2323219089Spjd
2324219089Spjd	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2325219089Spjd	ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2326219089Spjd
2327219089Spjd	if (ditto_copies > ddt_ditto_copies_present(dde) &&
2328219089Spjd	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2329219089Spjd		zio_prop_t czp = *zp;
2330219089Spjd
2331219089Spjd		czp.zp_copies = ditto_copies;
2332219089Spjd
2333219089Spjd		/*
2334219089Spjd		 * If we arrived here with an override bp, we won't have run
2335219089Spjd		 * the transform stack, so we won't have the data we need to
2336219089Spjd		 * generate a child i/o.  So, toss the override bp and restart.
2337219089Spjd		 * This is safe, because using the override bp is just an
2338219089Spjd		 * optimization; and it's rare, so the cost doesn't matter.
2339219089Spjd		 */
2340219089Spjd		if (zio->io_bp_override) {
2341219089Spjd			zio_pop_transforms(zio);
2342219089Spjd			zio->io_stage = ZIO_STAGE_OPEN;
2343219089Spjd			zio->io_pipeline = ZIO_WRITE_PIPELINE;
2344219089Spjd			zio->io_bp_override = NULL;
2345219089Spjd			BP_ZERO(bp);
2346219089Spjd			ddt_exit(ddt);
2347219089Spjd			return (ZIO_PIPELINE_CONTINUE);
2348219089Spjd		}
2349219089Spjd
2350219089Spjd		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2351260763Savg		    zio->io_orig_size, &czp, NULL, NULL,
2352219089Spjd		    zio_ddt_ditto_write_done, dde, zio->io_priority,
2353219089Spjd		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2354219089Spjd
2355219089Spjd		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2356219089Spjd		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2357219089Spjd	}
2358219089Spjd
2359219089Spjd	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2360219089Spjd		if (ddp->ddp_phys_birth != 0)
2361219089Spjd			ddt_bp_fill(ddp, bp, txg);
2362219089Spjd		if (dde->dde_lead_zio[p] != NULL)
2363219089Spjd			zio_add_child(zio, dde->dde_lead_zio[p]);
2364219089Spjd		else
2365219089Spjd			ddt_phys_addref(ddp);
2366219089Spjd	} else if (zio->io_bp_override) {
2367219089Spjd		ASSERT(bp->blk_birth == txg);
2368219089Spjd		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2369219089Spjd		ddt_phys_fill(ddp, bp);
2370219089Spjd		ddt_phys_addref(ddp);
2371219089Spjd	} else {
2372219089Spjd		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2373260763Savg		    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
2374219089Spjd		    zio_ddt_child_write_done, dde, zio->io_priority,
2375219089Spjd		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2376219089Spjd
2377219089Spjd		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2378219089Spjd		dde->dde_lead_zio[p] = cio;
2379219089Spjd	}
2380219089Spjd
2381219089Spjd	ddt_exit(ddt);
2382219089Spjd
2383219089Spjd	if (cio)
2384219089Spjd		zio_nowait(cio);
2385219089Spjd	if (dio)
2386219089Spjd		zio_nowait(dio);
2387219089Spjd
2388219089Spjd	return (ZIO_PIPELINE_CONTINUE);
2389219089Spjd}
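
/*
 * To summarize the three write outcomes above (a descriptive sketch):
 *
 *	1. The block already exists on disk (ddp_phys_birth != 0) or a
 *	   leading write is in flight: fill the bp from the existing
 *	   entry and either take a ddp reference or become the lead
 *	   zio's child.
 *	2. An override bp already describes this txg's copy: just fill
 *	   the ddt phys and take a reference.
 *	3. Otherwise: issue a child write ("cio") that will populate
 *	   the entry via zio_ddt_child_write_ready()/_done().
 *
 * The optional ditto write ("dio") runs alongside any of these when
 * the entry's reference count calls for extra redundancy.
 */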
2390219089Spjd
2391219089Spjdddt_entry_t *freedde; /* for debugging */
2392219089Spjd
2393219089Spjdstatic int
2394270312Ssmhzio_ddt_free(zio_t *zio)
2395219089Spjd{
2396219089Spjd	spa_t *spa = zio->io_spa;
2397219089Spjd	blkptr_t *bp = zio->io_bp;
2398219089Spjd	ddt_t *ddt = ddt_select(spa, bp);
2399219089Spjd	ddt_entry_t *dde;
2400219089Spjd	ddt_phys_t *ddp;
2401219089Spjd
2402219089Spjd	ASSERT(BP_GET_DEDUP(bp));
2403219089Spjd	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2404219089Spjd
2405219089Spjd	ddt_enter(ddt);
2406219089Spjd	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2407219089Spjd	ddp = ddt_phys_select(dde, bp);
2408219089Spjd	ddt_phys_decref(ddp);
2409219089Spjd	ddt_exit(ddt);
2410219089Spjd
2411219089Spjd	return (ZIO_PIPELINE_CONTINUE);
2412219089Spjd}
2413219089Spjd
2414219089Spjd/*
2415219089Spjd * ==========================================================================
2416219089Spjd * Allocate and free blocks
2417219089Spjd * ==========================================================================
2418219089Spjd */
2419219089Spjdstatic int
2420270312Ssmhzio_dva_allocate(zio_t *zio)
2421168404Spjd{
2422185029Spjd	spa_t *spa = zio->io_spa;
2423219089Spjd	metaslab_class_t *mc = spa_normal_class(spa);
2424168404Spjd	blkptr_t *bp = zio->io_bp;
2425168404Spjd	int error;
2426224177Smm	int flags = 0;
2427168404Spjd
2428209962Smm	if (zio->io_gang_leader == NULL) {
2429209962Smm		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2430209962Smm		zio->io_gang_leader = zio;
2431209962Smm	}
2432209962Smm
2433168404Spjd	ASSERT(BP_IS_HOLE(bp));
2434240415Smm	ASSERT0(BP_GET_NDVAS(bp));
2435219089Spjd	ASSERT3U(zio->io_prop.zp_copies, >, 0);
2436219089Spjd	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2437168404Spjd	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2438168404Spjd
2439224177Smm	/*
2440224177Smm	 * The dump device does not support gang blocks so allocation on
2441224177Smm	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2442224177Smm	 * the "fast" gang feature.
2443224177Smm	 */
2444224177Smm	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2445224177Smm	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2446224177Smm	    METASLAB_GANG_CHILD : 0;
2447185029Spjd	error = metaslab_alloc(spa, mc, zio->io_size, bp,
2448224177Smm	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2449168404Spjd
2450185029Spjd	if (error) {
2451224177Smm		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2452224177Smm		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2453224177Smm		    error);
2454185029Spjd		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2455185029Spjd			return (zio_write_gang_block(zio));
2456168404Spjd		zio->io_error = error;
2457168404Spjd	}
2458185029Spjd
2459185029Spjd	return (ZIO_PIPELINE_CONTINUE);
2460168404Spjd}
2461168404Spjd
2462185029Spjdstatic int
2463270312Ssmhzio_dva_free(zio_t *zio)
2464168404Spjd{
2465185029Spjd	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2466168404Spjd
2467185029Spjd	return (ZIO_PIPELINE_CONTINUE);
2468185029Spjd}
2469168404Spjd
2470185029Spjdstatic int
2471270312Ssmhzio_dva_claim(zio_t *zio)
2472185029Spjd{
2473185029Spjd	int error;
2474168404Spjd
2475185029Spjd	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2476185029Spjd	if (error)
2477185029Spjd		zio->io_error = error;
2478185029Spjd
2479185029Spjd	return (ZIO_PIPELINE_CONTINUE);
2480168404Spjd}
2481168404Spjd
2482185029Spjd/*
2483185029Spjd * Undo an allocation.  This is used by zio_done() when an I/O fails
2484185029Spjd * and we want to give back the block we just allocated.
2485185029Spjd * This handles both normal blocks and gang blocks.
2486185029Spjd */
2487168404Spjdstatic void
2488185029Spjdzio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2489168404Spjd{
2490185029Spjd	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2491219089Spjd	ASSERT(zio->io_bp_override == NULL);
2492185029Spjd
2493185029Spjd	if (!BP_IS_HOLE(bp))
2494219089Spjd		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2495185029Spjd
2496185029Spjd	if (gn != NULL) {
2497185029Spjd		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2498185029Spjd			zio_dva_unallocate(zio, gn->gn_child[g],
2499185029Spjd			    &gn->gn_gbh->zg_blkptr[g]);
2500185029Spjd		}
2501185029Spjd	}
2502168404Spjd}
2503168404Spjd
2504168404Spjd/*
2505185029Spjd * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2506185029Spjd */
2507185029Spjdint
2508219089Spjdzio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
2509219089Spjd    uint64_t size, boolean_t use_slog)
2510185029Spjd{
2511219089Spjd	int error = 1;
2512185029Spjd
2513219089Spjd	ASSERT(txg > spa_syncing_txg(spa));
2514185029Spjd
2515230514Smm	/*
2516230514Smm	 * ZIL blocks are always contiguous (i.e. not gang blocks), so we
2517230514Smm	 * set the METASLAB_GANG_AVOID flag to keep them from being
2518230514Smm	 * "fast ganged" when allocated.
2519230514Smm	 */
2520230514Smm	if (use_slog) {
2521219089Spjd		error = metaslab_alloc(spa, spa_log_class(spa), size,
2522230514Smm		    new_bp, 1, txg, old_bp,
2523230514Smm		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2524230514Smm	}
2525219089Spjd
2526230514Smm	if (error) {
2527219089Spjd		error = metaslab_alloc(spa, spa_normal_class(spa), size,
2528230514Smm		    new_bp, 1, txg, old_bp,
2529260768Savg		    METASLAB_HINTBP_AVOID);
2530230514Smm	}
2531185029Spjd
2532185029Spjd	if (error == 0) {
2533185029Spjd		BP_SET_LSIZE(new_bp, size);
2534185029Spjd		BP_SET_PSIZE(new_bp, size);
2535185029Spjd		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2536219089Spjd		BP_SET_CHECKSUM(new_bp,
2537219089Spjd		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2538219089Spjd		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2539185029Spjd		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2540185029Spjd		BP_SET_LEVEL(new_bp, 0);
2541219089Spjd		BP_SET_DEDUP(new_bp, 0);
2542185029Spjd		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2543185029Spjd	}
2544185029Spjd
2545185029Spjd	return (error);
2546185029Spjd}
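
/*
 * Allocation order above, as a sketch: the separate log class is
 * tried first (when use_slog is set), with a fallback to the normal
 * class.  A hypothetical caller (size chosen arbitrarily):
 *
 *	if (zio_alloc_zil(spa, txg, &new_bp, &old_bp, 4096,
 *	    B_TRUE) == 0) {
 *		new_bp now describes a ZILOG/ZILOG2-checksummed
 *		intent-log block
 *	}
 *
 * old_bp is only a placement hint; METASLAB_HINTBP_AVOID steers the
 * new block away from it so that log blocks spread across vdevs.
 */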
2547185029Spjd
2548185029Spjd/*
2549219089Spjd * Free an intent log block.
2550185029Spjd */
2551185029Spjdvoid
2552219089Spjdzio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2553185029Spjd{
2554219089Spjd	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2555185029Spjd	ASSERT(!BP_IS_GANG(bp));
2556185029Spjd
2557219089Spjd	zio_free(spa, txg, bp);
2558185029Spjd}
2559185029Spjd
2560185029Spjd/*
2561168404Spjd * ==========================================================================
2562244187Ssmh * Read, write, and delete (TRIM) I/O to physical devices
2563168404Spjd * ==========================================================================
2564168404Spjd */
2565185029Spjdstatic int
2566270312Ssmhzio_vdev_io_start(zio_t *zio)
2567168404Spjd{
2568168404Spjd	vdev_t *vd = zio->io_vd;
2569168404Spjd	uint64_t align;
2570185029Spjd	spa_t *spa = zio->io_spa;
2571270312Ssmh	int ret;
2572168404Spjd
2573185029Spjd	ASSERT(zio->io_error == 0);
2574185029Spjd	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2575185029Spjd
2576168404Spjd	if (vd == NULL) {
2577185029Spjd		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2578185029Spjd			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2579185029Spjd
2580185029Spjd		/*
2581185029Spjd		 * The mirror_ops handle multiple DVAs in a single BP.
2582185029Spjd		 */
2583185029Spjd		return (vdev_mirror_ops.vdev_op_io_start(zio));
2584168404Spjd	}
2585168404Spjd
2586270312Ssmh	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
2587270312Ssmh	    zio->io_priority == ZIO_PRIORITY_NOW) {
2588248574Ssmh		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
2589240868Spjd		return (ZIO_PIPELINE_CONTINUE);
2590240868Spjd	}
2591240868Spjd
2592219089Spjd	/*
2593219089Spjd	 * We keep track of time-sensitive I/Os so that the scan thread
2594219089Spjd	 * can quickly react to certain workloads.  In particular, we care
2595219089Spjd	 * about non-scrubbing, top-level reads and writes with the following
2596219089Spjd	 * characteristics:
2597219089Spjd	 *	- synchronous writes of user data to non-slog devices
2598219089Spjd	 *	- any reads of user data
2599219089Spjd	 * When these conditions are met, adjust the timestamp of spa_last_io
2600219089Spjd	 * which allows the scan thread to adjust its workload accordingly.
2601219089Spjd	 */
2602219089Spjd	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2603219089Spjd	    vd == vd->vdev_top && !vd->vdev_islog &&
2604219089Spjd	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2605219089Spjd	    zio->io_txg != spa_syncing_txg(spa)) {
2606219089Spjd		uint64_t old = spa->spa_last_io;
2607219089Spjd		uint64_t new = ddi_get_lbolt64();
2608219089Spjd		if (old != new)
2609219089Spjd			(void) atomic_cas_64(&spa->spa_last_io, old, new);
2610219089Spjd	}
2611219089Spjd
2612185029Spjd	align = 1ULL << vd->vdev_top->vdev_ashift;
2613168404Spjd
2614269733Sdelphij	if ((!(zio->io_flags & ZIO_FLAG_PHYSICAL) ||
2615269733Sdelphij	    (vd->vdev_top->vdev_physical_ashift > SPA_MINBLOCKSHIFT)) &&
2616269416Sdelphij	    P2PHASE(zio->io_size, align) != 0) {
2617269416Sdelphij		/* Transform logical writes to be a full physical block size. */
2618168404Spjd		uint64_t asize = P2ROUNDUP(zio->io_size, align);
2619240868Spjd		char *abuf = NULL;
2620240868Spjd		if (zio->io_type == ZIO_TYPE_READ ||
2621240868Spjd		    zio->io_type == ZIO_TYPE_WRITE)
2622240868Spjd			abuf = zio_buf_alloc(asize);
2623185029Spjd		ASSERT(vd == vd->vdev_top);
2624168404Spjd		if (zio->io_type == ZIO_TYPE_WRITE) {
2625168404Spjd			bcopy(zio->io_data, abuf, zio->io_size);
2626168404Spjd			bzero(abuf + zio->io_size, asize - zio->io_size);
2627168404Spjd		}
2628240868Spjd		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
2629240868Spjd		    zio_subblock);
2630168404Spjd	}
2631168404Spjd
2632269416Sdelphij	/*
2633269416Sdelphij	 * If this is not a physical io, make sure that it is properly aligned
2634269416Sdelphij	 * before proceeding.
2635269416Sdelphij	 */
2636269416Sdelphij	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2637269416Sdelphij		ASSERT0(P2PHASE(zio->io_offset, align));
2638269416Sdelphij		ASSERT0(P2PHASE(zio->io_size, align));
2639269416Sdelphij	} else {
2640269416Sdelphij		/*
2641269416Sdelphij		 * For physical writes, we allow 512b aligned writes and assume
2642269416Sdelphij		 * the device will perform a read-modify-write as necessary.
2643269416Sdelphij		 */
2644269416Sdelphij		ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2645269416Sdelphij		ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2646269416Sdelphij	}
2647269416Sdelphij
2648240868Spjd	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
2649168404Spjd
2650209962Smm	/*
2651209962Smm	 * If this is a repair I/O, and there's no self-healing involved --
2652209962Smm	 * that is, we're just resilvering what we expect to resilver --
2653209962Smm	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
2654209962Smm	 * This prevents spurious resilvering with nested replication.
2655209962Smm	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2656209962Smm	 * A is out of date, we'll read from C+D, then use the data to
2657209962Smm	 * resilver A+B -- but we don't actually want to resilver B, just A.
2658209962Smm	 * The top-level mirror has no way to know this, so instead we just
2659209962Smm	 * discard unnecessary repairs as we work our way down the vdev tree.
2660209962Smm	 * The same logic applies to any form of nested replication:
2661209962Smm	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2662209962Smm	 */
2663209962Smm	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2664209962Smm	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2665209962Smm	    zio->io_txg != 0 &&	/* not a delegated i/o */
2666209962Smm	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2667209962Smm		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2668209962Smm		zio_vdev_io_bypass(zio);
2669209962Smm		return (ZIO_PIPELINE_CONTINUE);
2670209962Smm	}
2671209962Smm
2672270312Ssmh	if (vd->vdev_ops->vdev_op_leaf) {
2673270312Ssmh		switch (zio->io_type) {
2674270312Ssmh		case ZIO_TYPE_READ:
2675270312Ssmh			if (vdev_cache_read(zio))
2676270312Ssmh				return (ZIO_PIPELINE_CONTINUE);
2677270312Ssmh			/* FALLTHROUGH */
2678270312Ssmh		case ZIO_TYPE_WRITE:
2679270312Ssmh		case ZIO_TYPE_FREE:
2680270312Ssmh			if ((zio = vdev_queue_io(zio)) == NULL)
2681270312Ssmh				return (ZIO_PIPELINE_STOP);
2682168404Spjd
2683270312Ssmh			if (!vdev_accessible(vd, zio)) {
2684270312Ssmh				zio->io_error = SET_ERROR(ENXIO);
2685270312Ssmh				zio_interrupt(zio);
2686270312Ssmh				return (ZIO_PIPELINE_STOP);
2687270312Ssmh			}
2688270312Ssmh			break;
2689185029Spjd		}
2690270312Ssmh		/*
2691270312Ssmh		 * Note that we ignore repair writes for TRIM because they can
2692270312Ssmh		 * conflict with normal writes. This isn't an issue because, by
2693270312Ssmh		 * definition, we only repair blocks that aren't freed.
2694270312Ssmh		 */
2695270312Ssmh		if (zio->io_type == ZIO_TYPE_WRITE &&
2696270312Ssmh		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2697270312Ssmh		    !trim_map_write_start(zio))
2698240868Spjd			return (ZIO_PIPELINE_STOP);
2699240868Spjd	}
2700240868Spjd
2701270312Ssmh	ret = vd->vdev_ops->vdev_op_io_start(zio);
2702270312Ssmh	ASSERT(ret == ZIO_PIPELINE_STOP);
2703270312Ssmh
2704270312Ssmh	return (ret);
2705168404Spjd}
2706168404Spjd
2707185029Spjdstatic int
2708270312Ssmhzio_vdev_io_done(zio_t *zio)
2709168404Spjd{
2710168404Spjd	vdev_t *vd = zio->io_vd;
2711185029Spjd	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2712185029Spjd	boolean_t unexpected_error = B_FALSE;
2713168404Spjd
2714185029Spjd	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2715185029Spjd		return (ZIO_PIPELINE_STOP);
2716168404Spjd
2717240868Spjd	ASSERT(zio->io_type == ZIO_TYPE_READ ||
2718240868Spjd	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
2719185029Spjd
2720240868Spjd	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2721270312Ssmh	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
2722270312Ssmh	    zio->io_type == ZIO_TYPE_FREE)) {
2723240868Spjd
2724248573Ssmh		if (zio->io_type == ZIO_TYPE_WRITE &&
2725248573Ssmh		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
2726248573Ssmh			trim_map_write_done(zio);
2727248573Ssmh
2728185029Spjd		vdev_queue_io_done(zio);
2729185029Spjd
2730185029Spjd		if (zio->io_type == ZIO_TYPE_WRITE)
2731185029Spjd			vdev_cache_write(zio);
2732185029Spjd
2733185029Spjd		if (zio_injection_enabled && zio->io_error == 0)
2734213198Smm			zio->io_error = zio_handle_device_injection(vd,
2735213198Smm			    zio, EIO);
2736185029Spjd
2737185029Spjd		if (zio_injection_enabled && zio->io_error == 0)
2738185029Spjd			zio->io_error = zio_handle_label_injection(zio, EIO);
2739185029Spjd
2740185029Spjd		if (zio->io_error) {
2741271683Ssmh			if (zio->io_error == ENOTSUP &&
2742271683Ssmh			    zio->io_type == ZIO_TYPE_FREE) {
2743271683Ssmh				/* Not all devices support TRIM. */
2744271683Ssmh			} else if (!vdev_accessible(vd, zio)) {
2745249195Smm				zio->io_error = SET_ERROR(ENXIO);
2746185029Spjd			} else {
2747185029Spjd				unexpected_error = B_TRUE;
2748185029Spjd			}
2749185029Spjd		}
2750185029Spjd	}
2751185029Spjd
2752185029Spjd	ops->vdev_op_io_done(zio);
2753185029Spjd
2754185029Spjd	if (unexpected_error)
2755209962Smm		VERIFY(vdev_probe(vd, zio) == NULL);
2756185029Spjd
2757185029Spjd	return (ZIO_PIPELINE_CONTINUE);
2758168404Spjd}
2759168404Spjd
2760219089Spjd/*
2761219089Spjd * For non-raidz ZIOs, we can just copy aside the bad data read from the
2762219089Spjd * disk, and use that to finish the checksum ereport later.
2763219089Spjd */
2764219089Spjdstatic void
2765219089Spjdzio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2766219089Spjd    const void *good_buf)
2767219089Spjd{
2768219089Spjd	/* no processing needed */
2769219089Spjd	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2770219089Spjd}
2771219089Spjd
2772219089Spjd/*ARGSUSED*/
2773219089Spjdvoid
2774219089Spjdzio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2775219089Spjd{
2776219089Spjd	void *buf = zio_buf_alloc(zio->io_size);
2777219089Spjd
2778219089Spjd	bcopy(zio->io_data, buf, zio->io_size);
2779219089Spjd
2780219089Spjd	zcr->zcr_cbinfo = zio->io_size;
2781219089Spjd	zcr->zcr_cbdata = buf;
2782219089Spjd	zcr->zcr_finish = zio_vsd_default_cksum_finish;
2783219089Spjd	zcr->zcr_free = zio_buf_free;
2784219089Spjd}
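
/*
 * Usage sketch (hypothetical vdev, not taken from this file): a leaf
 * vdev with no private per-zio reconstruction logic can simply point
 * its vsd ops at the defaults, e.g.
 *
 *	static const zio_vsd_ops_t vdev_foo_vsd_ops = {
 *		vdev_foo_vsd_free,		(hypothetical free hook)
 *		zio_vsd_default_cksum_report
 *	};
 *
 * so checksum ereports capture the bad buffer with no extra work.
 */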
2785219089Spjd
2786185029Spjdstatic int
2787270312Ssmhzio_vdev_io_assess(zio_t *zio)
2788168404Spjd{
2789168404Spjd	vdev_t *vd = zio->io_vd;
2790168404Spjd
2791185029Spjd	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2792185029Spjd		return (ZIO_PIPELINE_STOP);
2793168404Spjd
2794185029Spjd	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2795185029Spjd		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2796185029Spjd
2797185029Spjd	if (zio->io_vsd != NULL) {
2798219089Spjd		zio->io_vsd_ops->vsd_free(zio);
2799185029Spjd		zio->io_vsd = NULL;
2800168404Spjd	}
2801168404Spjd
2802185029Spjd	if (zio_injection_enabled && zio->io_error == 0)
2803168404Spjd		zio->io_error = zio_handle_fault_injection(zio, EIO);
2804168404Spjd
2805270312Ssmh	if (zio->io_type == ZIO_TYPE_FREE &&
2806270312Ssmh	    zio->io_priority != ZIO_PRIORITY_NOW) {
2807240868Spjd		switch (zio->io_error) {
2808240868Spjd		case 0:
2809244155Ssmh			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
2810244155Ssmh			ZIO_TRIM_STAT_BUMP(success);
2811240868Spjd			break;
2812240868Spjd		case EOPNOTSUPP:
2813244155Ssmh			ZIO_TRIM_STAT_BUMP(unsupported);
2814240868Spjd			break;
2815240868Spjd		default:
2816244155Ssmh			ZIO_TRIM_STAT_BUMP(failed);
2817240868Spjd			break;
2818240868Spjd		}
2819270312Ssmh	}
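	/*
	 * Usage note (assuming the FreeBSD kstat-to-sysctl bridge): the
	 * counters bumped above typically surface as, e.g.,
	 *
	 *	sysctl kstat.zfs.misc.zio_trim.success
	 *
	 * which is a quick way to confirm whether a pool's devices are
	 * actually servicing TRIM requests.
	 */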
2820240868Spjd
2821168404Spjd	/*
2822168404Spjd	 * If the I/O failed, determine whether we should attempt to retry it.
2823219089Spjd	 *
2824219089Spjd	 * On retry, we cut in line in the issue queue, since we don't want
2825219089Spjd	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2826168404Spjd	 */
2827185029Spjd	if (zio->io_error && vd == NULL &&
2828185029Spjd	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2829185029Spjd		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
2830185029Spjd		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
2831168404Spjd		zio->io_error = 0;
2832185029Spjd		zio->io_flags |= ZIO_FLAG_IO_RETRY |
2833185029Spjd		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2834219089Spjd		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2835219089Spjd		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2836219089Spjd		    zio_requeue_io_start_cut_in_line);
2837185029Spjd		return (ZIO_PIPELINE_STOP);
2838185029Spjd	}
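	/*
	 * A note on the ">> 1" above (and in zio_vdev_io_reissue() and
	 * friends below): pipeline stages are one-hot bits, so shifting
	 * the target stage right by one parks the zio just before that
	 * stage, and the pipeline's next advance re-enters it.  E.g. if
	 * ZIO_STAGE_VDEV_IO_START were 1 << 16 (value illustrative), the
	 * retried zio would be parked at 1 << 15.
	 */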
2839168404Spjd
2840185029Spjd	/*
2841185029Spjd	 * If we got an error on a leaf device, convert it to ENXIO
2842185029Spjd	 * if the device is not accessible at all.
2843185029Spjd	 */
2844185029Spjd	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2845185029Spjd	    !vdev_accessible(vd, zio))
2846249195Smm		zio->io_error = SET_ERROR(ENXIO);
2847168404Spjd
2848185029Spjd	/*
2849185029Spjd	 * If we can't write to an interior vdev (mirror or RAID-Z),
2850185029Spjd	 * set vdev_cant_write so that we stop trying to allocate from it.
2851185029Spjd	 */
2852185029Spjd	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2853248571Smm	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2854185029Spjd		vd->vdev_cant_write = B_TRUE;
2855248571Smm	}
2856168404Spjd
2857185029Spjd	if (zio->io_error)
2858185029Spjd		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2859168404Spjd
2860260763Savg	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2861260763Savg	    zio->io_physdone != NULL) {
2862260763Savg		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2863260763Savg		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2864260763Savg		zio->io_physdone(zio->io_logical);
2865260763Savg	}
2866260763Savg
2867185029Spjd	return (ZIO_PIPELINE_CONTINUE);
2868168404Spjd}
2869168404Spjd
2870168404Spjdvoid
2871168404Spjdzio_vdev_io_reissue(zio_t *zio)
2872168404Spjd{
2873168404Spjd	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2874168404Spjd	ASSERT(zio->io_error == 0);
2875168404Spjd
2876219089Spjd	zio->io_stage >>= 1;
2877168404Spjd}
2878168404Spjd
2879168404Spjdvoid
2880168404Spjdzio_vdev_io_redone(zio_t *zio)
2881168404Spjd{
2882168404Spjd	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2883168404Spjd
2884219089Spjd	zio->io_stage >>= 1;
2885168404Spjd}
2886168404Spjd
2887168404Spjdvoid
2888168404Spjdzio_vdev_io_bypass(zio_t *zio)
2889168404Spjd{
2890168404Spjd	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2891168404Spjd	ASSERT(zio->io_error == 0);
2892168404Spjd
2893168404Spjd	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2894219089Spjd	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2895168404Spjd}
2896168404Spjd
2897168404Spjd/*
2898168404Spjd * ==========================================================================
2899168404Spjd * Generate and verify checksums
2900168404Spjd * ==========================================================================
2901168404Spjd */
2902185029Spjdstatic int
2903270312Ssmhzio_checksum_generate(zio_t *zio)
2904168404Spjd{
2905168404Spjd	blkptr_t *bp = zio->io_bp;
2906185029Spjd	enum zio_checksum checksum;
2907168404Spjd
2908185029Spjd	if (bp == NULL) {
2909185029Spjd		/*
2910185029Spjd		 * This is zio_write_phys().
2911185029Spjd		 * We're either generating a label checksum, or none at all.
2912185029Spjd		 */
2913185029Spjd		checksum = zio->io_prop.zp_checksum;
2914168404Spjd
2915185029Spjd		if (checksum == ZIO_CHECKSUM_OFF)
2916185029Spjd			return (ZIO_PIPELINE_CONTINUE);
2917168404Spjd
2918185029Spjd		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2919185029Spjd	} else {
2920185029Spjd		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2921185029Spjd			ASSERT(!IO_IS_ALLOCATING(zio));
2922185029Spjd			checksum = ZIO_CHECKSUM_GANG_HEADER;
2923185029Spjd		} else {
2924185029Spjd			checksum = BP_GET_CHECKSUM(bp);
2925185029Spjd		}
2926185029Spjd	}
2927168404Spjd
2928185029Spjd	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2929185029Spjd
2930185029Spjd	return (ZIO_PIPELINE_CONTINUE);
2931168404Spjd}
2932168404Spjd
2933185029Spjdstatic int
2934270312Ssmhzio_checksum_verify(zio_t *zio)
2935168404Spjd{
2936219089Spjd	zio_bad_cksum_t info;
2937185029Spjd	blkptr_t *bp = zio->io_bp;
2938185029Spjd	int error;
2939168404Spjd
2940219089Spjd	ASSERT(zio->io_vd != NULL);
2941219089Spjd
2942185029Spjd	if (bp == NULL) {
2943185029Spjd		/*
2944185029Spjd		 * This is zio_read_phys().
2945185029Spjd		 * We're either verifying a label checksum, or nothing at all.
2946185029Spjd		 */
2947185029Spjd		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2948185029Spjd			return (ZIO_PIPELINE_CONTINUE);
2949168404Spjd
2950185029Spjd		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2951185029Spjd	}
2952168404Spjd
2953219089Spjd	if ((error = zio_checksum_error(zio, &info)) != 0) {
2954185029Spjd		zio->io_error = error;
2955185029Spjd		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2956219089Spjd			zfs_ereport_start_checksum(zio->io_spa,
2957219089Spjd			    zio->io_vd, zio, zio->io_offset,
2958219089Spjd			    zio->io_size, NULL, &info);
2959185029Spjd		}
2960168404Spjd	}
2961168404Spjd
2962185029Spjd	return (ZIO_PIPELINE_CONTINUE);
2963168404Spjd}
2964168404Spjd
2965168404Spjd/*
2966168404Spjd * Called by RAID-Z to ensure we don't compute the checksum twice.
2967168404Spjd */
2968168404Spjdvoid
2969168404Spjdzio_checksum_verified(zio_t *zio)
2970168404Spjd{
2971219089Spjd	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2972168404Spjd}
2973168404Spjd
2974168404Spjd/*
2975185029Spjd * ==========================================================================
2976185029Spjd * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2977268649Sdelphij * An error of 0 indicates success.  ENXIO indicates whole-device failure,
2978185029Spjd * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
2979185029Spjd * indicate errors that are specific to one I/O, and most likely permanent.
2980185029Spjd * Any other error is presumed to be worse because we weren't expecting it.
2981185029Spjd * ==========================================================================
2982168404Spjd */
2983185029Spjdint
2984185029Spjdzio_worst_error(int e1, int e2)
2985168404Spjd{
2986185029Spjd	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2987185029Spjd	int r1, r2;
2988168404Spjd
2989185029Spjd	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2990185029Spjd		if (e1 == zio_error_rank[r1])
2991185029Spjd			break;
2992185029Spjd
2993185029Spjd	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2994185029Spjd		if (e2 == zio_error_rank[r2])
2995185029Spjd			break;
2996185029Spjd
2997185029Spjd	return (r1 > r2 ? e1 : e2);
2998168404Spjd}
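
/*
 * Worked examples of the ranking above:
 *
 *	zio_worst_error(0, ENXIO)	== ENXIO
 *	zio_worst_error(ENXIO, ECKSUM)	== ECKSUM
 *	zio_worst_error(EIO, EINVAL)	== EINVAL
 *
 * The last case holds because an error missing from the table falls
 * off the end of zio_error_rank[] and is treated as worst of all.
 */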
2999168404Spjd
3000168404Spjd/*
3001168404Spjd * ==========================================================================
3002185029Spjd * I/O completion
3003168404Spjd * ==========================================================================
3004168404Spjd */
3005185029Spjdstatic int
3006270312Ssmhzio_ready(zio_t *zio)
3007168404Spjd{
3008185029Spjd	blkptr_t *bp = zio->io_bp;
3009209962Smm	zio_t *pio, *pio_next;
3010168404Spjd
3011219089Spjd	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
3012219089Spjd	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
3013209962Smm		return (ZIO_PIPELINE_STOP);
3014209962Smm
3015185029Spjd	if (zio->io_ready) {
3016185029Spjd		ASSERT(IO_IS_ALLOCATING(zio));
3017243524Smm		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
3018243524Smm		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
3019185029Spjd		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
3020168404Spjd
3021185029Spjd		zio->io_ready(zio);
3022168404Spjd	}
3023168404Spjd
3024185029Spjd	if (bp != NULL && bp != &zio->io_bp_copy)
3025185029Spjd		zio->io_bp_copy = *bp;
3026168404Spjd
3027185029Spjd	if (zio->io_error)
3028185029Spjd		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
3029168404Spjd
3030209962Smm	mutex_enter(&zio->io_lock);
3031209962Smm	zio->io_state[ZIO_WAIT_READY] = 1;
3032209962Smm	pio = zio_walk_parents(zio);
3033209962Smm	mutex_exit(&zio->io_lock);
3034209962Smm
3035209962Smm	/*
3036209962Smm	 * As we notify zio's parents, new parents could be added.
3037209962Smm	 * New parents go to the head of zio's io_parent_list, however,
3038209962Smm	 * so we will (correctly) not notify them.  The remainder of zio's
3039209962Smm	 * io_parent_list, from 'pio_next' onward, cannot change because
3040209962Smm	 * all parents must wait for us to be done before they can be done.
3041209962Smm	 */
3042209962Smm	for (; pio != NULL; pio = pio_next) {
3043209962Smm		pio_next = zio_walk_parents(zio);
3044185029Spjd		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
3045209962Smm	}
3046185029Spjd
3047219089Spjd	if (zio->io_flags & ZIO_FLAG_NODATA) {
3048219089Spjd		if (BP_IS_GANG(bp)) {
3049219089Spjd			zio->io_flags &= ~ZIO_FLAG_NODATA;
3050219089Spjd		} else {
3051219089Spjd			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
3052219089Spjd			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
3053219089Spjd		}
3054219089Spjd	}
3055219089Spjd
3056219089Spjd	if (zio_injection_enabled &&
3057219089Spjd	    zio->io_spa->spa_syncing_txg == zio->io_txg)
3058219089Spjd		zio_handle_ignored_writes(zio);
3059219089Spjd
3060185029Spjd	return (ZIO_PIPELINE_CONTINUE);
3061185029Spjd}
3062185029Spjd
3063185029Spjdstatic int
3064270312Ssmhzio_done(zio_t *zio)
3065185029Spjd{
3066185029Spjd	spa_t *spa = zio->io_spa;
3067185029Spjd	zio_t *lio = zio->io_logical;
3068185029Spjd	blkptr_t *bp = zio->io_bp;
3069185029Spjd	vdev_t *vd = zio->io_vd;
3070185029Spjd	uint64_t psize = zio->io_size;
3071209962Smm	zio_t *pio, *pio_next;
3072185029Spjd
3073168404Spjd	/*
3074209962Smm	 * If our children haven't all completed,
3075185029Spjd	 * wait for them and then repeat this pipeline stage.
3076168404Spjd	 */
3077185029Spjd	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
3078185029Spjd	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
3079219089Spjd	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
3080185029Spjd	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
3081185029Spjd		return (ZIO_PIPELINE_STOP);
3082185029Spjd
3083185029Spjd	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
3084185029Spjd		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
3085185029Spjd			ASSERT(zio->io_children[c][w] == 0);
3086185029Spjd
3087268649Sdelphij	if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
3088185029Spjd		ASSERT(bp->blk_pad[0] == 0);
3089185029Spjd		ASSERT(bp->blk_pad[1] == 0);
3090185029Spjd		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
3091209962Smm		    (bp == zio_unique_parent(zio)->io_bp));
3092185029Spjd		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
3093219089Spjd		    zio->io_bp_override == NULL &&
3094185029Spjd		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
3095185029Spjd			ASSERT(!BP_SHOULD_BYTESWAP(bp));
3096219089Spjd			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
3097185029Spjd			ASSERT(BP_COUNT_GANG(bp) == 0 ||
3098185029Spjd			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
3099185029Spjd		}
3100243524Smm		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
3101243524Smm			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
3102168404Spjd	}
3103168404Spjd
3104185029Spjd	/*
3105219089Spjd	 * If there were child vdev/gang/ddt errors, they apply to us now.
3106185029Spjd	 */
3107185029Spjd	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
3108185029Spjd	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
3109219089Spjd	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
3110168404Spjd
3111219089Spjd	/*
3112219089Spjd	 * If the I/O on the transformed data was successful, generate any
3113219089Spjd	 * checksum reports now while we still have the transformed data.
3114219089Spjd	 */
3115219089Spjd	if (zio->io_error == 0) {
3116219089Spjd		while (zio->io_cksum_report != NULL) {
3117219089Spjd			zio_cksum_report_t *zcr = zio->io_cksum_report;
3118219089Spjd			uint64_t align = zcr->zcr_align;
3119219089Spjd			uint64_t asize = P2ROUNDUP(psize, align);
3120219089Spjd			char *abuf = zio->io_data;
3121219089Spjd
3122219089Spjd			if (asize != psize) {
3123219089Spjd				abuf = zio_buf_alloc(asize);
3124219089Spjd				bcopy(zio->io_data, abuf, psize);
3125219089Spjd				bzero(abuf + psize, asize - psize);
3126219089Spjd			}
3127219089Spjd
3128219089Spjd			zio->io_cksum_report = zcr->zcr_next;
3129219089Spjd			zcr->zcr_next = NULL;
3130219089Spjd			zcr->zcr_finish(zcr, abuf);
3131219089Spjd			zfs_ereport_free_checksum(zcr);
3132219089Spjd
3133219089Spjd			if (asize != psize)
3134219089Spjd				zio_buf_free(abuf, asize);
3135219089Spjd		}
3136219089Spjd	}
3137219089Spjd
3138185029Spjd	zio_pop_transforms(zio);	/* note: may set zio->io_error */
3139168404Spjd
3140185029Spjd	vdev_stat_update(zio, psize);
3141185029Spjd
3142168404Spjd	if (zio->io_error) {
3143185029Spjd		/*
3144185029Spjd		 * If this I/O is attached to a particular vdev,
3145185029Spjd		 * generate an error message describing the I/O failure
3146185029Spjd		 * at the block level.  We ignore these errors if the
3147185029Spjd		 * device is currently unavailable.
3148185029Spjd		 */
3149185029Spjd		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
3150185029Spjd			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
3151185029Spjd
3152219089Spjd		if ((zio->io_error == EIO || !(zio->io_flags &
3153219089Spjd		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
3154219089Spjd		    zio == lio) {
3155185029Spjd			/*
3156185029Spjd			 * For logical I/O requests, tell the SPA to log the
3157185029Spjd			 * error and generate a logical data ereport.
3158185029Spjd			 */
3159185029Spjd			spa_log_error(spa, zio);
3160185029Spjd			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
3161185029Spjd			    0, 0);
3162185029Spjd		}
3163168404Spjd	}
3164168404Spjd
3165185029Spjd	if (zio->io_error && zio == lio) {
3166185029Spjd		/*
3167185029Spjd		 * Determine whether zio should be reexecuted.  This will
3168185029Spjd		 * propagate all the way to the root via zio_notify_parent().
3169185029Spjd		 */
3170185029Spjd		ASSERT(vd == NULL && bp != NULL);
3171219089Spjd		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3172168404Spjd
3173219089Spjd		if (IO_IS_ALLOCATING(zio) &&
3174219089Spjd		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
3175185029Spjd			if (zio->io_error != ENOSPC)
3176185029Spjd				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
3177185029Spjd			else
3178185029Spjd				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3179219089Spjd		}
3180168404Spjd
3181185029Spjd		if ((zio->io_type == ZIO_TYPE_READ ||
3182185029Spjd		    zio->io_type == ZIO_TYPE_FREE) &&
3183219089Spjd		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
3184185029Spjd		    zio->io_error == ENXIO &&
3185219089Spjd		    spa_load_state(spa) == SPA_LOAD_NONE &&
3186185029Spjd		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
3187185029Spjd			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3188185029Spjd
3189185029Spjd		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
3190185029Spjd			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3191219089Spjd
3192219089Spjd		/*
3193219089Spjd		 * This might be a good place to attempt either
3194219089Spjd		 * combinatorial reconstruction or error correction
3195219089Spjd		 * based on checksums.  It might also be a good place
3196219089Spjd		 * to send out preliminary ereports before we suspend
3197219089Spjd		 * processing.
3198219089Spjd		 */
3199185029Spjd	}
3200185029Spjd
3201168404Spjd	/*
3202185029Spjd	 * If there were logical child errors, they apply to us now.
3203185029Spjd	 * We defer this until now to avoid conflating logical child
3204185029Spjd	 * errors with errors that happened to the zio itself when
3205185029Spjd	 * updating vdev stats and reporting FMA events above.
3206168404Spjd	 */
3207185029Spjd	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
3208185029Spjd
3209219089Spjd	if ((zio->io_error || zio->io_reexecute) &&
3210219089Spjd	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
3211243524Smm	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
3212209962Smm		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
3213209962Smm
3214209962Smm	zio_gang_tree_free(&zio->io_gang_tree);
3215209962Smm
3216209962Smm	/*
3217209962Smm	 * Godfather I/Os should never suspend.
3218209962Smm	 */
3219209962Smm	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
3220209962Smm	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
3221209962Smm		zio->io_reexecute = 0;
3222209962Smm
3223185029Spjd	if (zio->io_reexecute) {
3224185029Spjd		/*
3225185029Spjd		 * This is a logical I/O that wants to reexecute.
3226185029Spjd		 *
3227185029Spjd		 * Reexecute is top-down.  When an i/o fails, if it's not
3228185029Spjd		 * the root, it simply notifies its parent and sticks around.
3229185029Spjd		 * The parent, seeing that it still has children in zio_done(),
3230185029Spjd		 * does the same.  This percolates all the way up to the root.
3231185029Spjd		 * The root i/o will reexecute or suspend the entire tree.
3232185029Spjd		 *
3233185029Spjd		 * This approach ensures that zio_reexecute() honors
3234185029Spjd		 * all the original i/o dependency relationships, e.g.
3235185029Spjd		 * parents not executing until children are ready.
3236185029Spjd		 */
3237185029Spjd		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3238185029Spjd
3239209962Smm		zio->io_gang_leader = NULL;
3240185029Spjd
3241209962Smm		mutex_enter(&zio->io_lock);
3242209962Smm		zio->io_state[ZIO_WAIT_DONE] = 1;
3243209962Smm		mutex_exit(&zio->io_lock);
3244185029Spjd
3245209962Smm		/*
3246209962Smm		 * "The Godfather" I/O monitors its children but is
3247209962Smm		 * not a true parent to them. It will track them through
3248209962Smm		 * the pipeline but severs its ties whenever they get into
3249209962Smm		 * trouble (e.g. suspended). This allows "The Godfather"
3250209962Smm		 * I/O to return status without blocking.
3251209962Smm		 */
3252209962Smm		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3253209962Smm			zio_link_t *zl = zio->io_walk_link;
3254209962Smm			pio_next = zio_walk_parents(zio);
3255209962Smm
3256209962Smm			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3257209962Smm			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3258209962Smm				zio_remove_child(pio, zio, zl);
3259209962Smm				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3260209962Smm			}
3261209962Smm		}
3262209962Smm
3263209962Smm		if ((pio = zio_unique_parent(zio)) != NULL) {
3264185029Spjd			/*
3265185029Spjd			 * We're not a root i/o, so there's nothing to do
3266185029Spjd			 * but notify our parent.  Don't propagate errors
3267185029Spjd			 * upward since we haven't permanently failed yet.
3268185029Spjd			 */
3269209962Smm			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3270185029Spjd			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3271185029Spjd			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3272185029Spjd		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3273185029Spjd			/*
3274185029Spjd			 * We'd fail again if we reexecuted now, so suspend
3275185029Spjd			 * until conditions improve (e.g. device comes online).
3276185029Spjd			 */
3277185029Spjd			zio_suspend(spa, zio);
3278185029Spjd		} else {
3279185029Spjd			/*
3280185029Spjd			 * Reexecution is potentially a huge amount of work.
3281185029Spjd			 * Hand it off to the otherwise-unused claim taskq.
3282185029Spjd			 */
3283260742Savg#if defined(illumos) || !defined(_KERNEL)
3284260742Savg			ASSERT(zio->io_tqent.tqent_next == NULL);
3285216919Smm#else
3286260742Savg			ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
3287260742Savg#endif
3288260750Savg			spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
3289260750Savg			    ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
3290260750Savg			    0, &zio->io_tqent);
3291185029Spjd		}
3292185029Spjd		return (ZIO_PIPELINE_STOP);
3293168404Spjd	}
3294168404Spjd
3295219089Spjd	ASSERT(zio->io_child_count == 0);
3296185029Spjd	ASSERT(zio->io_reexecute == 0);
3297185029Spjd	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3298168404Spjd
3299209962Smm	/*
3300219089Spjd	 * Report any checksum errors, since the I/O is complete.
3301219089Spjd	 */
3302219089Spjd	while (zio->io_cksum_report != NULL) {
3303219089Spjd		zio_cksum_report_t *zcr = zio->io_cksum_report;
3304219089Spjd		zio->io_cksum_report = zcr->zcr_next;
3305219089Spjd		zcr->zcr_next = NULL;
3306219089Spjd		zcr->zcr_finish(zcr, NULL);
3307219089Spjd		zfs_ereport_free_checksum(zcr);
3308219089Spjd	}
3309219089Spjd
3310219089Spjd	/*
3311209962Smm	 * It is the responsibility of the done callback to ensure that this
3312209962Smm	 * particular zio is no longer discoverable for adoption, and as
3313209962Smm	 * such, cannot acquire any new parents.
3314209962Smm	 */
3315185029Spjd	if (zio->io_done)
3316185029Spjd		zio->io_done(zio);
3317168404Spjd
3318209962Smm	mutex_enter(&zio->io_lock);
3319209962Smm	zio->io_state[ZIO_WAIT_DONE] = 1;
3320209962Smm	mutex_exit(&zio->io_lock);
3321168404Spjd
3322209962Smm	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3323209962Smm		zio_link_t *zl = zio->io_walk_link;
3324209962Smm		pio_next = zio_walk_parents(zio);
3325209962Smm		zio_remove_child(pio, zio, zl);
3326185029Spjd		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3327168404Spjd	}
3328168404Spjd
3329185029Spjd	if (zio->io_waiter != NULL) {
3330185029Spjd		mutex_enter(&zio->io_lock);
3331185029Spjd		zio->io_executor = NULL;
3332185029Spjd		cv_broadcast(&zio->io_cv);
3333185029Spjd		mutex_exit(&zio->io_lock);
3334185029Spjd	} else {
3335185029Spjd		zio_destroy(zio);
3336168404Spjd	}
3337168404Spjd
3338185029Spjd	return (ZIO_PIPELINE_STOP);
3339168404Spjd}
3340168404Spjd
3341168404Spjd/*
3342185029Spjd * ==========================================================================
3343185029Spjd * I/O pipeline definition
3344185029Spjd * ==========================================================================
3345168404Spjd */
3346219089Spjdstatic zio_pipe_stage_t *zio_pipeline[] = {
3347185029Spjd	NULL,
3348219089Spjd	zio_read_bp_init,
3349219089Spjd	zio_free_bp_init,
3350185029Spjd	zio_issue_async,
3351185029Spjd	zio_write_bp_init,
3352185029Spjd	zio_checksum_generate,
3353243524Smm	zio_nop_write,
3354219089Spjd	zio_ddt_read_start,
3355219089Spjd	zio_ddt_read_done,
3356219089Spjd	zio_ddt_write,
3357219089Spjd	zio_ddt_free,
3358185029Spjd	zio_gang_assemble,
3359185029Spjd	zio_gang_issue,
3360185029Spjd	zio_dva_allocate,
3361185029Spjd	zio_dva_free,
3362185029Spjd	zio_dva_claim,
3363185029Spjd	zio_ready,
3364185029Spjd	zio_vdev_io_start,
3365185029Spjd	zio_vdev_io_done,
3366185029Spjd	zio_vdev_io_assess,
3367185029Spjd	zio_checksum_verify,
3368185029Spjd	zio_done
3369185029Spjd};
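
/*
 * Hedged note: the order of this table must match the ZIO_STAGE_* bit
 * order, since the pipeline dispatcher indexes it by the position of
 * the zio's current one-hot stage bit (conceptually
 * zio_pipeline[highbit(zio->io_stage) - 1]).
 */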
3370236884Smm
3371236884Smm/* dnp is the dnode for zb1->zb_object */
3372236884Smmboolean_t
3373268657Sdelphijzbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
3374268657Sdelphij    const zbookmark_phys_t *zb2)
3375236884Smm{
3376236884Smm	uint64_t zb1nextL0, zb2thisobj;
3377236884Smm
3378236884Smm	ASSERT(zb1->zb_objset == zb2->zb_objset);
3379236884Smm	ASSERT(zb2->zb_level == 0);
3380236884Smm
3381236884Smm	/* The objset_phys_t isn't before anything. */
3382236884Smm	if (dnp == NULL)
3383236884Smm		return (B_FALSE);
3384236884Smm
3385236884Smm	zb1nextL0 = (zb1->zb_blkid + 1) <<
3386236884Smm	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3387236884Smm
3388236884Smm	zb2thisobj = zb2->zb_object ? zb2->zb_object :
3389236884Smm	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3390236884Smm
3391236884Smm	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3392236884Smm		uint64_t nextobj = zb1nextL0 *
3393236884Smm		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3394236884Smm		return (nextobj <= zb2thisobj);
3395236884Smm	}
3396236884Smm
3397236884Smm	if (zb1->zb_object < zb2thisobj)
3398236884Smm		return (B_TRUE);
3399236884Smm	if (zb1->zb_object > zb2thisobj)
3400236884Smm		return (B_FALSE);
3401236884Smm	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3402236884Smm		return (B_FALSE);
3403236884Smm	return (zb1nextL0 <= zb2->zb_blkid);
3404236884Smm}
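
/*
 * Worked example (values illustrative): with dn_indblkshift == 17 and
 * SPA_BLKPTRSHIFT == 7, each indirect level fans out 1 << 10 == 1024
 * block pointers, so for zb1 at level 1, blkid 3:
 *
 *	zb1nextL0 = (3 + 1) << (1 * 10) == 4096
 *
 * i.e. the first level-0 blkid strictly after zb1's subtree is 4096,
 * and zb1 is "before" any level-0 zb2 in the same object whose
 * zb_blkid >= 4096.
 */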
3405