zio.c revision 5403:0bfd0977c989
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/zfs_context.h>
29#include <sys/fm/fs/zfs.h>
30#include <sys/spa.h>
31#include <sys/txg.h>
32#include <sys/spa_impl.h>
33#include <sys/vdev_impl.h>
34#include <sys/zio_impl.h>
35#include <sys/zio_compress.h>
36#include <sys/zio_checksum.h>
37
38/*
39 * ==========================================================================
40 * I/O priority table
41 * ==========================================================================
42 */
/*
 * One uint8_t per I/O priority class; the trailing comment on each row
 * names the ZIO_PRIORITY_* class the row is written for.
 * NOTE(review): the meaning of the numeric values is not established in
 * this file -- confirm against the consumers of this table.
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW		*/
	0,	/* ZIO_PRIORITY_SYNC_READ	*/
	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
	4,	/* ZIO_PRIORITY_FREE		*/
	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
	10,	/* ZIO_PRIORITY_RESILVER	*/
	20,	/* ZIO_PRIORITY_SCRUB		*/
};
55
56/*
57 * ==========================================================================
58 * I/O type descriptions
59 * ==========================================================================
60 */
/*
 * Printable name for each I/O type; the array is sized by ZIO_TYPES.
 * NOTE(review): presumably indexed by zio_type_t in the order listed --
 * confirm against the enum definition.
 */
char *zio_type_name[ZIO_TYPES] = {
	"null", "read", "write", "free", "claim", "ioctl" };
63
/* At or above this size, force gang blocking - for testing */
uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;

/*
 * Force an allocation failure when non-zero.
 * NOTE(review): presumably consumed through zio_io_should_fail(); the
 * consumer is not visible here -- confirm.
 */
uint16_t zio_zil_fail_shift = 0;
uint16_t zio_io_fail_shift = 0;

/*
 * Enable/disable the write-retry logic.  When zero, zio_vdev_resume_io()
 * and zio_assess() do not take the retry path for allocating I/Os.
 */
int zio_write_retry = 1;

/*
 * Taskq to handle reissuing of I/Os.  zio_init() creates it, passing
 * zio_resume_threads to taskq_create().
 */
taskq_t *zio_taskq;
int zio_resume_threads = 4;
77
/*
 * Sync-pass thresholds.  Each field names the sync pass after which the
 * described behavior applies; for example, zio_free() defers a free when
 * the pool's current sync pass is greater than zp_defer_free.
 */
typedef struct zio_sync_pass {
	int	zp_defer_free;		/* defer frees after this pass */
	int	zp_dontcompress;	/* don't compress after this pass */
	int	zp_rewrite;		/* rewrite new bps after this pass */
} zio_sync_pass_t;

/* Default thresholds; the initializers follow the field order above. */
zio_sync_pass_t zio_sync_pass = {
	1,	/* zp_defer_free */
	4,	/* zp_dontcompress */
	1,	/* zp_rewrite */
};
89
/* Forward declaration of a file-local helper. */
static boolean_t zio_io_should_fail(uint16_t);
91
92/*
93 * ==========================================================================
94 * I/O kmem caches
95 * ==========================================================================
96 */
/* Cache from which zio_t structures are allocated (see zio_create()). */
kmem_cache_t *zio_cache;
/*
 * Buffer caches, indexed by (size - 1) >> SPA_MINBLOCKSHIFT.  zio_init()
 * fills every slot; slots without a cache of their own point at the next
 * larger cache.  zio_buf_cache serves zio_buf_alloc() and
 * zio_data_buf_cache serves zio_data_buf_alloc().
 */
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
/* Arena handed to kmem_cache_create() for the data buffer caches. */
extern vmem_t *zio_alloc_arena;
#endif
104
/*
 * Determine if we are allowed to issue the IO based on the
 * pool state. If we must wait then block until we are told
 * that we may continue.
 *
 * The pool state is first tested without the lock; only when it is
 * POOL_STATE_IO_FAILURE do we take spa_zio_lock and sleep on spa_zio_cv
 * until the state changes.
 *
 * The body is wrapped in do { } while (0) so that "ZIO_ENTER(spa);" is a
 * single statement and may be used as the body of an if/else.  The
 * argument is parenthesized so any pointer-valued expression may be
 * passed; it is evaluated more than once and must have no side effects.
 */
#define	ZIO_ENTER(spa) do {						\
	if ((spa)->spa_state == POOL_STATE_IO_FAILURE) {		\
		mutex_enter(&(spa)->spa_zio_lock);			\
		while ((spa)->spa_state == POOL_STATE_IO_FAILURE)	\
			cv_wait(&(spa)->spa_zio_cv,			\
			    &(spa)->spa_zio_lock);			\
		mutex_exit(&(spa)->spa_zio_lock);			\
	}								\
} while (0)
118
/*
 * True for an allocating zio: one whose current pipeline still contains
 * the DVA allocate stage, or whose original pipeline was the full write
 * pipeline (so it has, or will have, that stage during its lifetime).
 */
#define	IO_IS_ALLOCATING(zio) \
	((((zio)->io_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) != 0) ||	\
	((zio)->io_orig_pipeline == ZIO_WRITE_PIPELINE))
126
127void
128zio_init(void)
129{
130	size_t c;
131	vmem_t *data_alloc_arena = NULL;
132
133#ifdef _KERNEL
134	data_alloc_arena = zio_alloc_arena;
135#endif
136
137	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
138	    NULL, NULL, NULL, NULL, NULL, 0);
139
140	/*
141	 * For small buffers, we want a cache for each multiple of
142	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
143	 * for each quarter-power of 2.  For large buffers, we want
144	 * a cache for each multiple of PAGESIZE.
145	 */
146	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
147		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
148		size_t p2 = size;
149		size_t align = 0;
150
151		while (p2 & (p2 - 1))
152			p2 &= p2 - 1;
153
154		if (size <= 4 * SPA_MINBLOCKSIZE) {
155			align = SPA_MINBLOCKSIZE;
156		} else if (P2PHASE(size, PAGESIZE) == 0) {
157			align = PAGESIZE;
158		} else if (P2PHASE(size, p2 >> 2) == 0) {
159			align = p2 >> 2;
160		}
161
162		if (align != 0) {
163			char name[36];
164			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
165			zio_buf_cache[c] = kmem_cache_create(name, size,
166			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
167
168			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
169			zio_data_buf_cache[c] = kmem_cache_create(name, size,
170			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
171			    KMC_NODEBUG);
172
173			dprintf("creating cache for size %5lx align %5lx\n",
174			    size, align);
175		}
176	}
177
178	while (--c != 0) {
179		ASSERT(zio_buf_cache[c] != NULL);
180		if (zio_buf_cache[c - 1] == NULL)
181			zio_buf_cache[c - 1] = zio_buf_cache[c];
182
183		ASSERT(zio_data_buf_cache[c] != NULL);
184		if (zio_data_buf_cache[c - 1] == NULL)
185			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
186	}
187
188	zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
189	    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
190
191	zio_inject_init();
192}
193
194void
195zio_fini(void)
196{
197	size_t c;
198	kmem_cache_t *last_cache = NULL;
199	kmem_cache_t *last_data_cache = NULL;
200
201	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
202		if (zio_buf_cache[c] != last_cache) {
203			last_cache = zio_buf_cache[c];
204			kmem_cache_destroy(zio_buf_cache[c]);
205		}
206		zio_buf_cache[c] = NULL;
207
208		if (zio_data_buf_cache[c] != last_data_cache) {
209			last_data_cache = zio_data_buf_cache[c];
210			kmem_cache_destroy(zio_data_buf_cache[c]);
211		}
212		zio_data_buf_cache[c] = NULL;
213	}
214
215	taskq_destroy(zio_taskq);
216
217	kmem_cache_destroy(zio_cache);
218
219	zio_inject_fini();
220}
221
222/*
223 * ==========================================================================
224 * Allocate and free I/O buffers
225 * ==========================================================================
226 */
227
228/*
229 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
230 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
231 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
232 * excess / transient data in-core during a crashdump.
233 */
234void *
235zio_buf_alloc(size_t size)
236{
237	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
238
239	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
240
241	return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
242}
243
244/*
245 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
246 * crashdump if the kernel panics.  This exists so that we will limit the amount
247 * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
248 * of kernel heap dumped to disk when the kernel panics)
249 */
250void *
251zio_data_buf_alloc(size_t size)
252{
253	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
254
255	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
256
257	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
258}
259
260void
261zio_buf_free(void *buf, size_t size)
262{
263	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
264
265	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
266
267	kmem_cache_free(zio_buf_cache[c], buf);
268}
269
270void
271zio_data_buf_free(void *buf, size_t size)
272{
273	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
274
275	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
276
277	kmem_cache_free(zio_data_buf_cache[c], buf);
278}
279
280/*
281 * ==========================================================================
282 * Push and pop I/O transform buffers
283 * ==========================================================================
284 */
285static void
286zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
287{
288	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
289
290	zt->zt_data = data;
291	zt->zt_size = size;
292	zt->zt_bufsize = bufsize;
293
294	zt->zt_next = zio->io_transform_stack;
295	zio->io_transform_stack = zt;
296
297	zio->io_data = data;
298	zio->io_size = size;
299}
300
301static void
302zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
303{
304	zio_transform_t *zt = zio->io_transform_stack;
305
306	*data = zt->zt_data;
307	*size = zt->zt_size;
308	*bufsize = zt->zt_bufsize;
309
310	zio->io_transform_stack = zt->zt_next;
311	kmem_free(zt, sizeof (zio_transform_t));
312
313	if ((zt = zio->io_transform_stack) != NULL) {
314		zio->io_data = zt->zt_data;
315		zio->io_size = zt->zt_size;
316	}
317}
318
319static void
320zio_clear_transform_stack(zio_t *zio)
321{
322	void *data;
323	uint64_t size, bufsize;
324
325	ASSERT(zio->io_transform_stack != NULL);
326
327	zio_pop_transform(zio, &data, &size, &bufsize);
328	while (zio->io_transform_stack != NULL) {
329		zio_buf_free(data, bufsize);
330		zio_pop_transform(zio, &data, &size, &bufsize);
331	}
332}
333
334/*
335 * ==========================================================================
336 * Create the various types of I/O (read, write, free)
337 * ==========================================================================
338 */
/*
 * Common constructor used by all of the zio_*() creation routines.
 *
 * Allocates a zio_t from zio_cache, zeroes it, fills in the caller-supplied
 * fields, pushes (data, size) as the first transform-stack entry and, when
 * 'pio' is non-NULL, links the new zio at the head of the parent's child
 * list and bumps the parent's child counters.
 *
 * When 'bp' is non-NULL it is recorded in io_bp and copied into both
 * io_bp_copy and io_bp_orig; the zio is flagged ZIO_FLAG_METADATA if the
 * bp's object type is marked as metadata in dmu_ot[] or its level is
 * non-zero.  ZIO_FLAG_METADATA is also inherited from the parent.
 *
 * 'size' is asserted to be a multiple of SPA_MINBLOCKSIZE and no larger
 * than SPA_MAXBLOCKSIZE.  'stage' and 'pipeline' give the zio's initial
 * stage and pipeline; they, and the final flags, are saved in the
 * io_orig_* fields so that zio_reset() can restore them.
 *
 * Returns the new zio.
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));
	zio->io_parent = pio;
	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_flags = flags;
	if (bp != NULL) {
		zio->io_bp = bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata ||
		    BP_GET_LEVEL(bp) != 0)
			zio->io_flags |= ZIO_FLAG_METADATA;
	}
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_stage = stage;
	zio->io_pipeline = pipeline;
	zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
	zio->io_timestamp = lbolt64;
	if (pio != NULL)
		zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA);
	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
	zio_push_transform(zio, data, size, size);

	/*
	 * Note on config lock:
	 *
	 * If CONFIG_HELD is set, then the caller already has the config
	 * lock, so we don't need it for this io.
	 *
	 * We set CONFIG_GRABBED to indicate that we have grabbed the
	 * config lock on behalf of this io, so it should be released
	 * in zio_done.
	 *
	 * Unless CONFIG_HELD is set, we will grab the config lock for
	 * any top-level (parent-less) io, *except* NULL top-level ios.
	 * The NULL top-level ios rarely have any children, so we delay
	 * grabbing the lock until the first child is added (but it is
	 * still grabbed on behalf of the top-level i/o, so additional
	 * children don't need to also grab it).  This greatly reduces
	 * contention on the config lock.
	 */
	if (pio == NULL) {
		if (type != ZIO_TYPE_NULL &&
		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
			spa_config_enter(zio->io_spa, RW_READER, zio);
			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
		}
		zio->io_root = zio;
	} else {
		zio->io_root = pio->io_root;
		if (!(flags & ZIO_FLAG_NOBOOKMARK))
			zio->io_logical = pio->io_logical;
		mutex_enter(&pio->io_lock);
		/*
		 * First child of a parent-less NULL zio: take the config
		 * lock now, on the parent's behalf (see the note above).
		 */
		if (pio->io_parent == NULL &&
		    pio->io_type == ZIO_TYPE_NULL &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
			spa_config_enter(zio->io_spa, RW_READER, pio);
		}
		/* Children created at or past READY are not counted. */
		if (stage < ZIO_STAGE_READY)
			pio->io_children_notready++;
		pio->io_children_notdone++;
		/* Insert at the head of the parent's child list. */
		zio->io_sibling_next = pio->io_child;
		zio->io_sibling_prev = NULL;
		if (pio->io_child != NULL)
			pio->io_child->io_sibling_prev = zio;
		pio->io_child = zio;
		zio->io_ndvas = pio->io_ndvas;
		mutex_exit(&pio->io_lock);
	}

	/*
	 * Save off the original state in case we need to retry later.
	 */
	zio->io_orig_stage = zio->io_stage;
	zio->io_orig_pipeline = zio->io_pipeline;
	zio->io_orig_flags = zio->io_flags;

	return (zio);
}
435
436static void
437zio_reset(zio_t *zio)
438{
439	zio_clear_transform_stack(zio);
440
441	zio->io_flags = zio->io_orig_flags;
442	zio->io_stage = zio->io_orig_stage;
443	zio->io_pipeline = zio->io_orig_pipeline;
444	zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size);
445}
446
447zio_t *
448zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
449	int flags)
450{
451	zio_t *zio;
452
453	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
454	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
455	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);
456
457	return (zio);
458}
459
460zio_t *
461zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
462{
463	return (zio_null(NULL, spa, done, private, flags));
464}
465
466zio_t *
467zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
468    uint64_t size, zio_done_func_t *done, void *private,
469    int priority, int flags, zbookmark_t *zb)
470{
471	zio_t *zio;
472
473	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
474
475	/*
476	 * If the user has specified that we allow I/Os to continue
477	 * then attempt to satisfy the read.
478	 */
479	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
480		ZIO_ENTER(spa);
481
482	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
483	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
484	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
485	zio->io_bookmark = *zb;
486
487	zio->io_logical = zio;
488
489	/*
490	 * Work off our copy of the bp so the caller can free it.
491	 */
492	zio->io_bp = &zio->io_bp_copy;
493
494	return (zio);
495}
496
497zio_t *
498zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
499    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
500    zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
501    int flags, zbookmark_t *zb)
502{
503	zio_t *zio;
504
505	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
506	    checksum < ZIO_CHECKSUM_FUNCTIONS);
507
508	ASSERT(compress >= ZIO_COMPRESS_OFF &&
509	    compress < ZIO_COMPRESS_FUNCTIONS);
510
511	ZIO_ENTER(spa);
512
513	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
514	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
515	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
516
517	zio->io_ready = ready;
518
519	zio->io_bookmark = *zb;
520
521	zio->io_logical = zio;
522
523	zio->io_checksum = checksum;
524	zio->io_compress = compress;
525	zio->io_ndvas = ncopies;
526
527	if (compress != ZIO_COMPRESS_OFF)
528		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
529
530	if (bp->blk_birth != txg) {
531		/* XXX the bp usually (always?) gets re-zeroed later */
532		BP_ZERO(bp);
533		BP_SET_LSIZE(bp, size);
534		BP_SET_PSIZE(bp, size);
535	} else {
536		/* Make sure someone doesn't change their mind on overwrites */
537		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
538		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
539	}
540
541	return (zio);
542}
543
544zio_t *
545zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
546    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
547    zio_done_func_t *done, void *private, int priority, int flags,
548    zbookmark_t *zb)
549{
550	zio_t *zio;
551
552	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
553	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
554	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
555
556	zio->io_bookmark = *zb;
557	zio->io_checksum = checksum;
558	zio->io_compress = ZIO_COMPRESS_OFF;
559
560	if (pio != NULL)
561		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
562
563	return (zio);
564}
565
566static void
567zio_write_allocate_ready(zio_t *zio)
568{
569	/* Free up the previous block */
570	if (!BP_IS_HOLE(&zio->io_bp_orig)) {
571		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
572		    &zio->io_bp_orig, NULL, NULL));
573	}
574}
575
576static zio_t *
577zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
578    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
579    zio_done_func_t *done, void *private, int priority, int flags)
580{
581	zio_t *zio;
582
583	BP_ZERO(bp);
584	BP_SET_LSIZE(bp, size);
585	BP_SET_PSIZE(bp, size);
586	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
587
588	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
589	    ZIO_TYPE_WRITE, priority, flags,
590	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
591
592	zio->io_checksum = checksum;
593	zio->io_compress = ZIO_COMPRESS_OFF;
594	zio->io_ready = zio_write_allocate_ready;
595
596	return (zio);
597}
598
599zio_t *
600zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
601    zio_done_func_t *done, void *private)
602{
603	zio_t *zio;
604
605	ASSERT(!BP_IS_HOLE(bp));
606
607	if (txg == spa->spa_syncing_txg &&
608	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
609		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
610		return (zio_null(pio, spa, NULL, NULL, 0));
611	}
612
613	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
614	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
615	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
616
617	zio->io_bp = &zio->io_bp_copy;
618
619	return (zio);
620}
621
622zio_t *
623zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
624    zio_done_func_t *done, void *private)
625{
626	zio_t *zio;
627
628	/*
629	 * A claim is an allocation of a specific block.  Claims are needed
630	 * to support immediate writes in the intent log.  The issue is that
631	 * immediate writes contain committed data, but in a txg that was
632	 * *not* committed.  Upon opening the pool after an unclean shutdown,
633	 * the intent log claims all blocks that contain immediate write data
634	 * so that the SPA knows they're in use.
635	 *
636	 * All claims *must* be resolved in the first txg -- before the SPA
637	 * starts allocating blocks -- so that nothing is allocated twice.
638	 */
639	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
640	ASSERT3U(spa_first_txg(spa), <=, txg);
641
642	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
643	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
644	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
645
646	zio->io_bp = &zio->io_bp_copy;
647
648	return (zio);
649}
650
651zio_t *
652zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
653    zio_done_func_t *done, void *private, int priority, int flags)
654{
655	zio_t *zio;
656	int c;
657
658	if (vd->vdev_children == 0) {
659		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
660		    ZIO_TYPE_IOCTL, priority, flags,
661		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
662
663		zio->io_vd = vd;
664		zio->io_cmd = cmd;
665	} else {
666		zio = zio_null(pio, spa, NULL, NULL, flags);
667
668		for (c = 0; c < vd->vdev_children; c++)
669			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
670			    done, private, priority, flags));
671	}
672
673	return (zio);
674}
675
676static void
677zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
678    int checksum)
679{
680	ASSERT(vd->vdev_children == 0);
681
682	ASSERT(size <= SPA_MAXBLOCKSIZE);
683	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
684	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
685
686	ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
687	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
688	ASSERT3U(offset + size, <=, vd->vdev_psize);
689
690	BP_ZERO(bp);
691
692	BP_SET_LSIZE(bp, size);
693	BP_SET_PSIZE(bp, size);
694
695	BP_SET_CHECKSUM(bp, checksum);
696	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
697	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
698
699	if (checksum != ZIO_CHECKSUM_OFF)
700		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
701}
702
703zio_t *
704zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
705    void *data, int checksum, zio_done_func_t *done, void *private,
706    int priority, int flags)
707{
708	zio_t *zio;
709	blkptr_t blk;
710
711	ZIO_ENTER(vd->vdev_spa);
712
713	zio_phys_bp_init(vd, &blk, offset, size, checksum);
714
715	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
716	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
717	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
718
719	zio->io_vd = vd;
720	zio->io_offset = offset;
721
722	/*
723	 * Work off our copy of the bp so the caller can free it.
724	 */
725	zio->io_bp = &zio->io_bp_copy;
726
727	return (zio);
728}
729
730zio_t *
731zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
732    void *data, int checksum, zio_done_func_t *done, void *private,
733    int priority, int flags)
734{
735	zio_block_tail_t *zbt;
736	void *wbuf;
737	zio_t *zio;
738	blkptr_t blk;
739
740	ZIO_ENTER(vd->vdev_spa);
741
742	zio_phys_bp_init(vd, &blk, offset, size, checksum);
743
744	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
745	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
746	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
747
748	zio->io_vd = vd;
749	zio->io_offset = offset;
750
751	zio->io_bp = &zio->io_bp_copy;
752	zio->io_checksum = checksum;
753
754	if (zio_checksum_table[checksum].ci_zbt) {
755		/*
756		 * zbt checksums are necessarily destructive -- they modify
757		 * one word of the write buffer to hold the verifier/checksum.
758		 * Therefore, we must make a local copy in case the data is
759		 * being written to multiple places.
760		 */
761		wbuf = zio_buf_alloc(size);
762		bcopy(data, wbuf, size);
763		zio_push_transform(zio, wbuf, size, size);
764
765		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
766		zbt->zbt_cksum = blk.blk_cksum;
767	}
768
769	return (zio);
770}
771
772/*
773 * Create a child I/O to do some work for us.  It has no associated bp.
774 */
775zio_t *
776zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
777	void *data, uint64_t size, int type, int priority, int flags,
778	zio_done_func_t *done, void *private)
779{
780	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
781	zio_t *cio;
782
783	if (type == ZIO_TYPE_READ && bp != NULL) {
784		/*
785		 * If we have the bp, then the child should perform the
786		 * checksum and the parent need not.  This pushes error
787		 * detection as close to the leaves as possible and
788		 * eliminates redundant checksums in the interior nodes.
789		 */
790		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
791		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
792	}
793
794	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
795	    done, private, type, priority,
796	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
797	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
798
799	cio->io_vd = vd;
800	cio->io_offset = offset;
801
802	return (cio);
803}
804
805/*
806 * ==========================================================================
807 * Initiate I/O, either sync or async
808 * ==========================================================================
809 */
810int
811zio_wait(zio_t *zio)
812{
813	int error;
814
815	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
816
817	zio->io_waiter = curthread;
818
819	zio_next_stage_async(zio);
820
821	mutex_enter(&zio->io_lock);
822	while (zio->io_stalled != ZIO_STAGE_DONE)
823		cv_wait(&zio->io_cv, &zio->io_lock);
824	mutex_exit(&zio->io_lock);
825
826	error = zio->io_error;
827	mutex_destroy(&zio->io_lock);
828	cv_destroy(&zio->io_cv);
829	kmem_cache_free(zio_cache, zio);
830
831	return (error);
832}
833
/*
 * Start 'zio' without waiting for it: hand it to zio_next_stage_async()
 * and return.
 */
void
zio_nowait(zio_t *zio)
{
	zio_next_stage_async(zio);
}
839
840/*
841 * ==========================================================================
842 * I/O pipeline interlocks: parent/child dependency scoreboarding
843 * ==========================================================================
844 */
845static void
846zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
847{
848	mutex_enter(&zio->io_lock);
849	if (*countp == 0) {
850		ASSERT(zio->io_stalled == 0);
851		mutex_exit(&zio->io_lock);
852		zio_next_stage(zio);
853	} else {
854		zio->io_stalled = stage;
855		mutex_exit(&zio->io_lock);
856	}
857}
858
859static void
860zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
861{
862	zio_t *pio = zio->io_parent;
863
864	mutex_enter(&pio->io_lock);
865	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
866		pio->io_error = zio->io_error;
867	ASSERT3U(*countp, >, 0);
868	if (--*countp == 0 && pio->io_stalled == stage) {
869		pio->io_stalled = 0;
870		mutex_exit(&pio->io_lock);
871		zio_next_stage_async(pio);
872	} else {
873		mutex_exit(&pio->io_lock);
874	}
875}
876
877static void
878zio_wait_children_ready(zio_t *zio)
879{
880	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
881	    &zio->io_children_notready);
882}
883
884void
885zio_wait_children_done(zio_t *zio)
886{
887	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
888	    &zio->io_children_notdone);
889}
890
891static void
892zio_read_init(zio_t *zio)
893{
894	if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) {
895		uint64_t csize = BP_GET_PSIZE(zio->io_bp);
896		void *cbuf = zio_buf_alloc(csize);
897
898		zio_push_transform(zio, cbuf, csize, csize);
899		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
900	}
901
902	if (BP_IS_GANG(zio->io_bp)) {
903		uint64_t gsize = SPA_GANGBLOCKSIZE;
904		void *gbuf = zio_buf_alloc(gsize);
905
906		zio_push_transform(zio, gbuf, gsize, gsize);
907		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
908	}
909	zio_next_stage(zio);
910}
911
912static void
913zio_ready(zio_t *zio)
914{
915	zio_t *pio = zio->io_parent;
916
917	if (zio->io_ready)
918		zio->io_ready(zio);
919
920	if (pio != NULL)
921		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
922		    &pio->io_children_notready);
923
924	if (zio->io_bp)
925		zio->io_bp_copy = *zio->io_bp;
926
927	zio_next_stage(zio);
928}
929
/*
 * Restart a failed allocating write from the beginning of its pipeline.
 *
 * The current bp is saved in io_bp_orig, its DVAs are zeroed, and the zio
 * is reset to the state saved by zio_create().  If the zio has a parent,
 * the parent's not-ready child count is bumped (this child will become
 * ready again) and, when the parent is itself an allocating zio that has
 * left the open stage, the parent is flagged for retry too.  Finally the
 * zio's own retry flag and error are cleared and it is reissued
 * asynchronously.
 */
static void
zio_vdev_retry_io(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	/*
	 * Preserve the failed bp so that the io_ready() callback can
	 * update the accounting accordingly. The callback will also be
	 * responsible for freeing the previously allocated block, if one
	 * exists.
	 */
	zio->io_bp_orig = *zio->io_bp;

	/*
	 * We must zero out the old DVA and blk_birth before reallocating
	 * the bp.
	 */
	BP_ZERO_DVAS(zio->io_bp);
	zio_reset(zio);

	if (pio) {
		/*
		 * Let the parent know that we will
		 * re-alloc the write (=> new bp info).
		 */
		mutex_enter(&pio->io_lock);
		pio->io_children_notready++;

		/*
		 * If the parent I/O is still in the open stage, then
		 * don't bother telling it to retry since it hasn't
		 * progressed far enough for it to care.
		 */
		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;

		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE);
		mutex_exit(&pio->io_lock);
	}

	/*
	 * We are getting ready to process the retry request so clear
	 * the flag and the zio's current error status.
	 */
	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
	zio->io_error = 0;
	zio_next_stage_async(zio);
}
978
/*
 * Try to resume the I/Os that zio_vdev_suspend_io() queued on the pool.
 *
 * Every queued must-succeed I/O that is attached to a vdev has that vdev
 * probed first; if any probe fails, nothing is resumed.  Otherwise the
 * vdev stats are cleared, the pool state is set back to
 * POOL_STATE_ACTIVE, and each queued zio is removed from the list, has
 * its error cleared, and is dispatched to zio_taskq for resubmission.
 * After the taskq drains, threads blocked in ZIO_ENTER() are woken.
 *
 * Returns 0 on success, the vdev_probe() error if a probe failed, or EIO
 * if the pool is in POOL_STATE_IO_FAILURE again once the taskq drains.
 */
int
zio_vdev_resume_io(spa_t *spa)
{
	zio_t *zio;

	mutex_enter(&spa->spa_zio_lock);

	/*
	 * Probe all of vdevs that have experienced an I/O error.
	 * If we are still unable to verify the integrity of the vdev
	 * then we prevent the resume from proceeding.
	 */
	for (zio = list_head(&spa->spa_zio_list); zio != NULL;
	    zio = list_next(&spa->spa_zio_list, zio)) {
		int error = 0;

		/* We only care about I/Os that must succeed */
		if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL)
			continue;
		error = vdev_probe(zio->io_vd);
		if (error) {
			mutex_exit(&spa->spa_zio_lock);
			return (error);
		}
	}

	/*
	 * Clear the vdev stats so that I/O can flow.
	 */
	vdev_clear(spa, NULL, B_FALSE);

	spa->spa_state = POOL_STATE_ACTIVE;
	while ((zio = list_head(&spa->spa_zio_list)) != NULL) {
		list_remove(&spa->spa_zio_list, zio);
		zio->io_error = 0;

		/*
		 * If we are resuming an allocating I/O then we force it
		 * to retry and let it resume operation where it left off.
		 * Otherwise, go back to the ready stage and pick up from
		 * there.
		 */
		if (zio_write_retry && IO_IS_ALLOCATING(zio)) {
			zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
			zio->io_stage--;
		} else {
			zio->io_stage = ZIO_STAGE_READY;
		}

		(void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async,
		    zio, TQ_SLEEP);
	}
	mutex_exit(&spa->spa_zio_lock);

	/*
	 * Wait for the taskqs to finish and recheck the pool state since
	 * it's possible that a resumed I/O has failed again.
	 */
	taskq_wait(zio_taskq);
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	/* Release any threads blocked in ZIO_ENTER(). */
	mutex_enter(&spa->spa_zio_lock);
	cv_broadcast(&spa->spa_zio_cv);
	mutex_exit(&spa->spa_zio_lock);

	return (0);
}
1047
/*
 * Suspend a failed I/O: mark the pool POOL_STATE_IO_FAILURE and append
 * the zio to the pool's list of suspended I/Os, from which
 * zio_vdev_resume_io() later reissues it.  Note that the pool state is
 * written before spa_zio_lock is taken.
 */
static void
zio_vdev_suspend_io(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	/*
	 * We've experienced an unrecoverable failure so
	 * set the pool state accordingly and queue all
	 * failed IOs.
	 */
	spa->spa_state = POOL_STATE_IO_FAILURE;

	mutex_enter(&spa->spa_zio_lock);
	list_insert_tail(&spa->spa_zio_list, zio);

#ifndef _KERNEL
	/* Used to notify ztest that the pool has suspended */
	cv_broadcast(&spa->spa_zio_cv);
#endif
	mutex_exit(&spa->spa_zio_lock);
}
1069
1070static void
1071zio_assess(zio_t *zio)
1072{
1073	spa_t *spa = zio->io_spa;
1074	blkptr_t *bp = zio->io_bp;
1075	vdev_t *vd = zio->io_vd;
1076
1077	ASSERT(zio->io_children_notready == 0);
1078	ASSERT(zio->io_children_notdone == 0);
1079
1080	if (bp != NULL) {
1081		ASSERT(bp->blk_pad[0] == 0);
1082		ASSERT(bp->blk_pad[1] == 0);
1083		ASSERT(bp->blk_pad[2] == 0);
1084		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
1085		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
1086		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
1087			ASSERT(!BP_SHOULD_BYTESWAP(bp));
1088			if (zio->io_ndvas != 0)
1089				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
1090			ASSERT(BP_COUNT_GANG(bp) == 0 ||
1091			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
1092		}
1093	}
1094
1095	/*
1096	 * Some child I/O has indicated that a retry is necessary, so
1097	 * we set an error on the I/O and let the logic below do the
1098	 * rest.
1099	 */
1100	if (zio->io_flags & ZIO_FLAG_WRITE_RETRY)
1101		zio->io_error = ERESTART;
1102
1103	if (vd != NULL)
1104		vdev_stat_update(zio);
1105
1106	if (zio->io_error) {
1107		/*
1108		 * If this I/O is attached to a particular vdev,
1109		 * generate an error message describing the I/O failure
1110		 * at the block level.  We ignore these errors if the
1111		 * device is currently unavailable.
1112		 */
1113		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
1114			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
1115
1116		if ((zio->io_error == EIO ||
1117		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
1118		    zio->io_logical == zio) {
1119			/*
1120			 * For root I/O requests, tell the SPA to log the error
1121			 * appropriately.  Also, generate a logical data
1122			 * ereport.
1123			 */
1124			spa_log_error(spa, zio);
1125
1126			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
1127			    0, 0);
1128		}
1129
1130		/*
1131		 * If we are an allocating I/O then we attempt to reissue
1132		 * the I/O on another vdev unless the pool is out of space.
1133		 * We handle this condition based on the spa's failmode
1134		 * property.
1135		 */
1136		if (zio_write_retry && zio->io_error != ENOSPC &&
1137		    IO_IS_ALLOCATING(zio)) {
1138			zio_vdev_retry_io(zio);
1139			return;
1140		}
1141		ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
1142
1143		/*
1144		 * For I/O requests that cannot fail, we carry out
1145		 * the requested behavior based on the failmode pool
1146		 * property.
1147		 *
1148		 * XXX - Need to differentiate between an ENOSPC as
1149		 * a result of vdev failures vs. a full pool.
1150		 */
1151		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
1152			char *blkbuf;
1153
1154#ifdef ZFS_DEBUG
1155			blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
1156			if (blkbuf) {
1157				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
1158				    bp ? bp : &zio->io_bp_copy);
1159			}
1160			cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p "
1161			    "%s): error %d", zio->io_error == ECKSUM ?
1162			    "bad checksum" : "I/O failure",
1163			    zio_type_name[zio->io_type],
1164			    vdev_description(vd),
1165			    (u_longlong_t)zio->io_offset,
1166			    (void *)zio, blkbuf ? blkbuf : "", zio->io_error);
1167#endif
1168
1169			if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
1170				fm_panic("Pool '%s' has encountered an "
1171				    "uncorrectable I/O failure and the "
1172				    "failure mode property for this pool "
1173				    "is set to panic.", spa_name(spa));
1174			} else {
1175				cmn_err(CE_WARN, "Pool '%s' has encountered "
1176				    "an uncorrectable I/O error. Manual "
1177				    "intervention is required.",
1178				    spa_name(spa));
1179				zio_vdev_suspend_io(zio);
1180			}
1181			return;
1182		}
1183	}
1184	ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
1185	ASSERT(zio->io_children_notready == 0);
1186	zio_next_stage(zio);
1187}
1188
1189static void
1190zio_done(zio_t *zio)
1191{
1192	zio_t *pio = zio->io_parent;
1193	spa_t *spa = zio->io_spa;
1194
1195	ASSERT(zio->io_children_notready == 0);
1196	ASSERT(zio->io_children_notdone == 0);
1197
1198	zio_clear_transform_stack(zio);
1199
1200	if (zio->io_done)
1201		zio->io_done(zio);
1202
1203	ASSERT(zio->io_delegate_list == NULL);
1204	ASSERT(zio->io_delegate_next == NULL);
1205
1206	if (pio != NULL) {
1207		zio_t *next, *prev;
1208
1209		mutex_enter(&pio->io_lock);
1210		next = zio->io_sibling_next;
1211		prev = zio->io_sibling_prev;
1212		if (next != NULL)
1213			next->io_sibling_prev = prev;
1214		if (prev != NULL)
1215			prev->io_sibling_next = next;
1216		if (pio->io_child == zio)
1217			pio->io_child = next;
1218		mutex_exit(&pio->io_lock);
1219
1220		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
1221		    &pio->io_children_notdone);
1222	}
1223
1224	/*
1225	 * Note: this I/O is now done, and will shortly be freed, so there is no
1226	 * need to clear this (or any other) flag.
1227	 */
1228	if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
1229		spa_config_exit(spa, zio);
1230
1231	if (zio->io_waiter != NULL) {
1232		mutex_enter(&zio->io_lock);
1233		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1234		zio->io_stalled = zio->io_stage;
1235		cv_broadcast(&zio->io_cv);
1236		mutex_exit(&zio->io_lock);
1237	} else {
1238		mutex_destroy(&zio->io_lock);
1239		cv_destroy(&zio->io_cv);
1240		kmem_cache_free(zio_cache, zio);
1241	}
1242}
1243
1244/*
1245 * ==========================================================================
1246 * Compression support
1247 * ==========================================================================
1248 */
1249static void
1250zio_write_compress(zio_t *zio)
1251{
1252	int compress = zio->io_compress;
1253	blkptr_t *bp = zio->io_bp;
1254	void *cbuf;
1255	uint64_t lsize = zio->io_size;
1256	uint64_t csize = lsize;
1257	uint64_t cbufsize = 0;
1258	int pass;
1259
1260	if (bp->blk_birth == zio->io_txg) {
1261		/*
1262		 * We're rewriting an existing block, which means we're
1263		 * working on behalf of spa_sync().  For spa_sync() to
1264		 * converge, it must eventually be the case that we don't
1265		 * have to allocate new blocks.  But compression changes
1266		 * the blocksize, which forces a reallocate, and makes
1267		 * convergence take longer.  Therefore, after the first
1268		 * few passes, stop compressing to ensure convergence.
1269		 */
1270		pass = spa_sync_pass(zio->io_spa);
1271		if (pass > zio_sync_pass.zp_dontcompress)
1272			compress = ZIO_COMPRESS_OFF;
1273	} else {
1274		ASSERT(BP_IS_HOLE(bp));
1275		pass = 1;
1276	}
1277
1278	if (compress != ZIO_COMPRESS_OFF)
1279		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
1280		    &cbuf, &csize, &cbufsize))
1281			compress = ZIO_COMPRESS_OFF;
1282
1283	if (compress != ZIO_COMPRESS_OFF && csize != 0)
1284		zio_push_transform(zio, cbuf, csize, cbufsize);
1285
1286	/*
1287	 * The final pass of spa_sync() must be all rewrites, but the first
1288	 * few passes offer a trade-off: allocating blocks defers convergence,
1289	 * but newly allocated blocks are sequential, so they can be written
1290	 * to disk faster.  Therefore, we allow the first few passes of
1291	 * spa_sync() to reallocate new blocks, but force rewrites after that.
1292	 * There should only be a handful of blocks after pass 1 in any case.
1293	 */
1294	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
1295	    pass > zio_sync_pass.zp_rewrite) {
1296		ASSERT(csize != 0);
1297		BP_SET_LSIZE(bp, lsize);
1298		BP_SET_COMPRESS(bp, compress);
1299		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
1300	} else {
1301		if (bp->blk_birth == zio->io_txg)
1302			BP_ZERO(bp);
1303		if (csize == 0) {
1304			BP_ZERO(bp);
1305			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
1306		} else {
1307			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
1308			BP_SET_LSIZE(bp, lsize);
1309			BP_SET_PSIZE(bp, csize);
1310			BP_SET_COMPRESS(bp, compress);
1311			zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
1312		}
1313	}
1314
1315	zio_next_stage(zio);
1316}
1317
1318static void
1319zio_read_decompress(zio_t *zio)
1320{
1321	blkptr_t *bp = zio->io_bp;
1322	void *data;
1323	uint64_t size;
1324	uint64_t bufsize;
1325	int compress = BP_GET_COMPRESS(bp);
1326
1327	ASSERT(compress != ZIO_COMPRESS_OFF);
1328
1329	zio_pop_transform(zio, &data, &size, &bufsize);
1330
1331	if (zio_decompress_data(compress, data, size,
1332	    zio->io_data, zio->io_size))
1333		zio->io_error = EIO;
1334
1335	zio_buf_free(data, bufsize);
1336
1337	zio_next_stage(zio);
1338}
1339
1340/*
1341 * ==========================================================================
1342 * Gang block support
1343 * ==========================================================================
1344 */
1345static void
1346zio_gang_pipeline(zio_t *zio)
1347{
1348	/*
1349	 * By default, the pipeline assumes that we're dealing with a gang
1350	 * block.  If we're not, strip out any gang-specific stages.
1351	 */
1352	if (!BP_IS_GANG(zio->io_bp))
1353		zio->io_pipeline &= ~ZIO_GANG_STAGES;
1354
1355	zio_next_stage(zio);
1356}
1357
1358static void
1359zio_gang_byteswap(zio_t *zio)
1360{
1361	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1362
1363	if (BP_SHOULD_BYTESWAP(zio->io_bp))
1364		byteswap_uint64_array(zio->io_data, zio->io_size);
1365}
1366
1367static void
1368zio_get_gang_header(zio_t *zio)
1369{
1370	blkptr_t *bp = zio->io_bp;
1371	uint64_t gsize = SPA_GANGBLOCKSIZE;
1372	void *gbuf = zio_buf_alloc(gsize);
1373
1374	ASSERT(BP_IS_GANG(bp));
1375
1376	zio_push_transform(zio, gbuf, gsize, gsize);
1377
1378	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
1379	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
1380	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
1381	    ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));
1382
1383	zio_wait_children_done(zio);
1384}
1385
1386static void
1387zio_read_gang_members(zio_t *zio)
1388{
1389	zio_gbh_phys_t *gbh;
1390	uint64_t gsize, gbufsize, loff, lsize;
1391	int i;
1392
1393	ASSERT(BP_IS_GANG(zio->io_bp));
1394
1395	zio_gang_byteswap(zio);
1396	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1397
1398	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
1399		blkptr_t *gbp = &gbh->zg_blkptr[i];
1400		lsize = BP_GET_PSIZE(gbp);
1401
1402		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
1403		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
1404		ASSERT3U(loff + lsize, <=, zio->io_size);
1405		ASSERT(i < SPA_GBH_NBLKPTRS);
1406		ASSERT(!BP_IS_HOLE(gbp));
1407
1408		zio_nowait(zio_read(zio, zio->io_spa, gbp,
1409		    (char *)zio->io_data + loff, lsize, NULL, NULL,
1410		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
1411		    &zio->io_bookmark));
1412	}
1413
1414	zio_buf_free(gbh, gbufsize);
1415	zio_wait_children_done(zio);
1416}
1417
1418static void
1419zio_rewrite_gang_members(zio_t *zio)
1420{
1421	zio_gbh_phys_t *gbh;
1422	uint64_t gsize, gbufsize, loff, lsize;
1423	int i;
1424
1425	ASSERT(BP_IS_GANG(zio->io_bp));
1426	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
1427
1428	zio_gang_byteswap(zio);
1429	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1430
1431	ASSERT(gsize == gbufsize);
1432
1433	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
1434		blkptr_t *gbp = &gbh->zg_blkptr[i];
1435		lsize = BP_GET_PSIZE(gbp);
1436
1437		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
1438		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
1439		ASSERT3U(loff + lsize, <=, zio->io_size);
1440		ASSERT(i < SPA_GBH_NBLKPTRS);
1441		ASSERT(!BP_IS_HOLE(gbp));
1442
1443		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
1444		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
1445		    NULL, NULL, zio->io_priority, zio->io_flags,
1446		    &zio->io_bookmark));
1447	}
1448
1449	zio_push_transform(zio, gbh, gsize, gbufsize);
1450	zio_wait_children_ready(zio);
1451}
1452
1453static void
1454zio_free_gang_members(zio_t *zio)
1455{
1456	zio_gbh_phys_t *gbh;
1457	uint64_t gsize, gbufsize;
1458	int i;
1459
1460	ASSERT(BP_IS_GANG(zio->io_bp));
1461
1462	zio_gang_byteswap(zio);
1463	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1464
1465	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1466		blkptr_t *gbp = &gbh->zg_blkptr[i];
1467
1468		if (BP_IS_HOLE(gbp))
1469			continue;
1470		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
1471		    gbp, NULL, NULL));
1472	}
1473
1474	zio_buf_free(gbh, gbufsize);
1475	zio_next_stage(zio);
1476}
1477
1478static void
1479zio_claim_gang_members(zio_t *zio)
1480{
1481	zio_gbh_phys_t *gbh;
1482	uint64_t gsize, gbufsize;
1483	int i;
1484
1485	ASSERT(BP_IS_GANG(zio->io_bp));
1486
1487	zio_gang_byteswap(zio);
1488	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1489
1490	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1491		blkptr_t *gbp = &gbh->zg_blkptr[i];
1492		if (BP_IS_HOLE(gbp))
1493			continue;
1494		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
1495		    gbp, NULL, NULL));
1496	}
1497
1498	zio_buf_free(gbh, gbufsize);
1499	zio_next_stage(zio);
1500}
1501
1502static void
1503zio_write_allocate_gang_member_done(zio_t *zio)
1504{
1505	zio_t *pio = zio->io_parent;
1506	dva_t *cdva = zio->io_bp->blk_dva;
1507	dva_t *pdva = pio->io_bp->blk_dva;
1508	uint64_t asize;
1509	int d;
1510
1511	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
1512	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1513	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
1514	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
1515
1516	mutex_enter(&pio->io_lock);
1517	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
1518		ASSERT(DVA_GET_GANG(&pdva[d]));
1519		asize = DVA_GET_ASIZE(&pdva[d]);
1520		asize += DVA_GET_ASIZE(&cdva[d]);
1521		DVA_SET_ASIZE(&pdva[d], asize);
1522	}
1523	mutex_exit(&pio->io_lock);
1524}
1525
/*
 * Write the data of 'zio' as a gang block: allocate a gang header from
 * metaslab class 'mc' and issue child writes for the pieces of the data.
 *
 * Returns 0 once the children have been issued and the zio has been set
 * to wait for them, or the error from metaslab_alloc() if either the
 * header or a minimum-sized member could not be allocated.  On an error
 * return the zio has NOT been advanced; the caller must handle that.
 *
 * NOTE(review): on the error return inside the member loop, 'gbh' has been
 * allocated but is neither freed nor pushed onto the transform stack, and
 * children issued on earlier iterations were handed block pointers that
 * live inside it.  This looks like a leak that cannot be fixed by simply
 * freeing the buffer -- confirm and handle together with the XXX below.
 */
static int
zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	spa_t *spa = zio->io_spa;
	zio_gbh_phys_t *gbh;
	uint64_t txg = zio->io_txg;
	uint64_t resid = zio->io_size;
	/* Start by trying to place the data in two halves. */
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;
	int ndvas = zio->io_ndvas;
	/* The header gets one more copy than the data, up to the pool max. */
	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
	int error;
	int i, d;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	/* Allocate space for the gang header itself. */
	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
	    B_FALSE);
	if (error)
		return (error);

	for (d = 0; d < gbh_ndvas; d++)
		DVA_SET_GANG(&dva[d], 1);

	bp->blk_birth = txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	/* We need to test multi-level gang blocks */
	if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
		maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);

	/*
	 * Carve the data into members, one header block pointer per member,
	 * until all io_size bytes are accounted for.
	 */
	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = gbp->blk_dva;

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		/*
		 * As long as the remaining data would fit in the remaining
		 * header slots at the current member size, try to allocate
		 * a member of that size, halving the size on each ENOSPC.
		 */
		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
			    txg, bp, B_FALSE);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			/* XXX - free up previous allocations? */
			if (maxalloc == SPA_MINBLOCKSIZE)
				return (error);
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			/*
			 * The loop above allocated a member of maxalloc
			 * bytes; fill in its block pointer and write into
			 * the space we already hold.
			 */
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = txg;
			zio_nowait(zio_rewrite(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
			    &zio->io_bookmark));
		} else {
			/*
			 * Members of the current size can no longer cover
			 * the remaining data, so give this member an even
			 * share of what is left and let the child do its
			 * own allocation.
			 */
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	/* The header now needs its own checksum stage. */
	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);
	/*
	 * As much as we'd like this to be zio_wait_children_ready(),
	 * updating our ASIZE doesn't happen until the io_done callback,
	 * so we have to wait for that to finish in order for our BP
	 * to be stable.
	 */
	zio_wait_children_done(zio);
	return (0);
}
1622
1623/*
1624 * ==========================================================================
1625 * Allocate and free blocks
1626 * ==========================================================================
1627 */
1628static void
1629zio_dva_allocate(zio_t *zio)
1630{
1631	spa_t *spa = zio->io_spa;
1632	metaslab_class_t *mc = spa->spa_normal_class;
1633	blkptr_t *bp = zio->io_bp;
1634	int error;
1635
1636	ASSERT(BP_IS_HOLE(bp));
1637	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
1638	ASSERT3U(zio->io_ndvas, >, 0);
1639	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));
1640
1641	/* For testing, make some blocks above a certain size be gang blocks */
1642	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
1643		error = zio_write_allocate_gang_members(zio, mc);
1644		if (error)
1645			zio->io_error = error;
1646		return;
1647	}
1648
1649	/*
1650	 * For testing purposes, we force I/Os to retry. We don't allow
1651	 * retries beyond the first pass since those I/Os are non-allocating
1652	 * writes.
1653	 */
1654	if (zio_io_fail_shift &&
1655	    spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite &&
1656	    zio_io_should_fail(zio_io_fail_shift))
1657		zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
1658
1659	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1660
1661	error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas,
1662	    zio->io_txg, NULL, B_FALSE);
1663
1664	if (error == 0) {
1665		bp->blk_birth = zio->io_txg;
1666	} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
1667		error = zio_write_allocate_gang_members(zio, mc);
1668		if (error == 0)
1669			return;
1670		zio->io_error = error;
1671	} else {
1672		zio->io_error = error;
1673	}
1674	zio_next_stage(zio);
1675}
1676
1677static void
1678zio_dva_free(zio_t *zio)
1679{
1680	blkptr_t *bp = zio->io_bp;
1681
1682	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
1683
1684	BP_ZERO(bp);
1685
1686	zio_next_stage(zio);
1687}
1688
1689static void
1690zio_dva_claim(zio_t *zio)
1691{
1692	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
1693
1694	zio_next_stage(zio);
1695}
1696
1697/*
1698 * ==========================================================================
1699 * Read and write to physical devices
1700 * ==========================================================================
1701 */
1702
1703static void
1704zio_vdev_io_start(zio_t *zio)
1705{
1706	vdev_t *vd = zio->io_vd;
1707	vdev_t *tvd = vd ? vd->vdev_top : NULL;
1708	blkptr_t *bp = zio->io_bp;
1709	uint64_t align;
1710	spa_t *spa = zio->io_spa;
1711
1712	/*
1713	 * If the pool is already in a failure state then just suspend
1714	 * this IO until the problem is resolved. We will reissue them
1715	 * at that time.
1716	 */
1717	if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
1718	    zio->io_type == ZIO_TYPE_WRITE) {
1719		zio_vdev_suspend_io(zio);
1720		return;
1721	}
1722
1723	if (vd == NULL) {
1724		/* The mirror_ops handle multiple DVAs in a single BP */
1725		vdev_mirror_ops.vdev_op_io_start(zio);
1726		return;
1727	}
1728
1729	align = 1ULL << tvd->vdev_ashift;
1730
1731	if (zio->io_retries == 0 && vd == tvd)
1732		zio->io_flags |= ZIO_FLAG_FAILFAST;
1733
1734	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
1735	    vd->vdev_children == 0) {
1736		zio->io_flags |= ZIO_FLAG_PHYSICAL;
1737		zio->io_offset += VDEV_LABEL_START_SIZE;
1738	}
1739
1740	if (P2PHASE(zio->io_size, align) != 0) {
1741		uint64_t asize = P2ROUNDUP(zio->io_size, align);
1742		char *abuf = zio_buf_alloc(asize);
1743		ASSERT(vd == tvd);
1744		if (zio->io_type == ZIO_TYPE_WRITE) {
1745			bcopy(zio->io_data, abuf, zio->io_size);
1746			bzero(abuf + zio->io_size, asize - zio->io_size);
1747		}
1748		zio_push_transform(zio, abuf, asize, asize);
1749		ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
1750		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
1751	}
1752
1753	ASSERT(P2PHASE(zio->io_offset, align) == 0);
1754	ASSERT(P2PHASE(zio->io_size, align) == 0);
1755	ASSERT(bp == NULL ||
1756	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
1757	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
1758
1759	vdev_io_start(zio);
1760
1761	/* zio_next_stage_async() gets called from io completion interrupt */
1762}
1763
1764static void
1765zio_vdev_io_done(zio_t *zio)
1766{
1767	if (zio->io_vd == NULL)
1768		/* The mirror_ops handle multiple DVAs in a single BP */
1769		vdev_mirror_ops.vdev_op_io_done(zio);
1770	else
1771		vdev_io_done(zio);
1772}
1773
1774/* XXPOLICY */
1775boolean_t
1776zio_should_retry(zio_t *zio)
1777{
1778	vdev_t *vd = zio->io_vd;
1779
1780	if (zio->io_error == 0)
1781		return (B_FALSE);
1782	if (zio->io_delegate_list != NULL)
1783		return (B_FALSE);
1784	if (vd && vd != vd->vdev_top)
1785		return (B_FALSE);
1786	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
1787		return (B_FALSE);
1788	if (zio->io_retries > 0)
1789		return (B_FALSE);
1790
1791	return (B_TRUE);
1792}
1793
1794static void
1795zio_vdev_io_assess(zio_t *zio)
1796{
1797	vdev_t *vd = zio->io_vd;
1798	vdev_t *tvd = vd ? vd->vdev_top : NULL;
1799
1800	ASSERT(zio->io_vsd == NULL);
1801
1802	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
1803		void *abuf;
1804		uint64_t asize;
1805		ASSERT(vd == tvd);
1806		zio_pop_transform(zio, &abuf, &asize, &asize);
1807		if (zio->io_type == ZIO_TYPE_READ)
1808			bcopy(abuf, zio->io_data, zio->io_size);
1809		zio_buf_free(abuf, asize);
1810		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
1811	}
1812
1813	if (zio_injection_enabled && !zio->io_error)
1814		zio->io_error = zio_handle_fault_injection(zio, EIO);
1815
1816	/*
1817	 * If the I/O failed, determine whether we should attempt to retry it.
1818	 */
1819	/* XXPOLICY */
1820	if (zio_should_retry(zio)) {
1821		ASSERT(tvd == vd);
1822
1823		zio->io_retries++;
1824		zio->io_error = 0;
1825		zio->io_flags &= ZIO_FLAG_VDEV_INHERIT |
1826		    ZIO_FLAG_CONFIG_GRABBED;
1827		/* XXPOLICY */
1828		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
1829		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1830		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
1831
1832		dprintf("retry #%d for %s to %s offset %llx\n",
1833		    zio->io_retries, zio_type_name[zio->io_type],
1834		    vdev_description(vd), zio->io_offset);
1835
1836		zio_next_stage_async(zio);
1837		return;
1838	}
1839
1840	zio_next_stage(zio);
1841}
1842
1843void
1844zio_vdev_io_reissue(zio_t *zio)
1845{
1846	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1847	ASSERT(zio->io_error == 0);
1848
1849	zio->io_stage--;
1850}
1851
1852void
1853zio_vdev_io_redone(zio_t *zio)
1854{
1855	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
1856
1857	zio->io_stage--;
1858}
1859
1860void
1861zio_vdev_io_bypass(zio_t *zio)
1862{
1863	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1864	ASSERT(zio->io_error == 0);
1865
1866	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
1867	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
1868}
1869
1870/*
1871 * ==========================================================================
1872 * Generate and verify checksums
1873 * ==========================================================================
1874 */
1875static void
1876zio_checksum_generate(zio_t *zio)
1877{
1878	int checksum = zio->io_checksum;
1879	blkptr_t *bp = zio->io_bp;
1880
1881	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1882
1883	BP_SET_CHECKSUM(bp, checksum);
1884	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1885
1886	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
1887
1888	zio_next_stage(zio);
1889}
1890
1891static void
1892zio_gang_checksum_generate(zio_t *zio)
1893{
1894	zio_cksum_t zc;
1895	zio_gbh_phys_t *gbh = zio->io_data;
1896
1897	ASSERT(BP_IS_GANG(zio->io_bp));
1898	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
1899
1900	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
1901
1902	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
1903
1904	zio_next_stage(zio);
1905}
1906
1907static void
1908zio_checksum_verify(zio_t *zio)
1909{
1910	if (zio->io_bp != NULL) {
1911		zio->io_error = zio_checksum_error(zio);
1912		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
1913			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1914			    zio->io_spa, zio->io_vd, zio, 0, 0);
1915	}
1916
1917	zio_next_stage(zio);
1918}
1919
1920/*
1921 * Called by RAID-Z to ensure we don't compute the checksum twice.
1922 */
1923void
1924zio_checksum_verified(zio_t *zio)
1925{
1926	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
1927}
1928
1929/*
1930 * Set the external verifier for a gang block based on stuff in the bp
1931 */
1932void
1933zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
1934{
1935	blkptr_t *bp = zio->io_bp;
1936
1937	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
1938	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
1939	zcp->zc_word[2] = bp->blk_birth;
1940	zcp->zc_word[3] = 0;
1941}
1942
1943/*
1944 * ==========================================================================
1945 * Define the pipeline
1946 * ==========================================================================
1947 */
/* Signature shared by every stage handler stored in zio_pipeline[]. */
typedef void zio_pipe_stage_t(zio_t *zio);
1949
1950static void
1951zio_badop(zio_t *zio)
1952{
1953	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
1954}
1955
/*
 * Stage dispatch table, indexed by zio->io_stage (see zio_next_stage()
 * and zio_next_stage_async()).  The first and last slots hold zio_badop,
 * which panics if an I/O is ever dispatched to them.  The order of the
 * entries is significant: it must match the numbering used for io_stage
 * (presumably the ZIO_STAGE_* values -- confirm against zio_impl.h before
 * adding, removing or reordering entries).
 */
zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	zio_badop,
	zio_wait_children_ready,
	zio_write_compress,
	zio_checksum_generate,
	zio_gang_pipeline,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_read_init,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_assess,
	zio_done,
	zio_badop
};
1983
1984/*
1985 * Move an I/O to the next stage of the pipeline and execute that stage.
1986 * There's no locking on io_stage because there's no legitimate way for
1987 * multiple threads to be attempting to process the same I/O.
1988 */
1989void
1990zio_next_stage(zio_t *zio)
1991{
1992	uint32_t pipeline = zio->io_pipeline;
1993
1994	ASSERT(!MUTEX_HELD(&zio->io_lock));
1995
1996	if (zio->io_error) {
1997		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
1998		    zio, vdev_description(zio->io_vd),
1999		    zio->io_offset, zio->io_stage, zio->io_error);
2000		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
2001			pipeline &= ZIO_ERROR_PIPELINE_MASK;
2002	}
2003
2004	while (((1U << ++zio->io_stage) & pipeline) == 0)
2005		continue;
2006
2007	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
2008	ASSERT(zio->io_stalled == 0);
2009
2010	/*
2011	 * See the comment in zio_next_stage_async() about per-CPU taskqs.
2012	 */
2013	if (((1U << zio->io_stage) & zio->io_async_stages) &&
2014	    (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
2015	    !(zio->io_flags & ZIO_FLAG_METADATA)) {
2016		taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
2017		(void) taskq_dispatch(tq,
2018		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
2019	} else {
2020		zio_pipeline[zio->io_stage](zio);
2021	}
2022}
2023
2024void
2025zio_next_stage_async(zio_t *zio)
2026{
2027	taskq_t *tq;
2028	uint32_t pipeline = zio->io_pipeline;
2029
2030	ASSERT(!MUTEX_HELD(&zio->io_lock));
2031
2032	if (zio->io_error) {
2033		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
2034		    zio, vdev_description(zio->io_vd),
2035		    zio->io_offset, zio->io_stage, zio->io_error);
2036		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
2037			pipeline &= ZIO_ERROR_PIPELINE_MASK;
2038	}
2039
2040	while (((1U << ++zio->io_stage) & pipeline) == 0)
2041		continue;
2042
2043	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
2044	ASSERT(zio->io_stalled == 0);
2045
2046	/*
2047	 * For performance, we'll probably want two sets of task queues:
2048	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
2049	 * part is for read performance: since we have to make a pass over
2050	 * the data to checksum it anyway, we want to do this on the same CPU
2051	 * that issued the read, because (assuming CPU scheduling affinity)
2052	 * that thread is probably still there.  Getting this optimization
2053	 * right avoids performance-hostile cache-to-cache transfers.
2054	 *
2055	 * Note that having two sets of task queues is also necessary for
2056	 * correctness: if all of the issue threads get bogged down waiting
2057	 * for dependent reads (e.g. metaslab freelist) to complete, then
2058	 * there won't be any threads available to service I/O completion
2059	 * interrupts.
2060	 */
2061	if ((1U << zio->io_stage) & zio->io_async_stages) {
2062		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
2063			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
2064		else
2065			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
2066		(void) taskq_dispatch(tq,
2067		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
2068	} else {
2069		zio_pipeline[zio->io_stage](zio);
2070	}
2071}
2072
2073void
2074zio_resubmit_stage_async(void *arg)
2075{
2076	zio_t *zio = (zio_t *)(uintptr_t)arg;
2077
2078	zio_next_stage_async(zio);
2079}
2080
2081static boolean_t
2082zio_io_should_fail(uint16_t range)
2083{
2084	static uint16_t	allocs = 0;
2085
2086	return (P2PHASE(allocs++, 1U<<range) == 0);
2087}
2088
2089/*
2090 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2091 */
2092int
2093zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
2094    uint64_t txg)
2095{
2096	int error;
2097
2098	spa_config_enter(spa, RW_READER, FTAG);
2099
2100	if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) {
2101		spa_config_exit(spa, FTAG);
2102		return (ENOSPC);
2103	}
2104
2105	/*
2106	 * We were passed the previous log block's DVA in bp->blk_dva[0].
2107	 * We use that as a hint for which vdev to allocate from next.
2108	 */
2109	error = metaslab_alloc(spa, spa->spa_log_class, size,
2110	    new_bp, 1, txg, old_bp, B_TRUE);
2111
2112	if (error)
2113		error = metaslab_alloc(spa, spa->spa_normal_class, size,
2114		    new_bp, 1, txg, old_bp, B_TRUE);
2115
2116	if (error == 0) {
2117		BP_SET_LSIZE(new_bp, size);
2118		BP_SET_PSIZE(new_bp, size);
2119		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2120		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
2121		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2122		BP_SET_LEVEL(new_bp, 0);
2123		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2124		new_bp->blk_birth = txg;
2125	}
2126
2127	spa_config_exit(spa, FTAG);
2128
2129	return (error);
2130}
2131
2132/*
2133 * Free an intent log block.  We know it can't be a gang block, so there's
2134 * nothing to do except metaslab_free() it.
2135 */
2136void
2137zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
2138{
2139	ASSERT(!BP_IS_GANG(bp));
2140
2141	spa_config_enter(spa, RW_READER, FTAG);
2142
2143	metaslab_free(spa, bp, txg, B_FALSE);
2144
2145	spa_config_exit(spa, FTAG);
2146}
2147
2148/*
2149 * start an async flush of the write cache for this vdev
2150 */
2151void
2152zio_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio)
2153{
2154	vdev_t *vd;
2155
2156	/*
2157	 * Lock out configuration changes.
2158	 */
2159	spa_config_enter(spa, RW_READER, FTAG);
2160
2161	if (*zio == NULL)
2162		*zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
2163
2164	vd = vdev_lookup_top(spa, vdev);
2165	ASSERT(vd);
2166
2167	(void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE,
2168	    NULL, NULL, ZIO_PRIORITY_NOW,
2169	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
2170
2171	spa_config_exit(spa, FTAG);
2172}
2173