dmu_traverse.c revision 6423:437422a29d3a
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/zfs_context.h>
29#include <sys/dmu_objset.h>
30#include <sys/dmu_traverse.h>
31#include <sys/dsl_dataset.h>
32#include <sys/dsl_dir.h>
33#include <sys/dsl_pool.h>
34#include <sys/dnode.h>
35#include <sys/spa.h>
36#include <sys/zio.h>
37#include <sys/dmu_impl.h>
38#include <sys/zvol.h>
39
40#define	BP_SPAN_SHIFT(level, width)	((level) * (width))
41
42#define	BP_EQUAL(b1, b2)				\
43	(DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) &&	\
44	(b1)->blk_birth == (b2)->blk_birth)
45
46/*
47 * Compare two bookmarks.
48 *
49 * For ADVANCE_PRE, the visitation order is:
50 *
51 *	objset 0, 1, 2, ..., ZB_MAXOBJSET.
52 *	object 0, 1, 2, ..., ZB_MAXOBJECT.
53 *	blkoff 0, 1, 2, ...
54 *	level ZB_MAXLEVEL, ..., 2, 1, 0.
55 *
56 * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
57 * ordering vector is:
58 *
59 *	< objset, object, blkoff, -level >
60 *
61 * For ADVANCE_POST, the starting offsets aren't sequential but ending
62 * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
63 * The visitation order is:
64 *
65 *	objset 1, 2, ..., ZB_MAXOBJSET, 0.
66 *	object 1, 2, ..., ZB_MAXOBJECT, 0.
67 *	blkoff 1, 2, ...
68 *	level 0, 1, 2, ..., ZB_MAXLEVEL.
69 *
70 * and thus a valid ordering vector is:
71 *
72 *	< objset - 1, object - 1, blkoff, level >
73 *
74 * Both orderings can be expressed as:
75 *
76 *	< objset + bias, object + bias, blkoff, level ^ bias >
77 *
78 * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
79 * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
80 *
81 * Special case: an objset's osphys is represented as level -1 of object 0.
82 * It is always either the very first or very last block we visit in an objset.
83 * Therefore, if either bookmark's level is -1, level alone determines order.
84 */
85static int
86compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
87    int advance)
88{
89	int bias = (advance & ADVANCE_PRE) ? 0 : -1;
90	uint64_t sblkoff, eblkoff;
91	int slevel, elevel, wshift;
92
93	if (szb->zb_objset + bias < ezb->zb_objset + bias)
94		return (-1);
95
96	if (szb->zb_objset + bias > ezb->zb_objset + bias)
97		return (1);
98
99	slevel = szb->zb_level;
100	elevel = ezb->zb_level;
101
102	if ((slevel | elevel) < 0)
103		return ((slevel ^ bias) - (elevel ^ bias));
104
105	if (szb->zb_object + bias < ezb->zb_object + bias)
106		return (-1);
107
108	if (szb->zb_object + bias > ezb->zb_object + bias)
109		return (1);
110
111	if (dnp == NULL)
112		return (0);
113
114	wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
115
116	sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
117	eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
118
119	if (sblkoff < eblkoff)
120		return (-1);
121
122	if (sblkoff > eblkoff)
123		return (1);
124
125	return ((elevel ^ bias) - (slevel ^ bias));
126}
127
128#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
129{							\
130	(zb)->zb_objset = objset;			\
131	(zb)->zb_object = object;			\
132	(zb)->zb_level = level;				\
133	(zb)->zb_blkid = blkid;				\
134}
135
136#define	SET_BOOKMARK_LB(zb, level, blkid)		\
137{							\
138	(zb)->zb_level = level;				\
139	(zb)->zb_blkid = blkid;				\
140}
141
142static int
143advance_objset(zseg_t *zseg, uint64_t objset, int advance)
144{
145	zbookmark_t *zb = &zseg->seg_start;
146
147	if (advance & ADVANCE_PRE) {
148		if (objset >= ZB_MAXOBJSET)
149			return (ERANGE);
150		SET_BOOKMARK(zb, objset, 0, -1, 0);
151	} else {
152		if (objset >= ZB_MAXOBJSET)
153			objset = 0;
154		SET_BOOKMARK(zb, objset, 1, 0, 0);
155	}
156
157	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
158		return (ERANGE);
159
160	return (EAGAIN);
161}
162
163static int
164advance_object(zseg_t *zseg, uint64_t object, int advance)
165{
166	zbookmark_t *zb = &zseg->seg_start;
167
168	if (advance & ADVANCE_PRE) {
169		if (object >= ZB_MAXOBJECT) {
170			SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
171		} else {
172			SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
173		}
174	} else {
175		if (zb->zb_object == 0) {
176			SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
177		} else {
178			if (object >= ZB_MAXOBJECT)
179				object = 0;
180			SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
181		}
182	}
183
184	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
185		return (ERANGE);
186
187	return (EAGAIN);
188}
189
190static int
191advance_from_osphys(zseg_t *zseg, int advance)
192{
193	zbookmark_t *zb = &zseg->seg_start;
194
195	ASSERT(zb->zb_object == 0);
196	ASSERT(zb->zb_level == -1);
197	ASSERT(zb->zb_blkid == 0);
198
199	if (advance & ADVANCE_PRE) {
200		SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
201	} else {
202		if (zb->zb_objset == 0)
203			return (ERANGE);
204		SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
205	}
206
207	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
208		return (ERANGE);
209
210	return (EAGAIN);
211}
212
213static int
214advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
215{
216	zbookmark_t *zb = &zseg->seg_start;
217	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
218	int maxlevel = dnp->dn_nlevels - 1;
219	int level = zb->zb_level;
220	uint64_t blkid = zb->zb_blkid;
221
222	if (advance & ADVANCE_PRE) {
223		if (level > 0 && rc == 0) {
224			level--;
225			blkid <<= wshift;
226		} else {
227			blkid++;
228
229			if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
230			    dnp->dn_maxblkid)
231				return (ERANGE);
232
233			while (level < maxlevel) {
234				if (P2PHASE(blkid, 1ULL << wshift))
235					break;
236				blkid >>= wshift;
237				level++;
238			}
239		}
240	} else {
241		if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
242			blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
243			level = 0;
244		} else {
245			blkid >>= wshift;
246			level++;
247		}
248
249		while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
250		    dnp->dn_maxblkid) {
251			if (level == maxlevel)
252				return (ERANGE);
253			blkid >>= wshift;
254			level++;
255		}
256	}
257	SET_BOOKMARK_LB(zb, level, blkid);
258
259	if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
260		return (ERANGE);
261
262	return (EAGAIN);
263}
264
265/*
266 * The traverse_callback function will call the function specified in th_func.
267 * In the event of an error the callee, specified by th_func, must return
268 * one of the following errors:
269 *
270 *	EINTR		- Indicates that the callee wants the traversal to
271 *			  abort immediately.
272 * 	ERESTART	- The callee has acknowledged the error and would
273 *			  like to continue.
274 */
275static int
276traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
277{
278	/*
279	 * Before we issue the callback, prune against maxtxg.
280	 *
281	 * We prune against mintxg before we get here because it's a big win.
282	 * If a given block was born in txg 37, then we know that the entire
283	 * subtree below that block must have been born in txg 37 or earlier.
284	 * We can therefore lop off huge branches of the tree as we go.
285	 *
286	 * There's no corresponding optimization for maxtxg because knowing
287	 * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
288	 * children.  In fact, the copy-on-write design of ZFS ensures that
289	 * top-level blocks will pretty much always be new.
290	 *
291	 * Therefore, in the name of simplicity we don't prune against
292	 * maxtxg until the last possible moment -- that being right now.
293	 */
294	if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
295		return (0);
296
297	/*
298	 * Debugging: verify that the order we visit things agrees with the
299	 * order defined by compare_bookmark().  We don't check this for
300	 * log blocks because there's no defined ordering for them; they're
301	 * always visited (or not) as part of visiting the objset_phys_t.
302	 */
303	if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
304		zbookmark_t *zb = &bc->bc_bookmark;
305		zbookmark_t *szb = &zseg->seg_start;
306		zbookmark_t *ezb = &zseg->seg_end;
307		zbookmark_t *lzb = &th->th_lastcb;
308		dnode_phys_t *dnp = bc->bc_dnode;
309
310		ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
311		ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
312		ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
313		    lzb->zb_level == ZB_NO_LEVEL);
314		*lzb = *zb;
315	}
316
317	th->th_callbacks++;
318	return (th->th_func(bc, th->th_spa, th->th_arg));
319}
320
321static int
322traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
323	dnode_phys_t *dnp)
324{
325	zbookmark_t *zb = &bc->bc_bookmark;
326	int error;
327
328	th->th_hits++;
329
330	bc->bc_dnode = dnp;
331	bc->bc_errno = 0;
332
333	if (BP_EQUAL(&bc->bc_blkptr, bp))
334		return (0);
335
336	bc->bc_blkptr = *bp;
337
338	if (bc->bc_data == NULL)
339		return (0);
340
341	if (BP_IS_HOLE(bp)) {
342		ASSERT(th->th_advance & ADVANCE_HOLES);
343		return (0);
344	}
345
346	if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
347		error = EIO;
348	} else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
349		error = 0;
350		th->th_arc_hits++;
351	} else {
352		error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
353		    BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
354		    th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
355
356		if (BP_SHOULD_BYTESWAP(bp) && error == 0)
357			(zb->zb_level > 0 ? byteswap_uint64_array :
358			    dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
359			    BP_GET_LSIZE(bp));
360		th->th_reads++;
361	}
362
363	if (error) {
364		bc->bc_errno = error;
365		error = traverse_callback(th, NULL, bc);
366		ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
367		bc->bc_blkptr.blk_birth = -1ULL;
368	}
369
370	dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
371	    bc - &th->th_cache[0][0], error,
372	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
373
374	return (error);
375}
376
377static int
378find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
379{
380	zbookmark_t *zb = &zseg->seg_start;
381	traverse_blk_cache_t *bc;
382	blkptr_t *bp = dnp->dn_blkptr;
383	int i, first, level;
384	int nbp = dnp->dn_nblkptr;
385	int minlevel = zb->zb_level;
386	int maxlevel = dnp->dn_nlevels - 1;
387	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
388	int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
389	uint64_t blkid = zb->zb_blkid >> bp_shift;
390	int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
391	int rc;
392
393	if (minlevel > maxlevel || blkid >= nbp)
394		return (ERANGE);
395
396	for (level = maxlevel; level >= minlevel; level--) {
397		first = P2PHASE(blkid, 1ULL << wshift);
398
399		for (i = first; i < nbp; i++)
400			if (bp[i].blk_birth > zseg->seg_mintxg ||
401			    BP_IS_HOLE(&bp[i]) && do_holes)
402				break;
403
404		if (i != first) {
405			i--;
406			SET_BOOKMARK_LB(zb, level, blkid + (i - first));
407			return (ENOTBLK);
408		}
409
410		bc = &th->th_cache[depth][level];
411
412		SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
413		    level, blkid);
414
415		if (rc = traverse_read(th, bc, bp + i, dnp)) {
416			if (rc != EAGAIN) {
417				SET_BOOKMARK_LB(zb, level, blkid);
418			}
419			return (rc);
420		}
421
422		if (BP_IS_HOLE(&bp[i])) {
423			SET_BOOKMARK_LB(zb, level, blkid);
424			th->th_lastcb.zb_level = ZB_NO_LEVEL;
425			return (0);
426		}
427
428		nbp = 1 << wshift;
429		bp = bc->bc_data;
430		bp_shift -= wshift;
431		blkid = zb->zb_blkid >> bp_shift;
432	}
433
434	return (0);
435}
436
437static int
438get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
439    uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
440{
441	zseg_t zseg;
442	zbookmark_t *zb = &zseg.seg_start;
443	uint64_t object = *objectp;
444	int i, rc;
445
446	SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
447	SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
448
449	zseg.seg_mintxg = txg;
450	zseg.seg_maxtxg = -1ULL;
451
452	for (;;) {
453		rc = find_block(th, &zseg, mdn, depth);
454
455		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
456			break;
457
458		if (rc == 0 && zb->zb_level == 0) {
459			dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
460			for (i = 0; i < DNODES_PER_BLOCK; i++) {
461				object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
462				if (object >= *objectp &&
463				    dnp[i].dn_type != DMU_OT_NONE &&
464				    (type == -1 || dnp[i].dn_type == type)) {
465					*objectp = object;
466					*dnpp = &dnp[i];
467					return (0);
468				}
469			}
470		}
471
472		rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
473
474		if (rc == ERANGE)
475			break;
476	}
477
478	if (rc == ERANGE)
479		*objectp = ZB_MAXOBJECT;
480
481	return (rc);
482}
483
484/* ARGSUSED */
485static void
486traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
487{
488	traverse_handle_t *th = arg;
489	traverse_blk_cache_t *bc = &th->th_zil_cache;
490	zbookmark_t *zb = &bc->bc_bookmark;
491	zseg_t *zseg = list_head(&th->th_seglist);
492
493	if (bp->blk_birth <= zseg->seg_mintxg)
494		return;
495
496	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
497		zb->zb_object = 0;
498		zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
499		bc->bc_blkptr = *bp;
500		(void) traverse_callback(th, zseg, bc);
501	}
502}
503
504/* ARGSUSED */
505static void
506traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
507{
508	traverse_handle_t *th = arg;
509	traverse_blk_cache_t *bc = &th->th_zil_cache;
510	zbookmark_t *zb = &bc->bc_bookmark;
511	zseg_t *zseg = list_head(&th->th_seglist);
512
513	if (lrc->lrc_txtype == TX_WRITE) {
514		lr_write_t *lr = (lr_write_t *)lrc;
515		blkptr_t *bp = &lr->lr_blkptr;
516
517		if (bp->blk_birth <= zseg->seg_mintxg)
518			return;
519
520		if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
521			zb->zb_object = lr->lr_foid;
522			zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
523			bc->bc_blkptr = *bp;
524			(void) traverse_callback(th, zseg, bc);
525		}
526	}
527}
528
529static void
530traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
531{
532	spa_t *spa = th->th_spa;
533	dsl_pool_t *dp = spa_get_dsl(spa);
534	objset_phys_t *osphys = bc->bc_data;
535	zil_header_t *zh = &osphys->os_zil_header;
536	uint64_t claim_txg = zh->zh_claim_txg;
537	zilog_t *zilog;
538
539	ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
540	ASSERT(bc->bc_bookmark.zb_level == -1);
541
542	/*
543	 * We only want to visit blocks that have been claimed but not yet
544	 * replayed (or, in read-only mode, blocks that *would* be claimed).
545	 */
546	if (claim_txg == 0 && (spa_mode & FWRITE))
547		return;
548
549	th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
550
551	zilog = zil_alloc(dp->dp_meta_objset, zh);
552
553	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
554	    claim_txg);
555
556	zil_free(zilog);
557}
558
559static int
560traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
561{
562	zbookmark_t *zb = &zseg->seg_start;
563	traverse_blk_cache_t *bc;
564	dnode_phys_t *dn, *dn_tmp;
565	int worklimit = 100;
566	int rc;
567
568	dprintf("<%llu, %llu, %d, %llx>\n",
569	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
570
571	bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
572	dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
573
574	SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
575
576	rc = traverse_read(th, bc, mosbp, dn);
577
578	if (rc)		/* If we get ERESTART, we've got nowhere left to go */
579		return (rc == ERESTART ? EINTR : rc);
580
581	ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
582
583	if (zb->zb_objset != 0) {
584		uint64_t objset = zb->zb_objset;
585		dsl_dataset_phys_t *dsp;
586
587		rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
588		    DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
589
590		if (objset != zb->zb_objset)
591			rc = advance_objset(zseg, objset, th->th_advance);
592
593		if (rc != 0)
594			return (rc);
595
596		dsp = DN_BONUS(dn_tmp);
597
598		bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
599		dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
600
601		SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
602
603		/*
604		 * If we're traversing an open snapshot, we know that it
605		 * can't be deleted (because it's open) and it can't change
606		 * (because it's a snapshot).  Therefore, once we've gotten
607		 * from the uberblock down to the snapshot's objset_phys_t,
608		 * we no longer need to synchronize with spa_sync(); we're
609		 * traversing a completely static block tree from here on.
610		 */
611		if (th->th_advance & ADVANCE_NOLOCK) {
612			ASSERT(th->th_locked);
613			rw_exit(spa_traverse_rwlock(th->th_spa));
614			th->th_locked = 0;
615		}
616
617		rc = traverse_read(th, bc, &dsp->ds_bp, dn);
618
619		if (rc != 0) {
620			if (rc == ERESTART)
621				rc = advance_objset(zseg, zb->zb_objset + 1,
622				    th->th_advance);
623			return (rc);
624		}
625
626		if (th->th_advance & ADVANCE_PRUNE)
627			zseg->seg_mintxg =
628			    MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
629	}
630
631	if (zb->zb_level == -1) {
632		ASSERT(zb->zb_object == 0);
633		ASSERT(zb->zb_blkid == 0);
634		ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
635
636		if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
637			rc = traverse_callback(th, zseg, bc);
638			if (rc) {
639				ASSERT(rc == EINTR);
640				return (rc);
641			}
642			if ((th->th_advance & ADVANCE_ZIL) &&
643			    zb->zb_objset != 0)
644				traverse_zil(th, bc);
645		}
646
647		return (advance_from_osphys(zseg, th->th_advance));
648	}
649
650	if (zb->zb_object != 0) {
651		uint64_t object = zb->zb_object;
652
653		rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
654		    zseg->seg_mintxg, -1, ZB_MDN_CACHE);
655
656		if (object != zb->zb_object)
657			rc = advance_object(zseg, object, th->th_advance);
658
659		if (rc != 0)
660			return (rc);
661
662		dn = dn_tmp;
663	}
664
665	if (zb->zb_level == ZB_MAXLEVEL)
666		zb->zb_level = dn->dn_nlevels - 1;
667
668	for (;;) {
669		rc = find_block(th, zseg, dn, ZB_DN_CACHE);
670
671		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
672			break;
673
674		if (rc == 0) {
675			bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
676			ASSERT(bc->bc_dnode == dn);
677			ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
678			rc = traverse_callback(th, zseg, bc);
679			if (rc) {
680				ASSERT(rc == EINTR);
681				return (rc);
682			}
683			if (BP_IS_HOLE(&bc->bc_blkptr)) {
684				ASSERT(th->th_advance & ADVANCE_HOLES);
685				rc = ENOTBLK;
686			}
687		}
688
689		rc = advance_block(zseg, dn, rc, th->th_advance);
690
691		if (rc == ERANGE)
692			break;
693
694		/*
695		 * Give spa_sync() a chance to run.
696		 */
697		if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
698			th->th_syncs++;
699			return (EAGAIN);
700		}
701
702		if (--worklimit == 0)
703			return (EAGAIN);
704	}
705
706	if (rc == ERANGE)
707		rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
708
709	return (rc);
710}
711
712/*
713 * It is the caller's responsibility to ensure that the dsl_dataset_t
714 * doesn't go away during traversal.
715 */
716int
717traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
718    blkptr_cb_t func, void *arg)
719{
720	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
721	traverse_handle_t *th;
722	int err;
723
724	th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
725
726	traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
727
728	while ((err = traverse_more(th)) == EAGAIN)
729		continue;
730
731	traverse_fini(th);
732	return (err);
733}
734
735int
736traverse_zvol(objset_t *os, int advance,  blkptr_cb_t func, void *arg)
737{
738	spa_t *spa = dmu_objset_spa(os);
739	traverse_handle_t *th;
740	int err;
741
742	th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL);
743
744	traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ);
745
746	while ((err = traverse_more(th)) == EAGAIN)
747		continue;
748
749	traverse_fini(th);
750	return (err);
751}
752
753int
754traverse_more(traverse_handle_t *th)
755{
756	zseg_t *zseg = list_head(&th->th_seglist);
757	uint64_t save_txg;	/* XXX won't be necessary with real itinerary */
758	krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
759	blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
760	int rc;
761
762	if (zseg == NULL)
763		return (0);
764
765	th->th_restarts++;
766
767	save_txg = zseg->seg_mintxg;
768
769	rw_enter(rw, RW_READER);
770	th->th_locked = 1;
771
772	rc = traverse_segment(th, zseg, mosbp);
773	ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
774
775	if (th->th_locked)
776		rw_exit(rw);
777	th->th_locked = 0;
778
779	zseg->seg_mintxg = save_txg;
780
781	if (rc == ERANGE) {
782		list_remove(&th->th_seglist, zseg);
783		kmem_free(zseg, sizeof (*zseg));
784		return (EAGAIN);
785	}
786
787	return (rc);
788}
789
790/*
791 * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
792 * are not included.  The blocks covered by this segment will all have
793 * mintxg < birth < maxtxg.
794 */
795static void
796traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
797    uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
798    uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
799{
800	zseg_t *zseg;
801
802	zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
803
804	zseg->seg_mintxg = mintxg;
805	zseg->seg_maxtxg = maxtxg;
806
807	zseg->seg_start.zb_objset = sobjset;
808	zseg->seg_start.zb_object = sobject;
809	zseg->seg_start.zb_level = slevel;
810	zseg->seg_start.zb_blkid = sblkid;
811
812	zseg->seg_end.zb_objset = eobjset;
813	zseg->seg_end.zb_object = eobject;
814	zseg->seg_end.zb_level = elevel;
815	zseg->seg_end.zb_blkid = eblkid;
816
817	list_insert_tail(&th->th_seglist, zseg);
818}
819
820void
821traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
822    uint64_t objset, uint64_t object)
823{
824	if (th->th_advance & ADVANCE_PRE)
825		traverse_add_segment(th, mintxg, maxtxg,
826		    objset, object, ZB_MAXLEVEL, 0,
827		    objset, object, 0, ZB_MAXBLKID);
828	else
829		traverse_add_segment(th, mintxg, maxtxg,
830		    objset, object, 0, 0,
831		    objset, object, 0, ZB_MAXBLKID);
832}
833
834void
835traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
836    uint64_t objset)
837{
838	if (th->th_advance & ADVANCE_PRE)
839		traverse_add_segment(th, mintxg, maxtxg,
840		    objset, 0, -1, 0,
841		    objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
842	else
843		traverse_add_segment(th, mintxg, maxtxg,
844		    objset, 1, 0, 0,
845		    objset, 0, -1, 0);
846}
847
848void
849traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
850{
851	if (th->th_advance & ADVANCE_PRE)
852		traverse_add_segment(th, mintxg, maxtxg,
853		    0, 0, -1, 0,
854		    ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
855	else
856		traverse_add_segment(th, mintxg, maxtxg,
857		    1, 1, 0, 0,
858		    0, 0, -1, 0);
859}
860
861traverse_handle_t *
862traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
863    int zio_flags)
864{
865	traverse_handle_t *th;
866	int d, l;
867
868	th = kmem_zalloc(sizeof (*th), KM_SLEEP);
869
870	th->th_spa = spa;
871	th->th_func = func;
872	th->th_arg = arg;
873	th->th_advance = advance;
874	th->th_lastcb.zb_level = ZB_NO_LEVEL;
875	th->th_noread.zb_level = ZB_NO_LEVEL;
876	th->th_zio_flags = zio_flags;
877
878	list_create(&th->th_seglist, sizeof (zseg_t),
879	    offsetof(zseg_t, seg_node));
880
881	for (d = 0; d < ZB_DEPTH; d++) {
882		for (l = 0; l < ZB_MAXLEVEL; l++) {
883			if ((advance & ADVANCE_DATA) ||
884			    l != 0 || d != ZB_DN_CACHE)
885				th->th_cache[d][l].bc_data =
886				    zio_buf_alloc(SPA_MAXBLOCKSIZE);
887		}
888	}
889
890	return (th);
891}
892
893void
894traverse_fini(traverse_handle_t *th)
895{
896	int d, l;
897	zseg_t *zseg;
898
899	for (d = 0; d < ZB_DEPTH; d++)
900		for (l = 0; l < ZB_MAXLEVEL; l++)
901			if (th->th_cache[d][l].bc_data != NULL)
902				zio_buf_free(th->th_cache[d][l].bc_data,
903				    SPA_MAXBLOCKSIZE);
904
905	while ((zseg = list_head(&th->th_seglist)) != NULL) {
906		list_remove(&th->th_seglist, zseg);
907		kmem_free(zseg, sizeof (*zseg));
908	}
909
910	list_destroy(&th->th_seglist);
911
912	dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
913	    th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
914	    th->th_syncs, th->th_restarts);
915
916	kmem_free(th, sizeof (*th));
917}
918