Deleted Added
full compact
dmu_traverse.c (185029) dmu_traverse.c (208047)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 9 unchanged lines hidden (view full) ---

18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 9 unchanged lines hidden (view full) ---

18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident "%Z%%M% %I% %E% SMI"
27
28#include <sys/zfs_context.h>
29#include <sys/dmu_objset.h>
30#include <sys/dmu_traverse.h>
31#include <sys/dsl_dataset.h>
32#include <sys/dsl_dir.h>
33#include <sys/dsl_pool.h>
34#include <sys/dnode.h>
35#include <sys/spa.h>
36#include <sys/zio.h>
37#include <sys/dmu_impl.h>
26#include <sys/zfs_context.h>
27#include <sys/dmu_objset.h>
28#include <sys/dmu_traverse.h>
29#include <sys/dsl_dataset.h>
30#include <sys/dsl_dir.h>
31#include <sys/dsl_pool.h>
32#include <sys/dnode.h>
33#include <sys/spa.h>
34#include <sys/zio.h>
35#include <sys/dmu_impl.h>
38#include <sys/zvol.h>
36#include <sys/callb.h>
39
37
40#define BP_SPAN_SHIFT(level, width) ((level) * (width))
41
42#define BP_EQUAL(b1, b2) \
43 (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
44 (b1)->blk_birth == (b2)->blk_birth)
45
46/*
47 * Compare two bookmarks.
48 *
49 * For ADVANCE_PRE, the visitation order is:
50 *
51 * objset 0, 1, 2, ..., ZB_MAXOBJSET.
52 * object 0, 1, 2, ..., ZB_MAXOBJECT.
53 * blkoff 0, 1, 2, ...
54 * level ZB_MAXLEVEL, ..., 2, 1, 0.
55 *
56 * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
57 * ordering vector is:
58 *
59 * < objset, object, blkoff, -level >
60 *
61 * For ADVANCE_POST, the starting offsets aren't sequential but ending
62 * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
63 * The visitation order is:
64 *
65 * objset 1, 2, ..., ZB_MAXOBJSET, 0.
66 * object 1, 2, ..., ZB_MAXOBJECT, 0.
67 * blkoff 1, 2, ...
68 * level 0, 1, 2, ..., ZB_MAXLEVEL.
69 *
70 * and thus a valid ordering vector is:
71 *
72 * < objset - 1, object - 1, blkoff, level >
73 *
74 * Both orderings can be expressed as:
75 *
76 * < objset + bias, object + bias, blkoff, level ^ bias >
77 *
78 * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
79 * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
80 *
81 * Special case: an objset's osphys is represented as level -1 of object 0.
82 * It is always either the very first or very last block we visit in an objset.
83 * Therefore, if either bookmark's level is -1, level alone determines order.
84 */
85static int
86compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
87 int advance)
88{
89 int bias = (advance & ADVANCE_PRE) ? 0 : -1;
90 uint64_t sblkoff, eblkoff;
91 int slevel, elevel, wshift;
92
93 if (szb->zb_objset + bias < ezb->zb_objset + bias)
94 return (-1);
95
96 if (szb->zb_objset + bias > ezb->zb_objset + bias)
97 return (1);
98
99 slevel = szb->zb_level;
100 elevel = ezb->zb_level;
101
102 if ((slevel | elevel) < 0)
103 return ((slevel ^ bias) - (elevel ^ bias));
104
105 if (szb->zb_object + bias < ezb->zb_object + bias)
106 return (-1);
107
108 if (szb->zb_object + bias > ezb->zb_object + bias)
109 return (1);
110
111 if (dnp == NULL)
112 return (0);
113
114 wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
115
116 sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
117 eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
118
119 if (sblkoff < eblkoff)
120 return (-1);
121
122 if (sblkoff > eblkoff)
123 return (1);
124
125 return ((elevel ^ bias) - (slevel ^ bias));
38#define SET_BOOKMARK(zb, objset, object, level, blkid) \
39{ \
40 (zb)->zb_objset = objset; \
41 (zb)->zb_object = object; \
42 (zb)->zb_level = level; \
43 (zb)->zb_blkid = blkid; \
126}
127
44}
45
128#define SET_BOOKMARK(zb, objset, object, level, blkid) \
129{ \
130 (zb)->zb_objset = objset; \
131 (zb)->zb_object = object; \
132 (zb)->zb_level = level; \
133 (zb)->zb_blkid = blkid; \
134}
46struct prefetch_data {
47 kmutex_t pd_mtx;
48 kcondvar_t pd_cv;
49 int pd_blks_max;
50 int pd_blks_fetched;
51 int pd_flags;
52 boolean_t pd_cancel;
53 boolean_t pd_exited;
54};
135
55
136#define SET_BOOKMARK_LB(zb, level, blkid) \
137{ \
138 (zb)->zb_level = level; \
139 (zb)->zb_blkid = blkid; \
140}
56struct traverse_data {
57 spa_t *td_spa;
58 uint64_t td_objset;
59 blkptr_t *td_rootbp;
60 uint64_t td_min_txg;
61 int td_flags;
62 struct prefetch_data *td_pfd;
63 blkptr_cb_t *td_func;
64 void *td_arg;
65};
141
66
142static int
143advance_objset(zseg_t *zseg, uint64_t objset, int advance)
144{
145 zbookmark_t *zb = &zseg->seg_start;
146
147 if (advance & ADVANCE_PRE) {
148 if (objset >= ZB_MAXOBJSET)
149 return (ERANGE);
150 SET_BOOKMARK(zb, objset, 0, -1, 0);
151 } else {
152 if (objset >= ZB_MAXOBJSET)
153 objset = 0;
154 SET_BOOKMARK(zb, objset, 1, 0, 0);
155 }
156
157 if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
158 return (ERANGE);
159
160 return (EAGAIN);
161}
162
163static int
164advance_object(zseg_t *zseg, uint64_t object, int advance)
165{
166 zbookmark_t *zb = &zseg->seg_start;
167
168 if (advance & ADVANCE_PRE) {
169 if (object >= ZB_MAXOBJECT) {
170 SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
171 } else {
172 SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
173 }
174 } else {
175 if (zb->zb_object == 0) {
176 SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
177 } else {
178 if (object >= ZB_MAXOBJECT)
179 object = 0;
180 SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
181 }
182 }
183
184 if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
185 return (ERANGE);
186
187 return (EAGAIN);
188}
189
190static int
191advance_from_osphys(zseg_t *zseg, int advance)
192{
193 zbookmark_t *zb = &zseg->seg_start;
194
195 ASSERT(zb->zb_object == 0);
196 ASSERT(zb->zb_level == -1);
197 ASSERT(zb->zb_blkid == 0);
198
199 if (advance & ADVANCE_PRE) {
200 SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
201 } else {
202 if (zb->zb_objset == 0)
203 return (ERANGE);
204 SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
205 }
206
207 if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
208 return (ERANGE);
209
210 return (EAGAIN);
211}
212
213static int
214advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
215{
216 zbookmark_t *zb = &zseg->seg_start;
217 int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
218 int maxlevel = dnp->dn_nlevels - 1;
219 int level = zb->zb_level;
220 uint64_t blkid = zb->zb_blkid;
221
222 if (advance & ADVANCE_PRE) {
223 if (level > 0 && rc == 0) {
224 level--;
225 blkid <<= wshift;
226 } else {
227 blkid++;
228
229 if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
230 dnp->dn_maxblkid)
231 return (ERANGE);
232
233 while (level < maxlevel) {
234 if (P2PHASE(blkid, 1ULL << wshift))
235 break;
236 blkid >>= wshift;
237 level++;
238 }
239 }
240 } else {
241 if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
242 blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
243 level = 0;
244 } else {
245 blkid >>= wshift;
246 level++;
247 }
248
249 while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
250 dnp->dn_maxblkid) {
251 if (level == maxlevel)
252 return (ERANGE);
253 blkid >>= wshift;
254 level++;
255 }
256 }
257 SET_BOOKMARK_LB(zb, level, blkid);
258
259 if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
260 return (ERANGE);
261
262 return (EAGAIN);
263}
264
265/*
266 * The traverse_callback function will call the function specified in th_func.
267 * In the event of an error the callee, specified by th_func, must return
268 * one of the following errors:
269 *
270 * EINTR - Indicates that the callee wants the traversal to
271 * abort immediately.
272 * ERESTART - The callee has acknowledged the error and would
273 * like to continue.
274 */
275static int
276traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
277{
278 /*
279 * Before we issue the callback, prune against maxtxg.
280 *
281 * We prune against mintxg before we get here because it's a big win.
282 * If a given block was born in txg 37, then we know that the entire
283 * subtree below that block must have been born in txg 37 or earlier.
284 * We can therefore lop off huge branches of the tree as we go.
285 *
286 * There's no corresponding optimization for maxtxg because knowing
287 * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
288 * children. In fact, the copy-on-write design of ZFS ensures that
289 * top-level blocks will pretty much always be new.
290 *
291 * Therefore, in the name of simplicity we don't prune against
292 * maxtxg until the last possible moment -- that being right now.
293 */
294 if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
295 return (0);
296
297 /*
298 * Debugging: verify that the order we visit things agrees with the
299 * order defined by compare_bookmark(). We don't check this for
300 * log blocks because there's no defined ordering for them; they're
301 * always visited (or not) as part of visiting the objset_phys_t.
302 */
303 if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
304 zbookmark_t *zb = &bc->bc_bookmark;
305 zbookmark_t *szb = &zseg->seg_start;
306 zbookmark_t *ezb = &zseg->seg_end;
307 zbookmark_t *lzb = &th->th_lastcb;
308 dnode_phys_t *dnp = bc->bc_dnode;
309
310 ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
311 ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
312 ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
313 lzb->zb_level == ZB_NO_LEVEL);
314 *lzb = *zb;
315 }
316
317 th->th_callbacks++;
318 return (th->th_func(bc, th->th_spa, th->th_arg));
319}
320
321static int
322traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
323 dnode_phys_t *dnp)
324{
325 zbookmark_t *zb = &bc->bc_bookmark;
326 int error;
327
328 th->th_hits++;
329
330 bc->bc_dnode = dnp;
331 bc->bc_errno = 0;
332
333 if (BP_EQUAL(&bc->bc_blkptr, bp))
334 return (0);
335
336 bc->bc_blkptr = *bp;
337
338 if (bc->bc_data == NULL)
339 return (0);
340
341 if (BP_IS_HOLE(bp)) {
342 ASSERT(th->th_advance & ADVANCE_HOLES);
343 return (0);
344 }
345
346 if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
347 error = EIO;
348 } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
349 error = 0;
350 th->th_arc_hits++;
351 } else {
352 error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
353 BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
354 th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
355
356 if (BP_SHOULD_BYTESWAP(bp) && error == 0)
357 (zb->zb_level > 0 ? byteswap_uint64_array :
358 dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
359 BP_GET_LSIZE(bp));
360 th->th_reads++;
361 }
362
363 if (error) {
364 bc->bc_errno = error;
365 error = traverse_callback(th, NULL, bc);
366 ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
367 bc->bc_blkptr.blk_birth = -1ULL;
368 }
369
370 dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
371 bc - &th->th_cache[0][0], error,
372 zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
373
374 return (error);
375}
376
377static int
378find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
379{
380 zbookmark_t *zb = &zseg->seg_start;
381 traverse_blk_cache_t *bc;
382 blkptr_t *bp = dnp->dn_blkptr;
383 int i, first, level;
384 int nbp = dnp->dn_nblkptr;
385 int minlevel = zb->zb_level;
386 int maxlevel = dnp->dn_nlevels - 1;
387 int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
388 int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
389 uint64_t blkid = zb->zb_blkid >> bp_shift;
390 int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
391 int rc;
392
393 if (minlevel > maxlevel || blkid >= nbp)
394 return (ERANGE);
395
396 for (level = maxlevel; level >= minlevel; level--) {
397 first = P2PHASE(blkid, 1ULL << wshift);
398
399 for (i = first; i < nbp; i++)
400 if (bp[i].blk_birth > zseg->seg_mintxg ||
401 BP_IS_HOLE(&bp[i]) && do_holes)
402 break;
403
404 if (i != first) {
405 i--;
406 SET_BOOKMARK_LB(zb, level, blkid + (i - first));
407 return (ENOTBLK);
408 }
409
410 bc = &th->th_cache[depth][level];
411
412 SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
413 level, blkid);
414
415 if (rc = traverse_read(th, bc, bp + i, dnp)) {
416 if (rc != EAGAIN) {
417 SET_BOOKMARK_LB(zb, level, blkid);
418 }
419 return (rc);
420 }
421
422 if (BP_IS_HOLE(&bp[i])) {
423 SET_BOOKMARK_LB(zb, level, blkid);
424 th->th_lastcb.zb_level = ZB_NO_LEVEL;
425 return (0);
426 }
427
428 nbp = 1 << wshift;
429 bp = bc->bc_data;
430 bp_shift -= wshift;
431 blkid = zb->zb_blkid >> bp_shift;
432 }
433
434 return (0);
435}
436
437static int
438get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
439 uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
440{
441 zseg_t zseg;
442 zbookmark_t *zb = &zseg.seg_start;
443 uint64_t object = *objectp;
444 int i, rc;
445
446 SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
447 SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
448
449 zseg.seg_mintxg = txg;
450 zseg.seg_maxtxg = -1ULL;
451
452 for (;;) {
453 rc = find_block(th, &zseg, mdn, depth);
454
455 if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
456 break;
457
458 if (rc == 0 && zb->zb_level == 0) {
459 dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
460 for (i = 0; i < DNODES_PER_BLOCK; i++) {
461 object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
462 if (object >= *objectp &&
463 dnp[i].dn_type != DMU_OT_NONE &&
464 (type == -1 || dnp[i].dn_type == type)) {
465 *objectp = object;
466 *dnpp = &dnp[i];
467 return (0);
468 }
469 }
470 }
471
472 rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
473
474 if (rc == ERANGE)
475 break;
476 }
477
478 if (rc == ERANGE)
479 *objectp = ZB_MAXOBJECT;
480
481 return (rc);
482}
483
484/* ARGSUSED */
485static void
486traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
487{
67/* ARGSUSED */
68static void
69traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
70{
488 traverse_handle_t *th = arg;
489 traverse_blk_cache_t *bc = &th->th_zil_cache;
490 zbookmark_t *zb = &bc->bc_bookmark;
491 zseg_t *zseg = list_head(&th->th_seglist);
71 struct traverse_data *td = arg;
72 zbookmark_t zb;
492
73
493 if (bp->blk_birth <= zseg->seg_mintxg)
74 if (bp->blk_birth == 0)
494 return;
495
75 return;
76
496 if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
497 zb->zb_object = 0;
498 zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
499 bc->bc_blkptr = *bp;
500 (void) traverse_callback(th, zseg, bc);
501 }
77 if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
78 return;
79
80 zb.zb_objset = td->td_objset;
81 zb.zb_object = 0;
82 zb.zb_level = -1;
83 zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
84 VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
502}
503
504/* ARGSUSED */
505static void
506traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
507{
85}
86
87/* ARGSUSED */
88static void
89traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
90{
508 traverse_handle_t *th = arg;
509 traverse_blk_cache_t *bc = &th->th_zil_cache;
510 zbookmark_t *zb = &bc->bc_bookmark;
511 zseg_t *zseg = list_head(&th->th_seglist);
91 struct traverse_data *td = arg;
512
513 if (lrc->lrc_txtype == TX_WRITE) {
514 lr_write_t *lr = (lr_write_t *)lrc;
515 blkptr_t *bp = &lr->lr_blkptr;
92
93 if (lrc->lrc_txtype == TX_WRITE) {
94 lr_write_t *lr = (lr_write_t *)lrc;
95 blkptr_t *bp = &lr->lr_blkptr;
96 zbookmark_t zb;
516
97
517 if (bp->blk_birth <= zseg->seg_mintxg)
98 if (bp->blk_birth == 0)
518 return;
519
99 return;
100
520 if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
521 zb->zb_object = lr->lr_foid;
522 zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
523 bc->bc_blkptr = *bp;
524 (void) traverse_callback(th, zseg, bc);
525 }
101 if (claim_txg == 0 || bp->blk_birth < claim_txg)
102 return;
103
104 zb.zb_objset = td->td_objset;
105 zb.zb_object = lr->lr_foid;
106 zb.zb_level = BP_GET_LEVEL(bp);
107 zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
108 VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
526 }
527}
528
529static void
109 }
110}
111
112static void
530traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
113traverse_zil(struct traverse_data *td, zil_header_t *zh)
531{
114{
532 spa_t *spa = th->th_spa;
533 dsl_pool_t *dp = spa_get_dsl(spa);
534 objset_phys_t *osphys = bc->bc_data;
535 zil_header_t *zh = &osphys->os_zil_header;
536 uint64_t claim_txg = zh->zh_claim_txg;
537 zilog_t *zilog;
538
115 uint64_t claim_txg = zh->zh_claim_txg;
116 zilog_t *zilog;
117
539 ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
540 ASSERT(bc->bc_bookmark.zb_level == -1);
541
542 /*
543 * We only want to visit blocks that have been claimed but not yet
544 * replayed (or, in read-only mode, blocks that *would* be claimed).
545 */
546 if (claim_txg == 0 && (spa_mode & FWRITE))
547 return;
548
118 /*
119 * We only want to visit blocks that have been claimed but not yet
120 * replayed (or, in read-only mode, blocks that *would* be claimed).
121 */
122 if (claim_txg == 0 && (spa_mode & FWRITE))
123 return;
124
549 th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
125 zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
550
126
551 zilog = zil_alloc(dp->dp_meta_objset, zh);
552
553 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
127 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
554 claim_txg);
555
556 zil_free(zilog);
557}
558
559static int
128 claim_txg);
129
130 zil_free(zilog);
131}
132
133static int
560traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
134traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
135 arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
561{
136{
562 zbookmark_t *zb = &zseg->seg_start;
563 traverse_blk_cache_t *bc;
564 dnode_phys_t *dn, *dn_tmp;
565 int worklimit = 100;
566 int rc;
137 int err = 0;
138 arc_buf_t *buf = NULL;
139 struct prefetch_data *pd = td->td_pfd;
567
140
568 dprintf("<%llu, %llu, %d, %llx>\n",
569 zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
141 if (bp->blk_birth == 0) {
142 err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
143 return (err);
144 }
570
145
571 bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
572 dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
146 if (bp->blk_birth <= td->td_min_txg)
147 return (0);
573
148
574 SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
575
576 rc = traverse_read(th, bc, mosbp, dn);
577
578 if (rc) /* If we get ERESTART, we've got nowhere left to go */
579 return (rc == ERESTART ? EINTR : rc);
580
581 ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
582
583 if (zb->zb_objset != 0) {
584 uint64_t objset = zb->zb_objset;
585 dsl_dataset_phys_t *dsp;
586
587 rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
588 DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
589
590 if (objset != zb->zb_objset)
591 rc = advance_objset(zseg, objset, th->th_advance);
592
593 if (rc != 0)
594 return (rc);
595
596 dsp = DN_BONUS(dn_tmp);
597
598 bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
599 dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
600
601 SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
602
603 /*
604 * If we're traversing an open snapshot, we know that it
605 * can't be deleted (because it's open) and it can't change
606 * (because it's a snapshot). Therefore, once we've gotten
607 * from the uberblock down to the snapshot's objset_phys_t,
608 * we no longer need to synchronize with spa_sync(); we're
609 * traversing a completely static block tree from here on.
610 */
611 if (th->th_advance & ADVANCE_NOLOCK) {
612 ASSERT(th->th_locked);
613 rw_exit(spa_traverse_rwlock(th->th_spa));
614 th->th_locked = 0;
615 }
616
617 if (BP_IS_HOLE(&dsp->ds_bp))
618 rc = ERESTART;
619 else
620 rc = traverse_read(th, bc, &dsp->ds_bp, dn);
621
622 if (rc != 0) {
623 if (rc == ERESTART)
624 rc = advance_objset(zseg, zb->zb_objset + 1,
625 th->th_advance);
626 return (rc);
627 }
628
629 if (th->th_advance & ADVANCE_PRUNE)
630 zseg->seg_mintxg =
631 MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
149 if (pd && !pd->pd_exited &&
150 ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
151 BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
152 mutex_enter(&pd->pd_mtx);
153 ASSERT(pd->pd_blks_fetched >= 0);
154 while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
155 cv_wait(&pd->pd_cv, &pd->pd_mtx);
156 pd->pd_blks_fetched--;
157 cv_broadcast(&pd->pd_cv);
158 mutex_exit(&pd->pd_mtx);
632 }
633
159 }
160
634 if (zb->zb_level == -1) {
635 ASSERT(zb->zb_object == 0);
636 ASSERT(zb->zb_blkid == 0);
637 ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
638
639 if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
640 rc = traverse_callback(th, zseg, bc);
641 if (rc) {
642 ASSERT(rc == EINTR);
643 return (rc);
644 }
645 if ((th->th_advance & ADVANCE_ZIL) &&
646 zb->zb_objset != 0)
647 traverse_zil(th, bc);
648 }
649
650 return (advance_from_osphys(zseg, th->th_advance));
161 if (td->td_flags & TRAVERSE_PRE) {
162 err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
163 if (err)
164 return (err);
651 }
652
165 }
166
653 if (zb->zb_object != 0) {
654 uint64_t object = zb->zb_object;
167 if (BP_GET_LEVEL(bp) > 0) {
168 uint32_t flags = ARC_WAIT;
169 int i;
170 blkptr_t *cbp;
171 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
655
172
656 rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
657 zseg->seg_mintxg, -1, ZB_MDN_CACHE);
173 err = arc_read(NULL, td->td_spa, bp, pbuf,
174 arc_getbuf_func, &buf,
175 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
176 if (err)
177 return (err);
658
178
659 if (object != zb->zb_object)
660 rc = advance_object(zseg, object, th->th_advance);
179 /* recursively visitbp() blocks below this */
180 cbp = buf->b_data;
181 for (i = 0; i < epb; i++, cbp++) {
182 zbookmark_t czb;
661
183
662 if (rc != 0)
663 return (rc);
184 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
185 zb->zb_level - 1,
186 zb->zb_blkid * epb + i);
187 err = traverse_visitbp(td, dnp, buf, cbp, &czb);
188 if (err)
189 break;
190 }
191 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
192 uint32_t flags = ARC_WAIT;
193 int i, j;
194 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
664
195
665 dn = dn_tmp;
666 }
196 err = arc_read(NULL, td->td_spa, bp, pbuf,
197 arc_getbuf_func, &buf,
198 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
199 if (err)
200 return (err);
667
201
668 if (zb->zb_level == ZB_MAXLEVEL)
669 zb->zb_level = dn->dn_nlevels - 1;
202 /* recursively visitbp() blocks below this */
203 dnp = buf->b_data;
204 for (i = 0; i < epb && err == 0; i++, dnp++) {
205 for (j = 0; j < dnp->dn_nblkptr; j++) {
206 zbookmark_t czb;
670
207
671 for (;;) {
672 rc = find_block(th, zseg, dn, ZB_DN_CACHE);
673
674 if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
675 break;
676
677 if (rc == 0) {
678 bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
679 ASSERT(bc->bc_dnode == dn);
680 ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
681 rc = traverse_callback(th, zseg, bc);
682 if (rc) {
683 ASSERT(rc == EINTR);
684 return (rc);
208 SET_BOOKMARK(&czb, zb->zb_objset,
209 zb->zb_blkid * epb + i,
210 dnp->dn_nlevels - 1, j);
211 err = traverse_visitbp(td, dnp, buf,
212 (blkptr_t *)&dnp->dn_blkptr[j], &czb);
213 if (err)
214 break;
685 }
215 }
686 if (BP_IS_HOLE(&bc->bc_blkptr)) {
687 ASSERT(th->th_advance & ADVANCE_HOLES);
688 rc = ENOTBLK;
689 }
690 }
216 }
217 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
218 uint32_t flags = ARC_WAIT;
219 objset_phys_t *osp;
220 int j;
691
221
692 rc = advance_block(zseg, dn, rc, th->th_advance);
222 err = arc_read_nolock(NULL, td->td_spa, bp,
223 arc_getbuf_func, &buf,
224 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
225 if (err)
226 return (err);
693
227
694 if (rc == ERANGE)
695 break;
696
228 osp = buf->b_data;
697 /*
229 /*
698 * Give spa_sync() a chance to run.
230 * traverse_zil is just here for zdb's leak checking.
231 * For other consumers, there will be no ZIL blocks.
699 */
232 */
700 if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
701 th->th_syncs++;
702 return (EAGAIN);
703 }
233 traverse_zil(td, &osp->os_zil_header);
704
234
705 if (--worklimit == 0)
706 return (EAGAIN);
235 for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
236 zbookmark_t czb;
237
238 SET_BOOKMARK(&czb, zb->zb_objset, 0,
239 osp->os_meta_dnode.dn_nlevels - 1, j);
240 err = traverse_visitbp(td, &osp->os_meta_dnode, buf,
241 (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j],
242 &czb);
243 if (err)
244 break;
245 }
707 }
708
246 }
247
709 if (rc == ERANGE)
710 rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
248 if (buf)
249 (void) arc_buf_remove_ref(buf, &buf);
711
250
712 return (rc);
713}
251 if (err == 0 && (td->td_flags & TRAVERSE_POST))
252 err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
714
253
715/*
716 * It is the caller's responsibility to ensure that the dsl_dataset_t
717 * doesn't go away during traversal.
718 */
719int
720traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
721 blkptr_cb_t func, void *arg)
722{
723 spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
724 traverse_handle_t *th;
725 int err;
726
727 th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
728
729 traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
730
731 while ((err = traverse_more(th)) == EAGAIN)
732 continue;
733
734 traverse_fini(th);
735 return (err);
736}
737
254 return (err);
255}
256
738int
739traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg)
257/* ARGSUSED */
258static int
259traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
260 const dnode_phys_t *dnp, void *arg)
740{
261{
741 spa_t *spa = dmu_objset_spa(os);
742 traverse_handle_t *th;
743 int err;
262 struct prefetch_data *pfd = arg;
263 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
744
264
745 th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL);
265 ASSERT(pfd->pd_blks_fetched >= 0);
266 if (pfd->pd_cancel)
267 return (EINTR);
746
268
747 traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ);
269 if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
270 BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
271 return (0);
748
272
749 while ((err = traverse_more(th)) == EAGAIN)
750 continue;
273 mutex_enter(&pfd->pd_mtx);
274 while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
275 cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
276 pfd->pd_blks_fetched++;
277 cv_broadcast(&pfd->pd_cv);
278 mutex_exit(&pfd->pd_mtx);
751
279
752 traverse_fini(th);
753 return (err);
280 (void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
281 ZIO_PRIORITY_ASYNC_READ,
282 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
283 &aflags, zb);
284
285 return (0);
754}
755
286}
287
756int
757traverse_more(traverse_handle_t *th)
288static void
289traverse_prefetch_thread(void *arg)
758{
290{
759 zseg_t *zseg = list_head(&th->th_seglist);
760 uint64_t save_txg; /* XXX won't be necessary with real itinerary */
761 krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
762 blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
763 int rc;
291 struct traverse_data *td_main = arg;
292 struct traverse_data td = *td_main;
293 zbookmark_t czb;
764
294
765 if (zseg == NULL)
766 return (0);
295 td.td_func = traverse_prefetcher;
296 td.td_arg = td_main->td_pfd;
297 td.td_pfd = NULL;
767
298
768 th->th_restarts++;
299 SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
300 (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
769
301
770 save_txg = zseg->seg_mintxg;
771
772 rw_enter(rw, RW_READER);
773 th->th_locked = 1;
774
775 rc = traverse_segment(th, zseg, mosbp);
776 ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
777
778 if (th->th_locked)
779 rw_exit(rw);
780 th->th_locked = 0;
781
782 zseg->seg_mintxg = save_txg;
783
784 if (rc == ERANGE) {
785 list_remove(&th->th_seglist, zseg);
786 kmem_free(zseg, sizeof (*zseg));
787 return (EAGAIN);
788 }
789
790 return (rc);
302 mutex_enter(&td_main->td_pfd->pd_mtx);
303 td_main->td_pfd->pd_exited = B_TRUE;
304 cv_broadcast(&td_main->td_pfd->pd_cv);
305 mutex_exit(&td_main->td_pfd->pd_mtx);
791}
792
793/*
306}
307
308/*
794 * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
795 * are not included. The blocks covered by this segment will all have
796 * mintxg < birth < maxtxg.
309 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
310 * in syncing context).
797 */
311 */
798static void
799traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
800 uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
801 uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
312static int
313traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
314 uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
802{
315{
803 zseg_t *zseg;
316 struct traverse_data td;
317 struct prefetch_data pd = { 0 };
318 zbookmark_t czb;
319 int err;
804
320
805 zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
321 td.td_spa = spa;
322 td.td_objset = objset;
323 td.td_rootbp = rootbp;
324 td.td_min_txg = txg_start;
325 td.td_func = func;
326 td.td_arg = arg;
327 td.td_pfd = &pd;
328 td.td_flags = flags;
806
329
807 zseg->seg_mintxg = mintxg;
808 zseg->seg_maxtxg = maxtxg;
330 pd.pd_blks_max = 100;
331 pd.pd_flags = flags;
332 mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
333 cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
809
334
810 zseg->seg_start.zb_objset = sobjset;
811 zseg->seg_start.zb_object = sobject;
812 zseg->seg_start.zb_level = slevel;
813 zseg->seg_start.zb_blkid = sblkid;
335 if (!(flags & TRAVERSE_PREFETCH) ||
336 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
337 &td, TQ_NOQUEUE))
338 pd.pd_exited = B_TRUE;
814
339
815 zseg->seg_end.zb_objset = eobjset;
816 zseg->seg_end.zb_object = eobject;
817 zseg->seg_end.zb_level = elevel;
818 zseg->seg_end.zb_blkid = eblkid;
340 SET_BOOKMARK(&czb, objset, 0, -1, 0);
341 err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
819
342
820 list_insert_tail(&th->th_seglist, zseg);
821}
343 mutex_enter(&pd.pd_mtx);
344 pd.pd_cancel = B_TRUE;
345 cv_broadcast(&pd.pd_cv);
346 while (!pd.pd_exited)
347 cv_wait(&pd.pd_cv, &pd.pd_mtx);
348 mutex_exit(&pd.pd_mtx);
822
349
823void
824traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
825 uint64_t objset, uint64_t object)
826{
827 if (th->th_advance & ADVANCE_PRE)
828 traverse_add_segment(th, mintxg, maxtxg,
829 objset, object, ZB_MAXLEVEL, 0,
830 objset, object, 0, ZB_MAXBLKID);
831 else
832 traverse_add_segment(th, mintxg, maxtxg,
833 objset, object, 0, 0,
834 objset, object, 0, ZB_MAXBLKID);
835}
350 mutex_destroy(&pd.pd_mtx);
351 cv_destroy(&pd.pd_cv);
836
352
837void
838traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
839 uint64_t objset)
840{
841 if (th->th_advance & ADVANCE_PRE)
842 traverse_add_segment(th, mintxg, maxtxg,
843 objset, 0, -1, 0,
844 objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
845 else
846 traverse_add_segment(th, mintxg, maxtxg,
847 objset, 1, 0, 0,
848 objset, 0, -1, 0);
353 return (err);
849}
850
354}
355
851void
852traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
356/*
357 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
358 * in syncing context).
359 */
360int
361traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
362 blkptr_cb_t func, void *arg)
853{
363{
854 if (th->th_advance & ADVANCE_PRE)
855 traverse_add_segment(th, mintxg, maxtxg,
856 0, 0, -1, 0,
857 ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
858 else
859 traverse_add_segment(th, mintxg, maxtxg,
860 1, 1, 0, 0,
861 0, 0, -1, 0);
364 return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
365 &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
862}
863
366}
367
864traverse_handle_t *
865traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
866 int zio_flags)
368/*
369 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
370 */
371int
372traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
867{
373{
868 traverse_handle_t *th;
869 int d, l;
374 int err;
375 uint64_t obj;
376 dsl_pool_t *dp = spa_get_dsl(spa);
377 objset_t *mos = dp->dp_meta_objset;
870
378
871 th = kmem_zalloc(sizeof (*th), KM_SLEEP);
379 /* visit the MOS */
380 err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
381 0, TRAVERSE_PRE, func, arg);
382 if (err)
383 return (err);
872
384
873 th->th_spa = spa;
874 th->th_func = func;
875 th->th_arg = arg;
876 th->th_advance = advance;
877 th->th_lastcb.zb_level = ZB_NO_LEVEL;
878 th->th_noread.zb_level = ZB_NO_LEVEL;
879 th->th_zio_flags = zio_flags;
385 /* visit each dataset */
386 for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
387 dmu_object_info_t doi;
880
388
881 list_create(&th->th_seglist, sizeof (zseg_t),
882 offsetof(zseg_t, seg_node));
389 err = dmu_object_info(mos, obj, &doi);
390 if (err)
391 return (err);
883
392
884 for (d = 0; d < ZB_DEPTH; d++) {
885 for (l = 0; l < ZB_MAXLEVEL; l++) {
886 if ((advance & ADVANCE_DATA) ||
887 l != 0 || d != ZB_DN_CACHE)
888 th->th_cache[d][l].bc_data =
889 zio_buf_alloc(SPA_MAXBLOCKSIZE);
393 if (doi.doi_type == DMU_OT_DSL_DATASET) {
394 dsl_dataset_t *ds;
395 rw_enter(&dp->dp_config_rwlock, RW_READER);
396 err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
397 rw_exit(&dp->dp_config_rwlock);
398 if (err)
399 return (err);
400 err = traverse_dataset(ds,
401 ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
402 func, arg);
403 dsl_dataset_rele(ds, FTAG);
404 if (err)
405 return (err);
890 }
891 }
406 }
407 }
892
893 return (th);
408 if (err == ESRCH)
409 err = 0;
410 return (err);
894}
411}
895
896void
897traverse_fini(traverse_handle_t *th)
898{
899 int d, l;
900 zseg_t *zseg;
901
902 for (d = 0; d < ZB_DEPTH; d++)
903 for (l = 0; l < ZB_MAXLEVEL; l++)
904 if (th->th_cache[d][l].bc_data != NULL)
905 zio_buf_free(th->th_cache[d][l].bc_data,
906 SPA_MAXBLOCKSIZE);
907
908 while ((zseg = list_head(&th->th_seglist)) != NULL) {
909 list_remove(&th->th_seglist, zseg);
910 kmem_free(zseg, sizeof (*zseg));
911 }
912
913 list_destroy(&th->th_seglist);
914
915 dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
916 th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
917 th->th_syncs, th->th_restarts);
918
919 kmem_free(th, sizeof (*th));
920}