/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015 Chunwei Chen. All rights reserved.
25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/dmu_objset.h> 29#include <sys/dmu_traverse.h> 30#include <sys/dsl_dataset.h> 31#include <sys/dsl_dir.h> 32#include <sys/dsl_pool.h> 33#include <sys/dnode.h> 34#include <sys/spa.h> 35#include <sys/spa_impl.h> 36#include <sys/zio.h> 37#include <sys/dmu_impl.h> 38#include <sys/sa.h> 39#include <sys/sa_impl.h> 40#include <sys/callb.h> 41#include <sys/zfeature.h> 42 43int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ 44boolean_t send_holes_without_birth_time = B_TRUE; 45 46#ifdef _KERNEL 47SYSCTL_DECL(_vfs_zfs); 48SYSCTL_UINT(_vfs_zfs, OID_AUTO, send_holes_without_birth_time, CTLFLAG_RWTUN, 49 &send_holes_without_birth_time, 0, "Send holes without birth time"); 50#endif 51 52typedef struct prefetch_data { 53 kmutex_t pd_mtx; 54 kcondvar_t pd_cv; 55 int32_t pd_bytes_fetched; 56 int pd_flags; 57 boolean_t pd_cancel; 58 boolean_t pd_exited; 59 zbookmark_phys_t pd_resume; 60} prefetch_data_t; 61 62typedef struct traverse_data { 63 spa_t *td_spa; 64 uint64_t td_objset; 65 blkptr_t *td_rootbp; 66 uint64_t td_min_txg; 67 zbookmark_phys_t *td_resume; 68 int td_flags; 69 prefetch_data_t *td_pfd; 70 boolean_t td_paused; 71 uint64_t td_hole_birth_enabled_txg; 72 blkptr_cb_t *td_func; 73 void *td_arg; 74 boolean_t td_realloc_possible; 75} traverse_data_t; 76 77static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, 78 uint64_t objset, uint64_t object); 79static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, 80 uint64_t objset, uint64_t object); 81 82static int 83traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 84{ 85 traverse_data_t *td = arg; 86 zbookmark_phys_t zb; 87 88 if (BP_IS_HOLE(bp)) 89 return (0); 90 91 if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa)) 92 return (-1); 93 94 SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, 95 bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 96 97 (void) td->td_func(td->td_spa, zilog, bp, 
&zb, NULL, td->td_arg); 98 99 return (0); 100} 101 102static int 103traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) 104{ 105 traverse_data_t *td = arg; 106 107 if (lrc->lrc_txtype == TX_WRITE) { 108 lr_write_t *lr = (lr_write_t *)lrc; 109 blkptr_t *bp = &lr->lr_blkptr; 110 zbookmark_phys_t zb; 111 112 if (BP_IS_HOLE(bp)) 113 return (0); 114 115 if (claim_txg == 0 || bp->blk_birth < claim_txg) 116 return (0); 117 118 SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, 119 ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); 120 121 (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, 122 td->td_arg); 123 } 124 return (0); 125} 126 127static void 128traverse_zil(traverse_data_t *td, zil_header_t *zh) 129{ 130 uint64_t claim_txg = zh->zh_claim_txg; 131 132 /* 133 * We only want to visit blocks that have been claimed but not yet 134 * replayed; plus blocks that are already stable in read-only mode. 135 */ 136 if (claim_txg == 0 && spa_writeable(td->td_spa)) 137 return; 138 139 zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); 140 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, 141 claim_txg); 142 zil_free(zilog); 143} 144 145typedef enum resume_skip { 146 RESUME_SKIP_ALL, 147 RESUME_SKIP_NONE, 148 RESUME_SKIP_CHILDREN 149} resume_skip_t; 150 151/* 152 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and 153 * the block indicated by zb does not need to be visited at all. Returns 154 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the 155 * resume point. This indicates that this block should be visited but not its 156 * children (since they must have been visited in a previous traversal). 157 * Otherwise returns RESUME_SKIP_NONE. 
158 */ 159static resume_skip_t 160resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, 161 const zbookmark_phys_t *zb) 162{ 163 if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { 164 /* 165 * If we already visited this bp & everything below, 166 * don't bother doing it again. 167 */ 168 if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) 169 return (RESUME_SKIP_ALL); 170 171 /* 172 * If we found the block we're trying to resume from, zero 173 * the bookmark out to indicate that we have resumed. 174 */ 175 if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { 176 bzero(td->td_resume, sizeof (*zb)); 177 if (td->td_flags & TRAVERSE_POST) 178 return (RESUME_SKIP_CHILDREN); 179 } 180 } 181 return (RESUME_SKIP_NONE); 182} 183 184static void 185traverse_prefetch_metadata(traverse_data_t *td, 186 const blkptr_t *bp, const zbookmark_phys_t *zb) 187{ 188 arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; 189 190 if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) 191 return; 192 /* 193 * If we are in the process of resuming, don't prefetch, because 194 * some children will not be needed (and in fact may have already 195 * been freed). 
196 */ 197 if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) 198 return; 199 if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) 200 return; 201 if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) 202 return; 203 204 (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, 205 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 206} 207 208static boolean_t 209prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) 210{ 211 ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); 212 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || 213 BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) 214 return (B_FALSE); 215 return (B_TRUE); 216} 217 218static int 219traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, 220 const blkptr_t *bp, const zbookmark_phys_t *zb) 221{ 222 zbookmark_phys_t czb; 223 int err = 0; 224 arc_buf_t *buf = NULL; 225 prefetch_data_t *pd = td->td_pfd; 226 boolean_t hard = td->td_flags & TRAVERSE_HARD; 227 228 switch (resume_skip_check(td, dnp, zb)) { 229 case RESUME_SKIP_ALL: 230 return (0); 231 case RESUME_SKIP_CHILDREN: 232 goto post; 233 case RESUME_SKIP_NONE: 234 break; 235 default: 236 ASSERT(0); 237 } 238 239 if (bp->blk_birth == 0) { 240 /* 241 * Since this block has a birth time of 0 it must be one of 242 * two things: a hole created before the 243 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole 244 * which has always been a hole in an object. 245 * 246 * If a file is written sparsely, then the unwritten parts of 247 * the file were "always holes" -- that is, they have been 248 * holes since this object was allocated. However, we (and 249 * our callers) can not necessarily tell when an object was 250 * allocated. Therefore, if it's possible that this object 251 * was freed and then its object number reused, we need to 252 * visit all the holes with birth==0. 
253 * 254 * If it isn't possible that the object number was reused, 255 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote 256 * all the blocks we will visit as part of this traversal, 257 * then this hole must have always existed, so we can skip 258 * it. We visit blocks born after (exclusive) td_min_txg. 259 * 260 * Note that the meta-dnode cannot be reallocated. 261 */ 262 if (!send_holes_without_birth_time && 263 (!td->td_realloc_possible || 264 zb->zb_object == DMU_META_DNODE_OBJECT) && 265 td->td_hole_birth_enabled_txg <= td->td_min_txg) 266 return (0); 267 } else if (bp->blk_birth <= td->td_min_txg) { 268 return (0); 269 } 270 271 if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) { 272 uint64_t size = BP_GET_LSIZE(bp); 273 mutex_enter(&pd->pd_mtx); 274 ASSERT(pd->pd_bytes_fetched >= 0); 275 while (pd->pd_bytes_fetched < size && !pd->pd_exited) 276 cv_wait(&pd->pd_cv, &pd->pd_mtx); 277 pd->pd_bytes_fetched -= size; 278 cv_broadcast(&pd->pd_cv); 279 mutex_exit(&pd->pd_mtx); 280 } 281 282 if (BP_IS_HOLE(bp)) { 283 err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); 284 if (err != 0) 285 goto post; 286 return (0); 287 } 288 289 if (td->td_flags & TRAVERSE_PRE) { 290 err = td->td_func(td->td_spa, NULL, bp, zb, dnp, 291 td->td_arg); 292 if (err == TRAVERSE_VISIT_NO_CHILDREN) 293 return (0); 294 if (err != 0) 295 goto post; 296 } 297 298 if (BP_GET_LEVEL(bp) > 0) { 299 arc_flags_t flags = ARC_FLAG_WAIT; 300 int i; 301 blkptr_t *cbp; 302 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 303 304 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, 305 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 306 if (err != 0) 307 goto post; 308 cbp = buf->b_data; 309 310 for (i = 0; i < epb; i++) { 311 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 312 zb->zb_level - 1, 313 zb->zb_blkid * epb + i); 314 traverse_prefetch_metadata(td, &cbp[i], &czb); 315 } 316 317 /* recursively visitbp() blocks below this */ 318 for (i = 0; i < epb; 
i++) { 319 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 320 zb->zb_level - 1, 321 zb->zb_blkid * epb + i); 322 err = traverse_visitbp(td, dnp, &cbp[i], &czb); 323 if (err != 0) 324 break; 325 } 326 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 327 arc_flags_t flags = ARC_FLAG_WAIT; 328 int i; 329 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 330 331 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, 332 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 333 if (err != 0) 334 goto post; 335 dnode_phys_t *child_dnp = buf->b_data; 336 337 for (i = 0; i < epb; i++) { 338 prefetch_dnode_metadata(td, &child_dnp[i], 339 zb->zb_objset, zb->zb_blkid * epb + i); 340 } 341 342 /* recursively visitbp() blocks below this */ 343 for (i = 0; i < epb; i++) { 344 err = traverse_dnode(td, &child_dnp[i], 345 zb->zb_objset, zb->zb_blkid * epb + i); 346 if (err != 0) 347 break; 348 } 349 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 350 arc_flags_t flags = ARC_FLAG_WAIT; 351 352 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, 353 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 354 if (err != 0) 355 goto post; 356 357 objset_phys_t *osp = buf->b_data; 358 prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset, 359 DMU_META_DNODE_OBJECT); 360 /* 361 * See the block comment above for the goal of this variable. 362 * If the maxblkid of the meta-dnode is 0, then we know that 363 * we've never had more than DNODES_PER_BLOCK objects in the 364 * dataset, which means we can't have reused any object ids. 
365 */ 366 if (osp->os_meta_dnode.dn_maxblkid == 0) 367 td->td_realloc_possible = B_FALSE; 368 369 if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { 370 prefetch_dnode_metadata(td, &osp->os_groupused_dnode, 371 zb->zb_objset, DMU_GROUPUSED_OBJECT); 372 prefetch_dnode_metadata(td, &osp->os_userused_dnode, 373 zb->zb_objset, DMU_USERUSED_OBJECT); 374 } 375 376 err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset, 377 DMU_META_DNODE_OBJECT); 378 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { 379 err = traverse_dnode(td, &osp->os_groupused_dnode, 380 zb->zb_objset, DMU_GROUPUSED_OBJECT); 381 } 382 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { 383 err = traverse_dnode(td, &osp->os_userused_dnode, 384 zb->zb_objset, DMU_USERUSED_OBJECT); 385 } 386 } 387 388 if (buf) 389 arc_buf_destroy(buf, &buf); 390 391post: 392 if (err == 0 && (td->td_flags & TRAVERSE_POST)) 393 err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); 394 395 if (hard && (err == EIO || err == ECKSUM)) { 396 /* 397 * Ignore this disk error as requested by the HARD flag, 398 * and continue traversal. 399 */ 400 err = 0; 401 } 402 403 /* 404 * If we are stopping here, set td_resume. 405 */ 406 if (td->td_resume != NULL && err != 0 && !td->td_paused) { 407 td->td_resume->zb_objset = zb->zb_objset; 408 td->td_resume->zb_object = zb->zb_object; 409 td->td_resume->zb_level = 0; 410 /* 411 * If we have stopped on an indirect block (e.g. due to 412 * i/o error), we have not visited anything below it. 413 * Set the bookmark to the first level-0 block that we need 414 * to visit. This way, the resuming code does not need to 415 * deal with resuming from indirect blocks. 416 * 417 * Note, if zb_level <= 0, dnp may be NULL, so we don't want 418 * to dereference it. 
419 */ 420 td->td_resume->zb_blkid = zb->zb_blkid; 421 if (zb->zb_level > 0) { 422 td->td_resume->zb_blkid <<= zb->zb_level * 423 (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); 424 } 425 td->td_paused = B_TRUE; 426 } 427 428 return (err); 429} 430 431static void 432prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, 433 uint64_t objset, uint64_t object) 434{ 435 int j; 436 zbookmark_phys_t czb; 437 438 for (j = 0; j < dnp->dn_nblkptr; j++) { 439 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); 440 traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); 441 } 442 443 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 444 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); 445 traverse_prefetch_metadata(td, &dnp->dn_spill, &czb); 446 } 447} 448 449static int 450traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, 451 uint64_t objset, uint64_t object) 452{ 453 int j, err = 0; 454 zbookmark_phys_t czb; 455 456 if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && 457 object < td->td_resume->zb_object) 458 return (0); 459 460 if (td->td_flags & TRAVERSE_PRE) { 461 SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, 462 ZB_DNODE_BLKID); 463 err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, 464 td->td_arg); 465 if (err == TRAVERSE_VISIT_NO_CHILDREN) 466 return (0); 467 if (err != 0) 468 return (err); 469 } 470 471 for (j = 0; j < dnp->dn_nblkptr; j++) { 472 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); 473 err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); 474 if (err != 0) 475 break; 476 } 477 478 if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { 479 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); 480 err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); 481 } 482 483 if (err == 0 && (td->td_flags & TRAVERSE_POST)) { 484 SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, 485 ZB_DNODE_BLKID); 486 err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, 487 td->td_arg); 488 if (err == 
TRAVERSE_VISIT_NO_CHILDREN) 489 return (0); 490 if (err != 0) 491 return (err); 492 } 493 return (err); 494} 495 496/* ARGSUSED */ 497static int 498traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 499 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 500{ 501 prefetch_data_t *pfd = arg;
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015 Chunwei Chen. All rights reserved.
25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/dmu_objset.h> 29#include <sys/dmu_traverse.h> 30#include <sys/dsl_dataset.h> 31#include <sys/dsl_dir.h> 32#include <sys/dsl_pool.h> 33#include <sys/dnode.h> 34#include <sys/spa.h> 35#include <sys/spa_impl.h> 36#include <sys/zio.h> 37#include <sys/dmu_impl.h> 38#include <sys/sa.h> 39#include <sys/sa_impl.h> 40#include <sys/callb.h> 41#include <sys/zfeature.h> 42 43int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ 44boolean_t send_holes_without_birth_time = B_TRUE; 45 46#ifdef _KERNEL 47SYSCTL_DECL(_vfs_zfs); 48SYSCTL_UINT(_vfs_zfs, OID_AUTO, send_holes_without_birth_time, CTLFLAG_RWTUN, 49 &send_holes_without_birth_time, 0, "Send holes without birth time"); 50#endif 51 52typedef struct prefetch_data { 53 kmutex_t pd_mtx; 54 kcondvar_t pd_cv; 55 int32_t pd_bytes_fetched; 56 int pd_flags; 57 boolean_t pd_cancel; 58 boolean_t pd_exited; 59 zbookmark_phys_t pd_resume; 60} prefetch_data_t; 61 62typedef struct traverse_data { 63 spa_t *td_spa; 64 uint64_t td_objset; 65 blkptr_t *td_rootbp; 66 uint64_t td_min_txg; 67 zbookmark_phys_t *td_resume; 68 int td_flags; 69 prefetch_data_t *td_pfd; 70 boolean_t td_paused; 71 uint64_t td_hole_birth_enabled_txg; 72 blkptr_cb_t *td_func; 73 void *td_arg; 74 boolean_t td_realloc_possible; 75} traverse_data_t; 76 77static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, 78 uint64_t objset, uint64_t object); 79static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, 80 uint64_t objset, uint64_t object); 81 82static int 83traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 84{ 85 traverse_data_t *td = arg; 86 zbookmark_phys_t zb; 87 88 if (BP_IS_HOLE(bp)) 89 return (0); 90 91 if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa)) 92 return (-1); 93 94 SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, 95 bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 96 97 (void) td->td_func(td->td_spa, zilog, bp, 
&zb, NULL, td->td_arg); 98 99 return (0); 100} 101 102static int 103traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) 104{ 105 traverse_data_t *td = arg; 106 107 if (lrc->lrc_txtype == TX_WRITE) { 108 lr_write_t *lr = (lr_write_t *)lrc; 109 blkptr_t *bp = &lr->lr_blkptr; 110 zbookmark_phys_t zb; 111 112 if (BP_IS_HOLE(bp)) 113 return (0); 114 115 if (claim_txg == 0 || bp->blk_birth < claim_txg) 116 return (0); 117 118 SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, 119 ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); 120 121 (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, 122 td->td_arg); 123 } 124 return (0); 125} 126 127static void 128traverse_zil(traverse_data_t *td, zil_header_t *zh) 129{ 130 uint64_t claim_txg = zh->zh_claim_txg; 131 132 /* 133 * We only want to visit blocks that have been claimed but not yet 134 * replayed; plus blocks that are already stable in read-only mode. 135 */ 136 if (claim_txg == 0 && spa_writeable(td->td_spa)) 137 return; 138 139 zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); 140 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, 141 claim_txg); 142 zil_free(zilog); 143} 144 145typedef enum resume_skip { 146 RESUME_SKIP_ALL, 147 RESUME_SKIP_NONE, 148 RESUME_SKIP_CHILDREN 149} resume_skip_t; 150 151/* 152 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and 153 * the block indicated by zb does not need to be visited at all. Returns 154 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the 155 * resume point. This indicates that this block should be visited but not its 156 * children (since they must have been visited in a previous traversal). 157 * Otherwise returns RESUME_SKIP_NONE. 
158 */ 159static resume_skip_t 160resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, 161 const zbookmark_phys_t *zb) 162{ 163 if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { 164 /* 165 * If we already visited this bp & everything below, 166 * don't bother doing it again. 167 */ 168 if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) 169 return (RESUME_SKIP_ALL); 170 171 /* 172 * If we found the block we're trying to resume from, zero 173 * the bookmark out to indicate that we have resumed. 174 */ 175 if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { 176 bzero(td->td_resume, sizeof (*zb)); 177 if (td->td_flags & TRAVERSE_POST) 178 return (RESUME_SKIP_CHILDREN); 179 } 180 } 181 return (RESUME_SKIP_NONE); 182} 183 184static void 185traverse_prefetch_metadata(traverse_data_t *td, 186 const blkptr_t *bp, const zbookmark_phys_t *zb) 187{ 188 arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; 189 190 if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) 191 return; 192 /* 193 * If we are in the process of resuming, don't prefetch, because 194 * some children will not be needed (and in fact may have already 195 * been freed). 
196 */ 197 if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) 198 return; 199 if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) 200 return; 201 if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) 202 return; 203 204 (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, 205 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 206} 207 208static boolean_t 209prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) 210{ 211 ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); 212 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || 213 BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) 214 return (B_FALSE); 215 return (B_TRUE); 216} 217 218static int 219traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, 220 const blkptr_t *bp, const zbookmark_phys_t *zb) 221{ 222 zbookmark_phys_t czb; 223 int err = 0; 224 arc_buf_t *buf = NULL; 225 prefetch_data_t *pd = td->td_pfd; 226 boolean_t hard = td->td_flags & TRAVERSE_HARD; 227 228 switch (resume_skip_check(td, dnp, zb)) { 229 case RESUME_SKIP_ALL: 230 return (0); 231 case RESUME_SKIP_CHILDREN: 232 goto post; 233 case RESUME_SKIP_NONE: 234 break; 235 default: 236 ASSERT(0); 237 } 238 239 if (bp->blk_birth == 0) { 240 /* 241 * Since this block has a birth time of 0 it must be one of 242 * two things: a hole created before the 243 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole 244 * which has always been a hole in an object. 245 * 246 * If a file is written sparsely, then the unwritten parts of 247 * the file were "always holes" -- that is, they have been 248 * holes since this object was allocated. However, we (and 249 * our callers) can not necessarily tell when an object was 250 * allocated. Therefore, if it's possible that this object 251 * was freed and then its object number reused, we need to 252 * visit all the holes with birth==0. 
253 * 254 * If it isn't possible that the object number was reused, 255 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote 256 * all the blocks we will visit as part of this traversal, 257 * then this hole must have always existed, so we can skip 258 * it. We visit blocks born after (exclusive) td_min_txg. 259 * 260 * Note that the meta-dnode cannot be reallocated. 261 */ 262 if (!send_holes_without_birth_time && 263 (!td->td_realloc_possible || 264 zb->zb_object == DMU_META_DNODE_OBJECT) && 265 td->td_hole_birth_enabled_txg <= td->td_min_txg) 266 return (0); 267 } else if (bp->blk_birth <= td->td_min_txg) { 268 return (0); 269 } 270 271 if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) { 272 uint64_t size = BP_GET_LSIZE(bp); 273 mutex_enter(&pd->pd_mtx); 274 ASSERT(pd->pd_bytes_fetched >= 0); 275 while (pd->pd_bytes_fetched < size && !pd->pd_exited) 276 cv_wait(&pd->pd_cv, &pd->pd_mtx); 277 pd->pd_bytes_fetched -= size; 278 cv_broadcast(&pd->pd_cv); 279 mutex_exit(&pd->pd_mtx); 280 } 281 282 if (BP_IS_HOLE(bp)) { 283 err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); 284 if (err != 0) 285 goto post; 286 return (0); 287 } 288 289 if (td->td_flags & TRAVERSE_PRE) { 290 err = td->td_func(td->td_spa, NULL, bp, zb, dnp, 291 td->td_arg); 292 if (err == TRAVERSE_VISIT_NO_CHILDREN) 293 return (0); 294 if (err != 0) 295 goto post; 296 } 297 298 if (BP_GET_LEVEL(bp) > 0) { 299 arc_flags_t flags = ARC_FLAG_WAIT; 300 int i; 301 blkptr_t *cbp; 302 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 303 304 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, 305 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 306 if (err != 0) 307 goto post; 308 cbp = buf->b_data; 309 310 for (i = 0; i < epb; i++) { 311 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 312 zb->zb_level - 1, 313 zb->zb_blkid * epb + i); 314 traverse_prefetch_metadata(td, &cbp[i], &czb); 315 } 316 317 /* recursively visitbp() blocks below this */ 318 for (i = 0; i < epb; 
i++) { 319 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 320 zb->zb_level - 1, 321 zb->zb_blkid * epb + i); 322 err = traverse_visitbp(td, dnp, &cbp[i], &czb); 323 if (err != 0) 324 break; 325 } 326 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 327 arc_flags_t flags = ARC_FLAG_WAIT; 328 int i; 329 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 330 331 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, 332 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 333 if (err != 0) 334 goto post; 335 dnode_phys_t *child_dnp = buf->b_data; 336 337 for (i = 0; i < epb; i++) { 338 prefetch_dnode_metadata(td, &child_dnp[i], 339 zb->zb_objset, zb->zb_blkid * epb + i); 340 } 341 342 /* recursively visitbp() blocks below this */ 343 for (i = 0; i < epb; i++) { 344 err = traverse_dnode(td, &child_dnp[i], 345 zb->zb_objset, zb->zb_blkid * epb + i); 346 if (err != 0) 347 break; 348 } 349 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 350 arc_flags_t flags = ARC_FLAG_WAIT; 351 352 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, 353 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 354 if (err != 0) 355 goto post; 356 357 objset_phys_t *osp = buf->b_data; 358 prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset, 359 DMU_META_DNODE_OBJECT); 360 /* 361 * See the block comment above for the goal of this variable. 362 * If the maxblkid of the meta-dnode is 0, then we know that 363 * we've never had more than DNODES_PER_BLOCK objects in the 364 * dataset, which means we can't have reused any object ids. 
365 */ 366 if (osp->os_meta_dnode.dn_maxblkid == 0) 367 td->td_realloc_possible = B_FALSE; 368 369 if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { 370 prefetch_dnode_metadata(td, &osp->os_groupused_dnode, 371 zb->zb_objset, DMU_GROUPUSED_OBJECT); 372 prefetch_dnode_metadata(td, &osp->os_userused_dnode, 373 zb->zb_objset, DMU_USERUSED_OBJECT); 374 } 375 376 err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset, 377 DMU_META_DNODE_OBJECT); 378 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { 379 err = traverse_dnode(td, &osp->os_groupused_dnode, 380 zb->zb_objset, DMU_GROUPUSED_OBJECT); 381 } 382 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { 383 err = traverse_dnode(td, &osp->os_userused_dnode, 384 zb->zb_objset, DMU_USERUSED_OBJECT); 385 } 386 } 387 388 if (buf) 389 arc_buf_destroy(buf, &buf); 390 391post: 392 if (err == 0 && (td->td_flags & TRAVERSE_POST)) 393 err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); 394 395 if (hard && (err == EIO || err == ECKSUM)) { 396 /* 397 * Ignore this disk error as requested by the HARD flag, 398 * and continue traversal. 399 */ 400 err = 0; 401 } 402 403 /* 404 * If we are stopping here, set td_resume. 405 */ 406 if (td->td_resume != NULL && err != 0 && !td->td_paused) { 407 td->td_resume->zb_objset = zb->zb_objset; 408 td->td_resume->zb_object = zb->zb_object; 409 td->td_resume->zb_level = 0; 410 /* 411 * If we have stopped on an indirect block (e.g. due to 412 * i/o error), we have not visited anything below it. 413 * Set the bookmark to the first level-0 block that we need 414 * to visit. This way, the resuming code does not need to 415 * deal with resuming from indirect blocks. 416 * 417 * Note, if zb_level <= 0, dnp may be NULL, so we don't want 418 * to dereference it. 
419 */ 420 td->td_resume->zb_blkid = zb->zb_blkid; 421 if (zb->zb_level > 0) { 422 td->td_resume->zb_blkid <<= zb->zb_level * 423 (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); 424 } 425 td->td_paused = B_TRUE; 426 } 427 428 return (err); 429} 430 431static void 432prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, 433 uint64_t objset, uint64_t object) 434{ 435 int j; 436 zbookmark_phys_t czb; 437 438 for (j = 0; j < dnp->dn_nblkptr; j++) { 439 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); 440 traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); 441 } 442 443 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 444 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); 445 traverse_prefetch_metadata(td, &dnp->dn_spill, &czb); 446 } 447} 448 449static int 450traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, 451 uint64_t objset, uint64_t object) 452{ 453 int j, err = 0; 454 zbookmark_phys_t czb; 455 456 if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && 457 object < td->td_resume->zb_object) 458 return (0); 459 460 if (td->td_flags & TRAVERSE_PRE) { 461 SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, 462 ZB_DNODE_BLKID); 463 err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, 464 td->td_arg); 465 if (err == TRAVERSE_VISIT_NO_CHILDREN) 466 return (0); 467 if (err != 0) 468 return (err); 469 } 470 471 for (j = 0; j < dnp->dn_nblkptr; j++) { 472 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); 473 err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); 474 if (err != 0) 475 break; 476 } 477 478 if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { 479 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); 480 err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); 481 } 482 483 if (err == 0 && (td->td_flags & TRAVERSE_POST)) { 484 SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, 485 ZB_DNODE_BLKID); 486 err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, 487 td->td_arg); 488 if (err == 
TRAVERSE_VISIT_NO_CHILDREN) 489 return (0); 490 if (err != 0) 491 return (err); 492 } 493 return (err); 494} 495 496/* ARGSUSED */ 497static int 498traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 499 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 500{ 501 prefetch_data_t *pfd = arg;
|
504 ASSERT(pfd->pd_bytes_fetched >= 0); 505 if (bp == NULL) 506 return (0); 507 if (pfd->pd_cancel) 508 return (SET_ERROR(EINTR)); 509 510 if (!prefetch_needed(pfd, bp)) 511 return (0); 512 513 mutex_enter(&pfd->pd_mtx); 514 while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max) 515 cv_wait(&pfd->pd_cv, &pfd->pd_mtx); 516 pfd->pd_bytes_fetched += BP_GET_LSIZE(bp); 517 cv_broadcast(&pfd->pd_cv); 518 mutex_exit(&pfd->pd_mtx); 519 520 (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 521 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); 522 523 return (0); 524} 525

/*
 * Taskq entry point for the prefetch walk.
 *
 * Works on a private copy of the caller's traverse_data_t in which the
 * callback is traverse_prefetcher, the callback argument is the shared
 * prefetch_data_t, td_pfd is cleared, and the resume bookmark is the
 * prefetch state's own pd_resume.  Once the walk returns, pd_exited is set
 * under pd_mtx and pd_cv is broadcast so a waiter on pd_exited can proceed.
 */
static void
traverse_prefetch_thread(void *arg)
{
	traverse_data_t *td_main = arg;
	traverse_data_t td = *td_main;
	prefetch_data_t *pfd = td_main->td_pfd;
	zbookmark_phys_t czb;

	td.td_func = traverse_prefetcher;
	td.td_arg = pfd;
	td.td_pfd = NULL;
	td.td_resume = &pfd->pd_resume;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);

	mutex_enter(&pfd->pd_mtx);
	pfd->pd_exited = B_TRUE;
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);
}

/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 *
 * Common implementation behind the traverse_*() entry points.  Sets up the
 * traversal and prefetch state, walks the ZIL first when 'ds' is a
 * non-snapshot dataset with a non-hole root block pointer, optionally
 * dispatches traverse_prefetch_thread() (TRAVERSE_PREFETCH_DATA), and then
 * walks the tree rooted at 'rootbp', calling 'func' with 'arg'.
 *
 * Returns 0 on success, the arc_read() error if the objset block cannot be
 * read for the ZIL walk, or the result of traverse_visitbp().
 */
static int
traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	traverse_data_t td;
	prefetch_data_t pd = { 0 };
	zbookmark_phys_t czb;
	int err;

	ASSERT(ds == NULL || objset == ds->ds_object);
	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));

	td.td_spa = spa;
	td.td_objset = objset;
	td.td_rootbp = rootbp;
	td.td_min_txg = txg_start;
	td.td_resume = resume;
	td.td_func = func;
	td.td_arg = arg;
	td.td_pfd = &pd;
	td.td_flags = flags;
	td.td_paused = B_FALSE;
	td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);

	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
		VERIFY(spa_feature_enabled_txg(spa,
		    SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
	} else {
		td.td_hole_birth_enabled_txg = UINT64_MAX;
	}

	pd.pd_flags = flags;
	if (resume != NULL)
		pd.pd_resume = *resume;
	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);

	/* See comment on ZIL traversal in dsl_scan_visitds. */
	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
		/* Named 'aflags' so it does not shadow the 'flags' argument. */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		objset_phys_t *osp;
		arc_buf_t *buf;

		err = arc_read(NULL, td.td_spa, rootbp,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, NULL);
		/*
		 * The prefetch task has not been dispatched yet, so there is
		 * nothing to wait for; still release pd_mtx and pd_cv, which
		 * were initialized above, instead of returning directly.
		 */
		if (err != 0)
			goto out;

		osp = buf->b_data;
		traverse_zil(&td, &osp->os_zil_header);
		arc_buf_destroy(buf, &buf);
	}

	/*
	 * If prefetching was not requested, or taskq_dispatch() returned 0,
	 * mark the prefetcher as already exited so the wait below does not
	 * block.
	 */
	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
	    &td, TQ_NOQUEUE))
		pd.pd_exited = B_TRUE;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	err = traverse_visitbp(&td, NULL, rootbp, &czb);

	/*
	 * Cancel the prefetcher and wait until it reports pd_exited; 'td' and
	 * 'pd' live on this stack frame and are referenced by that task.
	 */
	mutex_enter(&pd.pd_mtx);
	pd.pd_cancel = B_TRUE;
	cv_broadcast(&pd.pd_cv);
	while (!pd.pd_exited)
		cv_wait(&pd.pd_cv, &pd.pd_mtx);
	mutex_exit(&pd.pd_mtx);

out:
	mutex_destroy(&pd.pd_mtx);
	cv_destroy(&pd.pd_cv);

	return (err);
}

/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 *
 * Traverse the dataset 'ds' starting from its ds_bp root block pointer,
 * passing 'resume' through to traverse_impl() (may be NULL).
 */
int
traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
    zbookmark_phys_t *resume,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
	    &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
}

/*
 * Traverse the dataset 'ds' without a resume bookmark; equivalent to
 * traverse_dataset_resume() with resume == NULL.
 */
int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
}

/*
 * Traverse the tree rooted at 'blkptr' with no dataset handle, using
 * ZB_DESTROYED_OBJSET as the objset id in bookmarks.
 */
int
traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
	    blkptr, txg_start, resume, flags, func, arg));
}

/*
 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
 *
 * Traverse the MOS from the pool's root block pointer, then every MOS
 * object whose bonus type is DMU_OT_DSL_DATASET.  Each dataset is traversed
 * from the later of 'txg_start' and its ds_prev_snap_txg.
 *
 * With TRAVERSE_HARD set, a failure to look up or hold an object skips to
 * the next object instead of ending the walk.  An ESRCH left in 'err' when
 * the loop ends is reported as success.
 */
int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
{
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;
	boolean_t hard = (flags & TRAVERSE_HARD);

	/* visit the MOS */
	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
	    txg_start, NULL, flags, func, arg);
	if (err != 0)
		return (err);

	/* visit each dataset */
	for (uint64_t obj = 1; err == 0;
	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
		dmu_object_info_t doi;

		err = dmu_object_info(mos, obj, &doi);
		if (err != 0) {
			/* 'continue' re-runs dmu_object_next(), resetting err */
			if (hard)
				continue;
			break;
		}

		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
			dsl_dataset_t *ds;
			uint64_t txg = txg_start;

			dsl_pool_config_enter(dp, FTAG);
			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
			dsl_pool_config_exit(dp, FTAG);
			if (err != 0) {
				if (hard)
					continue;
				break;
			}
			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
			err = traverse_dataset(ds, txg, flags, func, arg);
			dsl_dataset_rele(ds, FTAG);
			if (err != 0)
				break;
		}
	}
	if (err == ESRCH)
		err = 0;
	return (err);
}
| 505 ASSERT(pfd->pd_bytes_fetched >= 0); 506 if (bp == NULL) 507 return (0); 508 if (pfd->pd_cancel) 509 return (SET_ERROR(EINTR)); 510 511 if (!prefetch_needed(pfd, bp)) 512 return (0); 513 514 mutex_enter(&pfd->pd_mtx); 515 while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max) 516 cv_wait(&pfd->pd_cv, &pfd->pd_mtx); 517 pfd->pd_bytes_fetched += BP_GET_LSIZE(bp); 518 cv_broadcast(&pfd->pd_cv); 519 mutex_exit(&pfd->pd_mtx); 520 521 (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 522 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); 523 524 return (0); 525} 526

/*
 * Taskq entry point for the prefetch walk.
 *
 * Works on a private copy of the caller's traverse_data_t in which the
 * callback is traverse_prefetcher, the callback argument is the shared
 * prefetch_data_t, td_pfd is cleared, and the resume bookmark is the
 * prefetch state's own pd_resume.  Once the walk returns, pd_exited is set
 * under pd_mtx and pd_cv is broadcast so a waiter on pd_exited can proceed.
 */
static void
traverse_prefetch_thread(void *arg)
{
	traverse_data_t *td_main = arg;
	traverse_data_t td = *td_main;
	prefetch_data_t *pfd = td_main->td_pfd;
	zbookmark_phys_t czb;

	td.td_func = traverse_prefetcher;
	td.td_arg = pfd;
	td.td_pfd = NULL;
	td.td_resume = &pfd->pd_resume;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);

	mutex_enter(&pfd->pd_mtx);
	pfd->pd_exited = B_TRUE;
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);
}

/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 *
 * Common implementation behind the traverse_*() entry points.  Sets up the
 * traversal and prefetch state, walks the ZIL first when 'ds' is a
 * non-snapshot dataset with a non-hole root block pointer, optionally
 * dispatches traverse_prefetch_thread() (TRAVERSE_PREFETCH_DATA), and then
 * walks the tree rooted at 'rootbp', calling 'func' with 'arg'.
 *
 * Returns 0 on success, the arc_read() error if the objset block cannot be
 * read for the ZIL walk, or the result of traverse_visitbp().
 */
static int
traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	traverse_data_t td;
	prefetch_data_t pd = { 0 };
	zbookmark_phys_t czb;
	int err;

	ASSERT(ds == NULL || objset == ds->ds_object);
	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));

	td.td_spa = spa;
	td.td_objset = objset;
	td.td_rootbp = rootbp;
	td.td_min_txg = txg_start;
	td.td_resume = resume;
	td.td_func = func;
	td.td_arg = arg;
	td.td_pfd = &pd;
	td.td_flags = flags;
	td.td_paused = B_FALSE;
	td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);

	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
		VERIFY(spa_feature_enabled_txg(spa,
		    SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
	} else {
		td.td_hole_birth_enabled_txg = UINT64_MAX;
	}

	pd.pd_flags = flags;
	if (resume != NULL)
		pd.pd_resume = *resume;
	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);

	/* See comment on ZIL traversal in dsl_scan_visitds. */
	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
		/* Named 'aflags' so it does not shadow the 'flags' argument. */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		objset_phys_t *osp;
		arc_buf_t *buf;

		err = arc_read(NULL, td.td_spa, rootbp,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, NULL);
		/*
		 * The prefetch task has not been dispatched yet, so there is
		 * nothing to wait for; still release pd_mtx and pd_cv, which
		 * were initialized above, instead of returning directly.
		 */
		if (err != 0)
			goto out;

		osp = buf->b_data;
		traverse_zil(&td, &osp->os_zil_header);
		arc_buf_destroy(buf, &buf);
	}

	/*
	 * If prefetching was not requested, or taskq_dispatch() returned 0,
	 * mark the prefetcher as already exited so the wait below does not
	 * block.
	 */
	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
	    &td, TQ_NOQUEUE))
		pd.pd_exited = B_TRUE;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	err = traverse_visitbp(&td, NULL, rootbp, &czb);

	/*
	 * Cancel the prefetcher and wait until it reports pd_exited; 'td' and
	 * 'pd' live on this stack frame and are referenced by that task.
	 */
	mutex_enter(&pd.pd_mtx);
	pd.pd_cancel = B_TRUE;
	cv_broadcast(&pd.pd_cv);
	while (!pd.pd_exited)
		cv_wait(&pd.pd_cv, &pd.pd_mtx);
	mutex_exit(&pd.pd_mtx);

out:
	mutex_destroy(&pd.pd_mtx);
	cv_destroy(&pd.pd_cv);

	return (err);
}

/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 *
 * Traverse the dataset 'ds' starting from its ds_bp root block pointer,
 * passing 'resume' through to traverse_impl() (may be NULL).
 */
int
traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
    zbookmark_phys_t *resume,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
	    &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
}

/*
 * Traverse the dataset 'ds' without a resume bookmark; equivalent to
 * traverse_dataset_resume() with resume == NULL.
 */
int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
}

/*
 * Traverse the tree rooted at 'blkptr' with no dataset handle, using
 * ZB_DESTROYED_OBJSET as the objset id in bookmarks.
 */
int
traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
	    blkptr, txg_start, resume, flags, func, arg));
}

/*
 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
 *
 * Traverse the MOS from the pool's root block pointer, then every MOS
 * object whose bonus type is DMU_OT_DSL_DATASET.  Each dataset is traversed
 * from the later of 'txg_start' and its ds_prev_snap_txg.
 *
 * With TRAVERSE_HARD set, a failure to look up or hold an object skips to
 * the next object instead of ending the walk.  An ESRCH left in 'err' when
 * the loop ends is reported as success.
 */
int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
{
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;
	boolean_t hard = (flags & TRAVERSE_HARD);

	/* visit the MOS */
	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
	    txg_start, NULL, flags, func, arg);
	if (err != 0)
		return (err);

	/* visit each dataset */
	for (uint64_t obj = 1; err == 0;
	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
		dmu_object_info_t doi;

		err = dmu_object_info(mos, obj, &doi);
		if (err != 0) {
			/* 'continue' re-runs dmu_object_next(), resetting err */
			if (hard)
				continue;
			break;
		}

		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
			dsl_dataset_t *ds;
			uint64_t txg = txg_start;

			dsl_pool_config_enter(dp, FTAG);
			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
			dsl_pool_config_exit(dp, FTAG);
			if (err != 0) {
				if (hard)
					continue;
				break;
			}
			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
			err = traverse_dataset(ds, txg, flags, func, arg);
			dsl_dataset_rele(ds, FTAG);
			if (err != 0)
				break;
		}
	}
	if (err == ESRCH)
		err = 0;
	return (err);
}
|