/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
#include <sys/ddt.h>

typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);

static scrub_cb_t dsl_pool_scrub_clean_cb;
static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
    uint64_t objset, uint64_t object);

int zfs_scrub_min_time_ms = 1000;	/* min millisecs to scrub per txg */
int zfs_resilver_min_time_ms = 3000;	/* min millisecs to resilver per txg */
boolean_t zfs_no_scrub_io = B_FALSE;	/* set to disable scrub i/o */
boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;

extern int zfs_txg_timeout;

static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
	NULL,
	dsl_pool_scrub_clean_cb
};

/* ARGSUSED */
static void
dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = arg1;
	enum scrub_func *funcp = arg2;
	dmu_object_type_t ot = 0;
	boolean_t complete = B_FALSE;

	dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);

	ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
	ASSERT(*funcp > SCRUB_FUNC_NONE);
	ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);

	dp->dp_scrub_min_txg = 0;
	dp->dp_scrub_max_txg = tx->tx_txg;
	dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max;

	if (*funcp == SCRUB_FUNC_CLEAN) {
		vdev_t *rvd = dp->dp_spa->spa_root_vdev;

		/* rewrite all disk labels */
		vdev_config_dirty(rvd);

		if (vdev_resilver_needed(rvd,
		    &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
			spa_event_notify(dp->dp_spa, NULL,
			    ESC_ZFS_RESILVER_START);
			dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
			    tx->tx_txg);
		} else {
			spa_event_notify(dp->dp_spa, NULL,
			    ESC_ZFS_SCRUB_START);
		}

		/* zero out the scrub stats in all vdev_stat_t's */
		vdev_scrub_stat_update(rvd,
		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
		    POOL_SCRUB_EVERYTHING, B_FALSE);

		/*
		 * If this is an incremental scrub, limit the DDT scrub phase
		 * to just the auto-ditto class (for correctness); the rest
		 * of the scrub should go faster using top-down pruning.
		 */
		if (dp->dp_scrub_min_txg > TXG_INITIAL)
			dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO;

		dp->dp_spa->spa_scrub_started = B_TRUE;
	}

	/* back to the generic stuff */

	if (dp->dp_blkstats == NULL) {
		dp->dp_blkstats =
		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
	}
	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));

	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
		ot = DMU_OT_ZAP_OTHER;

	dp->dp_scrub_func = *funcp;
	dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
	dp->dp_scrub_restart = B_FALSE;
	dp->dp_spa->spa_scrub_errors = 0;

	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
	    &dp->dp_scrub_func, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
	    &dp->dp_scrub_queue_obj, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
	    &dp->dp_scrub_min_txg, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
	    &dp->dp_scrub_max_txg, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
	    &dp->dp_scrub_bookmark, tx));
	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
	    &dp->dp_scrub_ddt_bookmark, tx));
	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
	    &dp->dp_scrub_ddt_class_max, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
	    &dp->dp_spa->spa_scrub_errors, tx));

	spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
	    "func=%u mintxg=%llu maxtxg=%llu",
	    *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
}

int
dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
{
	return (dsl_sync_task_do(dp, NULL,
	    dsl_pool_scrub_setup_sync, dp, &func, 0));
}

/* ARGSUSED */
static void
dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = arg1;
	boolean_t *completep = arg2;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	mutex_enter(&dp->dp_scrub_cancel_lock);

	if (dp->dp_scrub_restart) {
		dp->dp_scrub_restart = B_FALSE;
		*completep = B_FALSE;
	}

	/* XXX this is scrub-clean specific */
	mutex_enter(&dp->dp_spa->spa_scrub_lock);
	while (dp->dp_spa->spa_scrub_inflight > 0) {
		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
		    &dp->dp_spa->spa_scrub_lock);
	}
	mutex_exit(&dp->dp_spa->spa_scrub_lock);
	dp->dp_spa->spa_scrub_started = B_FALSE;
	dp->dp_spa->spa_scrub_active = B_FALSE;

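	/*
	 * Tear down the remaining scrub state: reset the in-core state and
	 * remove the on-disk queue object and pool-directory ZAP entries.
	 */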
	dp->dp_scrub_func = SCRUB_FUNC_NONE;
	VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
	    dp->dp_scrub_queue_obj, tx));
	dp->dp_scrub_queue_obj = 0;
	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));

	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_QUEUE, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MIN_TXG, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MAX_TXG, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_FUNC, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_ERRORS, tx));

	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_DDT_BOOKMARK, tx);
	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_DDT_CLASS_MAX, tx);

	spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
	    "complete=%u", *completep);

	/* below is scrub-clean specific */
	vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
	    *completep);
	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
	    *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
	if (*completep)
		spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
		    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
	spa_errlog_rotate(dp->dp_spa);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);

	dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
	mutex_exit(&dp->dp_scrub_cancel_lock);
}

int
dsl_pool_scrub_cancel(dsl_pool_t *dp)
{
	boolean_t complete = B_FALSE;

	return (dsl_sync_task_do(dp, NULL,
	    dsl_pool_scrub_cancel_sync, dp, &complete, 3));
}

void
dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
{
	/*
	 * This function will be used by bp-rewrite wad to intercept frees.
	 */
	zio_free(dp->dp_spa, txg, bpp);
}

static boolean_t
bookmark_is_zero(const zbookmark_t *zb)
{
	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
	    zb->zb_level == 0 && zb->zb_blkid == 0);
}

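/*
 * Return B_TRUE if the subtree rooted at bookmark zb1 (which may point at
 * an indirect block) lies entirely before the level-0 bookmark zb2 in
 * traversal order, i.e. it had already been fully visited when the scrub
 * was last paused at zb2.
 */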
/* dnp is the dnode for zb1->zb_object */
static boolean_t
bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
    const zbookmark_t *zb2)
{
	uint64_t zb1nextL0, zb2thisobj;

	ASSERT(zb1->zb_objset == zb2->zb_objset);
	ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT);
	ASSERT(zb2->zb_level == 0);

	/*
	 * A bookmark in the deadlist is considered to be after
	 * everything else.
	 */
	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
		return (B_TRUE);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
		return (B_FALSE);

	zb1nextL0 = (zb1->zb_blkid + 1) <<
	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));

	zb2thisobj = zb2->zb_object ? zb2->zb_object :
	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);

	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t nextobj = zb1nextL0 *
		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
		return (nextobj <= zb2thisobj);
	}

	if (zb1->zb_object < zb2thisobj)
		return (B_TRUE);
	if (zb1->zb_object > zb2thisobj)
		return (B_FALSE);
	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
		return (B_FALSE);
	return (zb1nextL0 <= zb2->zb_blkid);
}

static boolean_t
scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb)
{
	uint64_t elapsed_nanosecs;
	int mintime;

	if (dp->dp_scrub_pausing)
		return (B_TRUE);	/* we're already pausing */

	if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
		return (B_FALSE);	/* we're resuming */

	/* We only know how to resume from level-0 blocks. */
	if (zb != NULL && zb->zb_level != 0)
		return (B_FALSE);

	mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time_ms :
	    zfs_scrub_min_time_ms;
	elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time;
	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
	    (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) {
		if (zb) {
			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			dp->dp_scrub_bookmark = *zb;
		}
		if (ddb) {
			dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
			    (longlong_t)ddb->ddb_class,
			    (longlong_t)ddb->ddb_type,
			    (longlong_t)ddb->ddb_checksum,
			    (longlong_t)ddb->ddb_cursor);
			ASSERT(&dp->dp_scrub_ddt_bookmark == ddb);
		}
		dp->dp_scrub_pausing = B_TRUE;
		return (B_TRUE);
	}
	return (B_FALSE);
}

typedef struct zil_traverse_arg {
	dsl_pool_t	*zta_dp;
	zil_header_t	*zta_zh;
} zil_traverse_arg_t;

/* ARGSUSED */
static int
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	zil_traverse_arg_t *zta = arg;
	dsl_pool_t *dp = zta->zta_dp;
	zil_header_t *zh = zta->zta_zh;
	zbookmark_t zb;

	if (bp->blk_birth <= dp->dp_scrub_min_txg)
		return (0);

	/*
	 * One block ("stubby") can be allocated a long time ago; we
	 * want to visit that one because it has been allocated
	 * (on-disk) even if it hasn't been claimed (even though for
	 * plain scrub there's nothing to do to it).
	 */
	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
		return (0);

	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
	return (0);
}

/* ARGSUSED */
static int
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	if (lrc->lrc_txtype == TX_WRITE) {
		zil_traverse_arg_t *zta = arg;
		dsl_pool_t *dp = zta->zta_dp;
		zil_header_t *zh = zta->zta_zh;
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_t zb;

		if (bp->blk_birth <= dp->dp_scrub_min_txg)
			return (0);

		/*
		 * birth can be < claim_txg if this record's txg is
		 * already txg sync'ed (but this log block contains
		 * other records that are not synced)
		 */
		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
		    lr->lr_foid, ZB_ZIL_LEVEL,
		    lr->lr_offset / BP_GET_LSIZE(bp));

		VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
	}
	return (0);
}

static void
traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;
	zil_traverse_arg_t zta = { dp, zh };
	zilog_t *zilog;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed (or, in read-only mode, blocks that *would* be claimed).
	 */
	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
		return;

	zilog = zil_alloc(dp->dp_meta_objset, zh);

	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
	    claim_txg);

	zil_free(zilog);
}

static void
scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset,
    uint64_t object, uint64_t blkid)
{
	zbookmark_t czb;
	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;

	if (zfs_no_scrub_prefetch)
		return;

	if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
		return;

	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);

	(void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp,
	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
	    &flags, &czb);
}

static void
scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
{
	int err;
	arc_buf_t *buf = NULL;

	if (bp->blk_birth <= dp->dp_scrub_min_txg)
		return;

	if (scrub_pause(dp, zb, NULL))
		return;

	if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
		/*
		 * If we already visited this bp & everything below (in
		 * a prior txg), don't bother doing it again.
		 */
		if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
			return;

		/*
		 * If we found the block we're trying to resume from, or
		 * we went past it to a different object, zero it out to
		 * indicate that it's OK to start checking for pausing
		 * again.
		 */
		if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
		    zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
			dprintf("resuming at %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
		}
	}

	/*
	 * If dsl_pool_scrub_ddt() has already scrubbed this block,
	 * don't scrub it again.
	 */
	if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp))
		(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);

	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err) {
			mutex_enter(&dp->dp_spa->spa_scrub_lock);
			dp->dp_spa->spa_scrub_errors++;
			mutex_exit(&dp->dp_spa->spa_scrub_lock);
			return;
		}
		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
			scrub_prefetch(dp, buf, cbp, zb->zb_objset,
			    zb->zb_object, zb->zb_blkid * epb + i);
		}
		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
			zbookmark_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			scrub_visitbp(dp, dnp, buf, cbp, &czb);
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		uint32_t flags = ARC_WAIT;
		dnode_phys_t *cdnp;
		int i, j;
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err) {
			mutex_enter(&dp->dp_spa->spa_scrub_lock);
			dp->dp_spa->spa_scrub_errors++;
			mutex_exit(&dp->dp_spa->spa_scrub_lock);
			return;
		}
		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
			for (j = 0; j < cdnp->dn_nblkptr; j++) {
				blkptr_t *cbp = &cdnp->dn_blkptr[j];
				scrub_prefetch(dp, buf, cbp, zb->zb_objset,
				    zb->zb_blkid * epb + i, j);
			}
		}
		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
			scrub_visitdnode(dp, cdnp, buf, zb->zb_objset,
			    zb->zb_blkid * epb + i);
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		uint32_t flags = ARC_WAIT;
		objset_phys_t *osp;

		err = arc_read_nolock(NULL, dp->dp_spa, bp,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err) {
			mutex_enter(&dp->dp_spa->spa_scrub_lock);
			dp->dp_spa->spa_scrub_errors++;
			mutex_exit(&dp->dp_spa->spa_scrub_lock);
			return;
		}

		osp = buf->b_data;

		traverse_zil(dp, &osp->os_zil_header);

		scrub_visitdnode(dp, &osp->os_meta_dnode,
		    buf, zb->zb_objset, DMU_META_DNODE_OBJECT);
		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
			scrub_visitdnode(dp, &osp->os_userused_dnode,
			    buf, zb->zb_objset, DMU_USERUSED_OBJECT);
			scrub_visitdnode(dp, &osp->os_groupused_dnode,
			    buf, zb->zb_objset, DMU_GROUPUSED_OBJECT);
		}
	}

	if (buf)
		(void) arc_buf_remove_ref(buf, &buf);
}

static void
scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
    uint64_t objset, uint64_t object)
{
	int j;

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		zbookmark_t czb;

		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
	}
}

static void
scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
{
	zbookmark_t zb;

	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	scrub_visitbp(dp, NULL, NULL, bp, &zb);
}

void
dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
		SET_BOOKMARK(&dp->dp_scrub_bookmark,
		    ZB_DESTROYED_OBJSET, 0, 0, 0);
	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
	    ds->ds_object, tx) != 0) {
		return;
	}

	if (ds->ds_phys->ds_next_snap_obj != 0) {
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
	}
	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
}

void
dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);

	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
		dp->dp_scrub_bookmark.zb_objset =
		    ds->ds_phys->ds_prev_snap_obj;
	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
	    ds->ds_object, tx) == 0) {
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_phys->ds_prev_snap_obj, tx) == 0);
	}
}

void
dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds1->ds_dir->dd_pool;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
		dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
	} else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
		dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
	}

	if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
	    ds1->ds_object, tx) == 0) {
		int err = zap_add_int(dp->dp_meta_objset,
		    dp->dp_scrub_queue_obj, ds2->ds_object, tx);
		VERIFY(err == 0 || err == EEXIST);
		if (err == EEXIST) {
			/* Both were there to begin with */
			VERIFY(0 == zap_add_int(dp->dp_meta_objset,
			    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
		}
	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
	    ds2->ds_object, tx) == 0) {
		VERIFY(0 == zap_add_int(dp->dp_meta_objset,
		    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
	}
}

struct enqueue_clones_arg {
	dmu_tx_t *tx;
	uint64_t originobj;
};

/* ARGSUSED */
static int
enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	struct enqueue_clones_arg *eca = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_pool_t *dp;

	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
	if (err)
		return (err);
	dp = ds->ds_dir->dd_pool;

	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
			dsl_dataset_t *prev;
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);

			dsl_dataset_rele(ds, FTAG);
			if (err)
				return (err);
			ds = prev;
		}
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_object, eca->tx) == 0);
	}
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

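/*
 * Scrub a single dataset: visit its root bp, restricting the txg range to
 * blocks born after its previous snapshot, then enqueue its descendent
 * datasets (the next snapshot and any clones) in the work queue.
 */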
static void
scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
{
	dsl_dataset_t *ds;
	uint64_t min_txg_save;

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));

	/*
	 * Iterate over the bps in this ds.
	 */
	min_txg_save = dp->dp_scrub_min_txg;
	dp->dp_scrub_min_txg =
	    MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
	scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
	dp->dp_scrub_min_txg = min_txg_save;

	if (dp->dp_scrub_pausing)
		goto out;

	/*
	 * Add descendent datasets to work queue.
	 */
	if (ds->ds_phys->ds_next_snap_obj != 0) {
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
	}
	if (ds->ds_phys->ds_num_children > 1) {
		boolean_t usenext = B_FALSE;
		if (ds->ds_phys->ds_next_clones_obj != 0) {
			uint64_t count;
			/*
			 * A bug in a previous version of the code could
			 * cause upgrade_clones_cb() to not set
			 * ds_next_snap_obj when it should, leading to a
			 * missing entry.  Therefore we can only use the
			 * next_clones_obj when its count is correct.
			 */
			int err = zap_count(dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj, &count);
			if (err == 0 &&
			    count == ds->ds_phys->ds_num_children - 1)
				usenext = B_TRUE;
		}

		if (usenext) {
			VERIFY(zap_join(dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj,
			    dp->dp_scrub_queue_obj, tx) == 0);
		} else {
			struct enqueue_clones_arg eca;
			eca.tx = tx;
			eca.originobj = ds->ds_object;

			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
		}
	}

out:
	dsl_dataset_rele(ds, FTAG);
}

/* ARGSUSED */
static int
enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_pool_t *dp;

	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
	if (err)
		return (err);

	dp = ds->ds_dir->dd_pool;

	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		dsl_dataset_t *prev;
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		/*
		 * If this is a clone, we don't need to worry about it for now.
		 */
		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			dsl_dataset_rele(prev, FTAG);
			return (0);
		}
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
	}

	VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
	    ds->ds_object, tx) == 0);
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

/*
 * Scrub/dedup interaction.
 *
 * If there are N references to a deduped block, we don't want to scrub it
 * N times -- ideally, we should scrub it exactly once.
 *
 * We leverage the fact that the dde's replication class (enum ddt_class)
 * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
 * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
 *
 * To prevent excess scrubbing, the scrub begins by walking the DDT
 * to find all blocks with refcnt > 1, and scrubs each of these once.
 * Since there are two replication classes which contain blocks with
 * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
 * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
 *
 * There would be nothing more to say if a block's refcnt couldn't change
 * during a scrub, but of course it can so we must account for changes
 * in a block's replication class.
 *
 * Here's an example of what can occur:
 *
 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
 * when visited during the top-down scrub phase, it will be scrubbed twice.
 * This negates our scrub optimization, but is otherwise harmless.
 *
 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
 * on each visit during the top-down scrub phase, it will never be scrubbed.
 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
 * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
 * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
 * while a scrub is in progress, it scrubs the block right then.
 */
static void
dsl_pool_scrub_ddt(dsl_pool_t *dp)
{
	ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark;
	ddt_entry_t dde;
	int error;

	while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) {
		if (ddb->ddb_class > dp->dp_scrub_ddt_class_max)
			return;
		dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde);
		if (scrub_pause(dp, NULL, ddb))
			return;
	}
	ASSERT(error == ENOENT);
	ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max);
}

void
dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
    const ddt_entry_t *dde)
{
	const ddt_key_t *ddk = &dde->dde_key;
	const ddt_phys_t *ddp = dde->dde_phys;
	blkptr_t blk;
	zbookmark_t zb = { 0 };

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (ddp->ddp_phys_birth == 0)
			continue;
		ddt_bp_create(checksum, ddk, ddp, &blk);
		scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
	}
}

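/*
 * Do one txg's worth of scrub work: run the DDT phase first, then resume
 * the top-down traversal from the saved bookmark and keep pulling datasets
 * off the work queue.  On pause the bookmarks and error count are saved
 * back to the pool directory; on completion the scrub state is torn down.
 */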
void
dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
	spa_t *spa = dp->dp_spa;
	zap_cursor_t zc;
	zap_attribute_t za;
	boolean_t complete = B_TRUE;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	/*
	 * If the pool is not loaded, or is trying to unload, leave it alone.
	 */
	if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa))
		return;

	if (dp->dp_scrub_restart) {
		enum scrub_func func = dp->dp_scrub_func;
		dp->dp_scrub_restart = B_FALSE;
		dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
	}

	if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
		/*
		 * We must have resumed after rebooting; reset the vdev
		 * stats to know that we're doing a scrub (although it
		 * will think we're just starting now).
		 */
		vdev_scrub_stat_update(spa->spa_root_vdev,
		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
		    POOL_SCRUB_EVERYTHING, B_FALSE);
	}

	dp->dp_scrub_pausing = B_FALSE;
	dp->dp_scrub_start_time = gethrtime();
	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
	spa->spa_scrub_active = B_TRUE;

	if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) {
		dsl_pool_scrub_ddt(dp);
		if (dp->dp_scrub_pausing)
			goto out;
	}

	if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
		/* First do the MOS & ORIGIN */
		scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
		if (dp->dp_scrub_pausing)
			goto out;

		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
			VERIFY(0 == dmu_objset_find_spa(spa,
			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
		} else {
			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
		}
		ASSERT(!dp->dp_scrub_pausing);
	} else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) {
		/*
		 * If we were paused, continue from here.  Note if the ds
		 * we were paused on was destroyed, the zb_objset will be
		 * ZB_DESTROYED_OBJSET, so we will skip this and find a new
		 * objset below.
		 */
		scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
		if (dp->dp_scrub_pausing)
			goto out;
	}

	/*
	 * In case we were paused right at the end of the ds, zero the
	 * bookmark so we don't think that we're still trying to resume.
	 */
	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));

	/* keep pulling things out of the zap-object-as-queue */
	while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
	    zap_cursor_retrieve(&zc, &za) == 0) {
		VERIFY(0 == zap_remove(dp->dp_meta_objset,
		    dp->dp_scrub_queue_obj, za.za_name, tx));
		scrub_visitds(dp, za.za_first_integer, tx);
		if (dp->dp_scrub_pausing)
			break;
		zap_cursor_fini(&zc);
	}
	zap_cursor_fini(&zc);
	if (dp->dp_scrub_pausing)
		goto out;

	/* done. */

	dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
	return;
out:
	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
	    &dp->dp_scrub_bookmark, tx));
	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
	    &dp->dp_scrub_ddt_bookmark, tx));
	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
	    &dp->dp_scrub_ddt_class_max, tx));
	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
	    &spa->spa_scrub_errors, tx));
}

void
dsl_pool_scrub_restart(dsl_pool_t *dp)
{
	mutex_enter(&dp->dp_scrub_cancel_lock);
	dp->dp_scrub_restart = B_TRUE;
	mutex_exit(&dp->dp_scrub_cancel_lock);
}

/*
 * scrub consumers
 */

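/*
 * Accumulate block statistics by level and type: counts, sizes, gang
 * blocks, and ditto copies that ended up on the same top-level vdev.
 */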
static void
count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
	int i;

	/*
	 * If we resume after a reboot, zab will be NULL; don't record
	 * incomplete stats in that case.
	 */
	if (zab == NULL)
		return;

	for (i = 0; i < 4; i++) {
		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
		zfs_blkstat_t *zb = &zab->zab_type[l][t];
		int equal;

		zb->zb_count++;
		zb->zb_asize += BP_GET_ASIZE(bp);
		zb->zb_lsize += BP_GET_LSIZE(bp);
		zb->zb_psize += BP_GET_PSIZE(bp);
		zb->zb_gangs += BP_COUNT_GANG(bp);

		switch (BP_GET_NDVAS(bp)) {
		case 2:
			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1]))
				zb->zb_ditto_2_of_2_samevdev++;
			break;
		case 3:
			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) +
			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2])) +
			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2]));
			if (equal == 1)
				zb->zb_ditto_2_of_3_samevdev++;
			else if (equal == 3)
				zb->zb_ditto_3_of_3_samevdev++;
			break;
		}
	}
}

static void
dsl_pool_scrub_clean_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);

	if (zio->io_error && (zio->io_error != ECKSUM ||
	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)))
		spa->spa_scrub_errors++;
	mutex_exit(&spa->spa_scrub_lock);
}

static int
dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
    const blkptr_t *bp, const zbookmark_t *zb)
{
	size_t size = BP_GET_PSIZE(bp);
	spa_t *spa = dp->dp_spa;
	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
	boolean_t needs_io;
	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
	int zio_priority;

	if (phys_birth <= dp->dp_scrub_min_txg ||
	    phys_birth >= dp->dp_scrub_max_txg)
		return (0);

	count_block(dp->dp_blkstats, bp);

	if (dp->dp_scrub_isresilver == 0) {
		/* It's a scrub */
		zio_flags |= ZIO_FLAG_SCRUB;
		zio_priority = ZIO_PRIORITY_SCRUB;
		needs_io = B_TRUE;
	} else {
		/* It's a resilver */
		zio_flags |= ZIO_FLAG_RESILVER;
		zio_priority = ZIO_PRIORITY_RESILVER;
		needs_io = B_FALSE;
	}

	/* If it's an intent log block, failure is expected. */
	if (zb->zb_level == ZB_ZIL_LEVEL)
		zio_flags |= ZIO_FLAG_SPECULATIVE;

	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
		vdev_t *vd = vdev_lookup_top(spa,
		    DVA_GET_VDEV(&bp->blk_dva[d]));

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined +=
		    DVA_GET_ASIZE(&bp->blk_dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		/* if it's a resilver, this may not be in the target range */
		if (!needs_io) {
			if (DVA_GET_GANG(&bp->blk_dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best estimate we have is the
				 * scrub range, which has already been checked.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that all
				 * gang members reside on the same vdev.
				 */
				needs_io = B_TRUE;
			} else {
				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
				    phys_birth, 1);
			}
		}
	}

	if (needs_io && !zfs_no_scrub_io) {
		void *data = zio_data_buf_alloc(size);

		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
		spa->spa_scrub_inflight++;
		mutex_exit(&spa->spa_scrub_lock);

		zio_nowait(zio_read(NULL, spa, bp, data, size,
		    dsl_pool_scrub_clean_done, NULL, zio_priority,
		    zio_flags, zb));
	}

	/* do not relocate this block */
	return (0);
}

int
dsl_pool_scrub_clean(dsl_pool_t *dp)
{
	spa_t *spa = dp->dp_spa;

	/*
	 * Purge all vdev caches and probe all devices.  We do this here
	 * rather than in sync context because this requires a writer lock
	 * on the spa_config lock, which we can't do from sync context.  The
	 * spa_scrub_reopen flag indicates that vdev_open() should not
	 * attempt to start another scrub.
	 */
	spa_vdev_state_enter(spa, SCL_NONE);
	spa->spa_scrub_reopen = B_TRUE;
	vdev_reopen(spa->spa_root_vdev);
	spa->spa_scrub_reopen = B_FALSE;
	(void) spa_vdev_state_exit(spa, NULL, 0);

	return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
}