/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 */

#include <sys/dsl_scan.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
#include <sys/ddt.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif

typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
    const zbookmark_phys_t *);

static scan_cb_t dsl_scan_scrub_cb;
static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);

unsigned int zfs_top_maxinflight = 32;	/* maximum I/Os per top-level */
unsigned int zfs_resilver_delay = 2;	/* number of ticks to delay resilver */
unsigned int zfs_scrub_delay = 4;	/* number of ticks to delay scrub */
unsigned int zfs_scan_idle = 50;	/* idle window in clock ticks */

unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
    per txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.top_maxinflight", &zfs_top_maxinflight);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RW,
    &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev");
TUNABLE_INT("vfs.zfs.resilver_delay", &zfs_resilver_delay);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RW,
    &zfs_resilver_delay, 0, "Number of ticks to delay resilver");
TUNABLE_INT("vfs.zfs.scrub_delay", &zfs_scrub_delay);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RW,
    &zfs_scrub_delay, 0, "Number of ticks to delay scrub");
TUNABLE_INT("vfs.zfs.scan_idle", &zfs_scan_idle);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RW,
    &zfs_scan_idle, 0, "Idle scan window in clock ticks");
TUNABLE_INT("vfs.zfs.scan_min_time_ms", &zfs_scan_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RW,
    &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
TUNABLE_INT("vfs.zfs.free_min_time_ms", &zfs_free_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RW,
    &zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
TUNABLE_INT("vfs.zfs.resilver_min_time_ms", &zfs_resilver_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RW,
    &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
TUNABLE_INT("vfs.zfs.no_scrub_io", &zfs_no_scrub_io);
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RW,
    &zfs_no_scrub_io, 0, "Disable scrub I/O");
TUNABLE_INT("vfs.zfs.no_scrub_prefetch", &zfs_no_scrub_prefetch);
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RW,
    &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");

enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
uint64_t zfs_free_max_blocks = UINT64_MAX;
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
    &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG");

#define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)

extern int zfs_txg_timeout;
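
/*
 * Illustrative note (not part of the original source): the knobs above
 * are ordinary FreeBSD sysctls and loader tunables, so they would
 * typically be adjusted at runtime, e.g.
 * "sysctl vfs.zfs.scan_min_time_ms=3000", or set in /boot/loader.conf;
 * the value shown is only an example.
 */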

/*
 * Enable/disable the processing of the free_bpobj object.
 */
boolean_t zfs_free_bpobj_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN,
    &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing");

/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
	NULL,
	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
};

int
dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
{
	int err;
	dsl_scan_t *scn;
	spa_t *spa = dp->dp_spa;
	uint64_t f;

	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
	scn->scn_dp = dp;

	/*
	 * It's possible that we're resuming a scan after a reboot so
	 * make sure that the scan_async_destroying flag is initialized
	 * appropriately.
	 */
	ASSERT(!scn->scn_async_destroying);
	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
	    SPA_FEATURE_ASYNC_DESTROY);

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    "scrub_func", sizeof (uint64_t), 1, &f);
	if (err == 0) {
		/*
		 * There was an old-style scrub in progress.  Restart a
		 * new-style scrub from the beginning.
		 */
		scn->scn_restart_txg = txg;
		zfs_dbgmsg("old-style scrub was in progress; "
		    "restarting new-style scrub in txg %llu",
		    scn->scn_restart_txg);

		/*
		 * Load the queue obj from the old location so that it
		 * can be freed by dsl_scan_done().
		 */
		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    "scrub_queue", sizeof (uint64_t), 1,
		    &scn->scn_phys.scn_queue_obj);
	} else {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
		    &scn->scn_phys);
		if (err == ENOENT)
			return (0);
		else if (err)
			return (err);

		if (scn->scn_phys.scn_state == DSS_SCANNING &&
		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
			/*
			 * A new-type scrub was in progress on an old
			 * pool, and the pool was accessed by old
			 * software.  Restart from the beginning, since
			 * the old software may have changed the pool in
			 * the meantime.
			 */
			scn->scn_restart_txg = txg;
			zfs_dbgmsg("new-style scrub was modified "
			    "by old software; restarting in txg %llu",
			    scn->scn_restart_txg);
		}
	}

	spa_scan_stat_init(spa);
	return (0);
}

void
dsl_scan_fini(dsl_pool_t *dp)
{
	if (dp->dp_scan) {
		kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
		dp->dp_scan = NULL;
	}
}

/* ARGSUSED */
static int
dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
{
	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;

	if (scn->scn_phys.scn_state == DSS_SCANNING)
		return (SET_ERROR(EBUSY));

	return (0);
}
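
/*
 * Context note (assumption; the caller is beyond this excerpt): the
 * check/sync pair formed by dsl_scan_setup_check() above and
 * dsl_scan_setup_sync() below is intended to run as a dsl_sync_task,
 * i.e. a new scan would be kicked off by handing both functions plus
 * the desired pool_scan_func_t to dsl_sync_task() elsewhere in this
 * file.
 */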

static void
dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
{
	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
	pool_scan_func_t *funcp = arg;
	dmu_object_type_t ot = 0;
	dsl_pool_t *dp = scn->scn_dp;
	spa_t *spa = dp->dp_spa;

	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
	scn->scn_phys.scn_func = *funcp;
	scn->scn_phys.scn_state = DSS_SCANNING;
	scn->scn_phys.scn_min_txg = 0;
	scn->scn_phys.scn_max_txg = tx->tx_txg;
	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
	scn->scn_phys.scn_start_time = gethrestime_sec();
	scn->scn_phys.scn_errors = 0;
	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
	scn->scn_restart_txg = 0;
	scn->scn_done_txg = 0;
	spa_scan_stat_init(spa);

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;

		/* rewrite all disk labels */
		vdev_config_dirty(spa->spa_root_vdev);

		if (vdev_resilver_needed(spa->spa_root_vdev,
		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
			spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
		} else {
			spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
		}

		spa->spa_scrub_started = B_TRUE;
		/*
		 * If this is an incremental scrub, limit the DDT scrub phase
		 * to just the auto-ditto class (for correctness); the rest
		 * of the scrub should go faster using top-down pruning.
		 */
		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;

	}

	/* back to the generic stuff */

	if (dp->dp_blkstats == NULL) {
		dp->dp_blkstats =
		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
	}
	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));

	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
		ot = DMU_OT_ZAP_OTHER;

	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);

	dsl_scan_sync_state(scn, tx);

	spa_history_log_internal(spa, "scan setup", tx,
	    "func=%u mintxg=%llu maxtxg=%llu",
	    *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
}

/* ARGSUSED */
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
{
	static const char *old_names[] = {
		"scrub_bookmark",
		"scrub_ddt_bookmark",
		"scrub_ddt_class_max",
		"scrub_queue",
		"scrub_min_txg",
		"scrub_max_txg",
		"scrub_func",
		"scrub_errors",
		NULL
	};

	dsl_pool_t *dp = scn->scn_dp;
	spa_t *spa = dp->dp_spa;
	int i;

	/* Remove any remnants of an old-style scrub. */
	for (i = 0; old_names[i]; i++) {
		(void) zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
	}

	if (scn->scn_phys.scn_queue_obj != 0) {
		VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, tx));
		scn->scn_phys.scn_queue_obj = 0;
	}

	/*
	 * If we were "restarted" from a stopped state, don't bother
	 * with anything else.
	 */
	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (complete)
		scn->scn_phys.scn_state = DSS_FINISHED;
	else
		scn->scn_phys.scn_state = DSS_CANCELED;

	spa_history_log_internal(spa, "scan done", tx,
	    "complete=%u", complete);

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight > 0) {
			cv_wait(&spa->spa_scrub_io_cv,
			    &spa->spa_scrub_lock);
		}
		mutex_exit(&spa->spa_scrub_lock);
		spa->spa_scrub_started = B_FALSE;
		spa->spa_scrub_active = B_FALSE;

		/*
		 * If the scrub/resilver completed, update all DTLs to
		 * reflect this.  Whether it succeeded or not, vacate
		 * all temporary scrub DTLs.
		 */
		vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
		    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
		if (complete) {
			spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
			    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
		}
		spa_errlog_rotate(spa);

		/*
		 * We may have finished replacing a device.
		 * Let the async thread assess this and handle the detach.
		 */
		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
	}

	scn->scn_phys.scn_end_time = gethrestime_sec();
}

/* ARGSUSED */
static int
dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
{
	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return (SET_ERROR(ENOENT));
	return (0);
}

/* ARGSUSED */
static void
dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
{
	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;

	dsl_scan_done(scn, B_FALSE, tx);
	dsl_scan_sync_state(scn, tx);
}

int
dsl_scan_cancel(dsl_pool_t *dp)
{
	return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
	    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
}

static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
    dmu_objset_type_t ostype, dmu_tx_t *tx);
static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
    dmu_objset_type_t ostype,
    dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);

void
dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
{
	zio_free(dp->dp_spa, txg, bp);
}

void
dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
{
	ASSERT(dsl_pool_sync_context(dp));
	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
	    pio->io_flags));
}

static uint64_t
dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
{
	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
	if (ds->ds_is_snapshot)
		return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
	return (smt);
}

static void
dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
{
	VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
	    &scn->scn_phys, tx));
}

extern int zfs_vdev_async_write_active_min_dirty_percent;

static boolean_t
dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
{
	/* we never skip user/group accounting objects */
	if (zb && (int64_t)zb->zb_object < 0)
		return (B_FALSE);

	if (scn->scn_pausing)
		return (B_TRUE); /* we're already pausing */

	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
		return (B_FALSE); /* we're resuming */

	/* We only know how to resume from level-0 blocks. */
	if (zb && zb->zb_level != 0)
		return (B_FALSE);

	/*
	 * We pause if:
	 *  - we have scanned for the maximum time: an entire txg
	 *    timeout (default 5 sec)
	 *  or
	 *  - we have scanned for at least the minimum time (default 1 sec
	 *    for scrub, 3 sec for resilver), and either we have sufficient
	 *    dirty data that we are starting to write more quickly
	 *    (default 30%), or someone is explicitly waiting for this txg
	 *    to complete.
	 *  or
	 *  - the spa is shutting down because this pool is being exported
	 *    or the machine is rebooting.
	 */
	int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
	uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
	int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
	if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
	    (NSEC2MSEC(elapsed_nanosecs) > mintime &&
	    (txg_sync_waiting(scn->scn_dp) ||
	    dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
	    spa_shutting_down(scn->scn_dp->dp_spa)) {
		if (zb) {
			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			scn->scn_phys.scn_bookmark = *zb;
		}
		dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
		scn->scn_pausing = B_TRUE;
		return (B_TRUE);
	}
	return (B_FALSE);
}
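
/*
 * Worked example of the pause test above (hypothetical numbers): with
 * zfs_txg_timeout = 5 and a scrub that has run 1.2 sec in this txg,
 * the first clause does not fire; but since 1200 ms exceeds
 * zfs_scan_min_time_ms (1000 ms), the scan still pauses if another
 * thread is waiting on the txg, or if dirty data has crossed
 * zfs_vdev_async_write_active_min_dirty_percent (default 30%).
 */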

typedef struct zil_scan_arg {
	dsl_pool_t	*zsa_dp;
	zil_header_t	*zsa_zh;
} zil_scan_arg_t;

/* ARGSUSED */
static int
dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	zil_scan_arg_t *zsa = arg;
	dsl_pool_t *dp = zsa->zsa_dp;
	dsl_scan_t *scn = dp->dp_scan;
	zil_header_t *zh = zsa->zsa_zh;
	zbookmark_phys_t zb;

	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
		return (0);

	/*
	 * One block ("stubby") can be allocated a long time ago; we
	 * want to visit that one because it has been allocated
	 * (on-disk) even if it hasn't been claimed (even though for
	 * scrub there's nothing to do to it).
	 */
	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
		return (0);

	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
	return (0);
}

/* ARGSUSED */
static int
dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	if (lrc->lrc_txtype == TX_WRITE) {
		zil_scan_arg_t *zsa = arg;
		dsl_pool_t *dp = zsa->zsa_dp;
		dsl_scan_t *scn = dp->dp_scan;
		zil_header_t *zh = zsa->zsa_zh;
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_phys_t zb;

		if (BP_IS_HOLE(bp) ||
		    bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
			return (0);

		/*
		 * birth can be < claim_txg if this record's txg is
		 * already txg sync'ed (but this log block contains
		 * other records that are not synced)
		 */
		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
		    lr->lr_foid, ZB_ZIL_LEVEL,
		    lr->lr_offset / BP_GET_LSIZE(bp));

		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
	}
	return (0);
}

static void
dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;
	zil_scan_arg_t zsa = { dp, zh };
	zilog_t *zilog;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed (or, in read-only mode, blocks that *would* be claimed).
	 */
	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
		return;

	zilog = zil_alloc(dp->dp_meta_objset, zh);

	(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
	    claim_txg);

	zil_free(zilog);
}

/* ARGSUSED */
static void
dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
    uint64_t objset, uint64_t object, uint64_t blkid)
{
	zbookmark_phys_t czb;
	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;

	if (zfs_no_scrub_prefetch)
		return;

	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
		return;

	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);

	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
	    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
}

static boolean_t
dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
    const zbookmark_phys_t *zb)
{
	/*
	 * We never skip over user/group accounting objects (obj<0)
	 */
	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
	    (int64_t)zb->zb_object >= 0) {
		/*
		 * If we already visited this bp & everything below (in
		 * a prior txg sync), don't bother doing it again.
		 */
		if (zbookmark_subtree_completed(dnp, zb,
		    &scn->scn_phys.scn_bookmark))
			return (B_TRUE);

		/*
		 * If we found the block we're trying to resume from, or
		 * we went past it to a different object, zero it out to
		 * indicate that it's OK to start checking for pausing
		 * again.
		 */
		if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
		    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
			dprintf("resuming at %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
		}
	}
	return (B_FALSE);
}
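
/*
 * Illustrative example of the resume logic above (hypothetical
 * bookmark): if the previous txg paused at <objset 21, object 5,
 * level 0, blkid 1000>, then on the next pass every block whose
 * subtree was fully visited before that point is skipped by
 * zbookmark_subtree_completed(), and the bookmark is zeroed once the
 * traversal reaches (or passes) that exact block, re-enabling the
 * pause checks.
 */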

/*
 * Return nonzero on i/o error.
 * Return new buf to write out in *bufp.
 */
static int
dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
    dnode_phys_t *dnp, const blkptr_t *bp,
    const zbookmark_phys_t *zb, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
	int err;

	if (BP_GET_LEVEL(bp) > 0) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
		arc_buf_t *buf;

		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}
		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
			dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
			    zb->zb_object, zb->zb_blkid * epb + i);
		}
		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
			zbookmark_phys_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			dsl_scan_visitbp(cbp, &czb, dnp,
			    ds, scn, ostype, tx);
		}
		(void) arc_buf_remove_ref(buf, &buf);
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		dnode_phys_t *cdnp;
		int i, j;
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
		arc_buf_t *buf;

		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}
		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
			for (j = 0; j < cdnp->dn_nblkptr; j++) {
				blkptr_t *cbp = &cdnp->dn_blkptr[j];
				dsl_scan_prefetch(scn, buf, cbp,
				    zb->zb_objset, zb->zb_blkid * epb + i, j);
			}
		}
		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
			dsl_scan_visitdnode(scn, ds, ostype,
			    cdnp, zb->zb_blkid * epb + i, tx);
		}

		(void) arc_buf_remove_ref(buf, &buf);
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		objset_phys_t *osp;
		arc_buf_t *buf;

		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}

		osp = buf->b_data;

		dsl_scan_visitdnode(scn, ds, osp->os_type,
		    &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);

		if (OBJSET_BUF_HAS_USERUSED(buf)) {
			/*
			 * We also always visit user/group accounting
			 * objects, and never skip them, even if we are
			 * pausing.  This is necessary so that the space
			 * deltas from this txg get integrated.
			 */
			dsl_scan_visitdnode(scn, ds, osp->os_type,
			    &osp->os_groupused_dnode,
			    DMU_GROUPUSED_OBJECT, tx);
			dsl_scan_visitdnode(scn, ds, osp->os_type,
			    &osp->os_userused_dnode,
			    DMU_USERUSED_OBJECT, tx);
		}
		(void) arc_buf_remove_ref(buf, &buf);
	}

	return (0);
}

static void
dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
    dmu_objset_type_t ostype, dnode_phys_t *dnp,
    uint64_t object, dmu_tx_t *tx)
{
	int j;

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		zbookmark_phys_t czb;

		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
		    dnp->dn_nlevels - 1, j);
		dsl_scan_visitbp(&dnp->dn_blkptr[j],
		    &czb, dnp, ds, scn, ostype, tx);
	}

	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		zbookmark_phys_t czb;
		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
		    0, DMU_SPILL_BLKID);
		dsl_scan_visitbp(&dnp->dn_spill,
		    &czb, dnp, ds, scn, ostype, tx);
	}
}
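
/*
 * Note on the fan-out in dsl_scan_recurse() above (illustrative
 * numbers): an indirect block holds BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT
 * block pointers, so a 16K indirect carries 128 of them, and the child
 * at index i of blkid b is bookmarked as blkid b * 128 + i one level
 * down; dnode blocks fan out analogously via
 * BP_GET_LSIZE(bp) >> DNODE_SHIFT.
 */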

/*
 * The arguments are in this order because mdb can only print the
 * first 5; we want them to be useful.
 */
static void
dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
    dmu_objset_type_t ostype, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	arc_buf_t *buf = NULL;
	blkptr_t bp_toread = *bp;

	/* ASSERT(pbuf == NULL || arc_released(pbuf)); */

	if (dsl_scan_check_pause(scn, zb))
		return;

	if (dsl_scan_check_resume(scn, dnp, zb))
		return;

	if (BP_IS_HOLE(bp))
		return;

	scn->scn_visited_this_txg++;

	dprintf_bp(bp,
	    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
	    ds, ds ? ds->ds_object : 0,
	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
	    bp);

	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
		return;

	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0)
		return;

	/*
	 * If dsl_scan_ddt() has already visited this block, it will have
	 * already done any translations or scrubbing, so don't call the
	 * callback again.
	 */
	if (ddt_class_contains(dp->dp_spa,
	    scn->scn_phys.scn_ddt_class_max, bp)) {
		ASSERT(buf == NULL);
		return;
	}

	/*
	 * If this block is from the future (after cur_max_txg), then we
	 * are doing this on behalf of a deleted snapshot, and we will
	 * revisit the future block on the next pass of this dataset.
	 * Don't scan it now unless we need to because something
	 * under it was modified.
	 */
	if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
	}
}

static void
dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_tx_t *tx)
{
	zbookmark_phys_t zb;

	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	dsl_scan_visitbp(bp, &zb, NULL,
	    ds, scn, DMU_OST_NONE, tx);

	dprintf_ds(ds, "finished scan%s", "");
}

void
dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
		if (ds->ds_is_snapshot) {
			/* Note, scn_cur_{min,max}_txg stays the same. */
			scn->scn_phys.scn_bookmark.zb_objset =
			    dsl_dataset_phys(ds)->ds_next_snap_obj;
			zfs_dbgmsg("destroying ds %llu; currently traversing; "
			    "reset zb_objset to %llu",
			    (u_longlong_t)ds->ds_object,
			    (u_longlong_t)dsl_dataset_phys(ds)->
			    ds_next_snap_obj);
			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
		} else {
			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
			    ZB_DESTROYED_OBJSET, 0, 0, 0);
			zfs_dbgmsg("destroying ds %llu; currently traversing; "
			    "reset bookmark to -1,0,0,0",
			    (u_longlong_t)ds->ds_object);
		}
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
		ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
		if (ds->ds_is_snapshot) {
			/*
			 * We keep the same mintxg; it could be >
			 * ds_creation_txg if the previous snapshot was
			 * deleted too.
			 */
			VERIFY(zap_add_int_key(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj,
			    dsl_dataset_phys(ds)->ds_next_snap_obj,
			    mintxg, tx) == 0);
			zfs_dbgmsg("destroying ds %llu; in queue; "
			    "replacing with %llu",
			    (u_longlong_t)ds->ds_object,
			    (u_longlong_t)dsl_dataset_phys(ds)->
			    ds_next_snap_obj);
		} else {
			zfs_dbgmsg("destroying ds %llu; in queue; removing",
			    (u_longlong_t)ds->ds_object);
		}
	} else {
		zfs_dbgmsg("destroying ds %llu; ignoring",
		    (u_longlong_t)ds->ds_object);
	}

	/*
	 * dsl_scan_sync() should be called after this, and should sync
	 * out our changed state, but just to be safe, do it here.
	 */
	dsl_scan_sync_state(scn, tx);
}

void
dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);

	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
		scn->scn_phys.scn_bookmark.zb_objset =
		    dsl_dataset_phys(ds)->ds_prev_snap_obj;
		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds->ds_object,
		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
		zfs_dbgmsg("snapshotting ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds->ds_object,
		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
	}
	dsl_scan_sync_state(scn, tx);
}

void
dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
		scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds1->ds_object,
		    (u_longlong_t)ds2->ds_object);
	} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
		scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds2->ds_object,
		    (u_longlong_t)ds1->ds_object);
	}

	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
	    ds1->ds_object, &mintxg) == 0) {
		int err;

		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
		err = zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
		VERIFY(err == 0 || err == EEXIST);
		if (err == EEXIST) {
			/* Both were there to begin with */
			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj,
			    ds1->ds_object, mintxg, tx));
		}
		zfs_dbgmsg("clone_swap ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds1->ds_object,
		    (u_longlong_t)ds2->ds_object);
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
		zfs_dbgmsg("clone_swap ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds2->ds_object,
		    (u_longlong_t)ds1->ds_object);
	}

	dsl_scan_sync_state(scn, tx);
}

struct enqueue_clones_arg {
	dmu_tx_t *tx;
	uint64_t originobj;
};

/* ARGSUSED */
static int
enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	struct enqueue_clones_arg *eca = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_scan_t *scn = dp->dp_scan;

	if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
		return (0);

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
		dsl_dataset_t *prev;
		err = dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);

		dsl_dataset_rele(ds, FTAG);
		if (err)
			return (err);
		ds = prev;
	}
	VERIFY(zap_add_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds->ds_object,
	    dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
	dsl_dataset_rele(ds, FTAG);
	return (0);
}
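
/*
 * Example of the callback above (hypothetical layout): if clone "c" was
 * created from origin snapshot O and later snapshotted as c@s1 and
 * c@s2, the walk starts at the clone's head and follows
 * ds_prev_snap_obj back past c@s2 and c@s1 until it reaches the
 * dataset whose previous snapshot is O itself, then queues that
 * dataset with its ds_prev_snap_txg as the minimum txg to scan.
 */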

static void
dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	dsl_dataset_t *ds;
	objset_t *os;

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));

	if (dmu_objset_from_ds(ds, &os))
		goto out;

	/*
	 * Only the ZIL in the head (non-snapshot) is valid.  Even though
	 * snapshots can have ZIL block pointers (which may be the same
	 * BP as in the head), they must be ignored.  So we traverse the
	 * ZIL here, rather than in scan_recurse(), because the regular
	 * snapshot block-sharing rules don't apply to it.
	 */
	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
		dsl_scan_zil(dp, &os->os_zil_header);

	/*
	 * Iterate over the bps in this ds.
	 */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);

	char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
	dsl_dataset_name(ds, dsname);
	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
	    "pausing=%u",
	    (longlong_t)dsobj, dsname,
	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
	    (int)scn->scn_pausing);
	kmem_free(dsname, ZFS_MAXNAMELEN);

	if (scn->scn_pausing)
		goto out;

	/*
	 * We've finished this pass over this dataset.
	 */

	/*
	 * If we did not completely visit this dataset, do another pass.
	 */
	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
		zfs_dbgmsg("incomplete pass; visiting again");
		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object,
		    scn->scn_phys.scn_cur_max_txg, tx) == 0);
		goto out;
	}

	/*
	 * Add descendant datasets to work queue.
	 */
	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj,
		    dsl_dataset_phys(ds)->ds_next_snap_obj,
		    dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
	}
	if (dsl_dataset_phys(ds)->ds_num_children > 1) {
		boolean_t usenext = B_FALSE;
		if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
			uint64_t count;
			/*
			 * A bug in a previous version of the code could
			 * cause upgrade_clones_cb() to not set
			 * ds_next_snap_obj when it should, leading to a
			 * missing entry.  Therefore we can only use the
			 * next_clones_obj when its count is correct.
			 */
			int err = zap_count(dp->dp_meta_objset,
			    dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
			if (err == 0 &&
			    count == dsl_dataset_phys(ds)->ds_num_children - 1)
				usenext = B_TRUE;
		}

		if (usenext) {
			VERIFY0(zap_join_key(dp->dp_meta_objset,
			    dsl_dataset_phys(ds)->ds_next_clones_obj,
			    scn->scn_phys.scn_queue_obj,
			    dsl_dataset_phys(ds)->ds_creation_txg, tx));
		} else {
			struct enqueue_clones_arg eca;
			eca.tx = tx;
			eca.originobj = ds->ds_object;

			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
			    enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
		}
	}

out:
	dsl_dataset_rele(ds, FTAG);
}

/* ARGSUSED */
static int
enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_scan_t *scn = dp->dp_scan;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		dsl_dataset_t *prev;
		err = dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		/*
		 * If this is a clone, we don't need to worry about it for now.
		 */
		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			dsl_dataset_rele(prev, FTAG);
			return (0);
		}
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
	}

	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
	    ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

/*
 * Scrub/dedup interaction.
 *
 * If there are N references to a deduped block, we don't want to scrub it
 * N times -- ideally, we should scrub it exactly once.
 *
 * We leverage the fact that the dde's replication class (enum ddt_class)
 * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
 * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
 *
 * To prevent excess scrubbing, the scrub begins by walking the DDT
 * to find all blocks with refcnt > 1, and scrubs each of these once.
 * Since there are two replication classes which contain blocks with
 * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
 * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
 *
 * There would be nothing more to say if a block's refcnt couldn't change
 * during a scrub, but of course it can so we must account for changes
 * in a block's replication class.
 *
 * Here's an example of what can occur:
 *
 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
 * when visited during the top-down scrub phase, it will be scrubbed twice.
 * This negates our scrub optimization, but is otherwise harmless.
 *
 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
 * on each visit during the top-down scrub phase, it will never be scrubbed.
 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
 * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
 * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
 * while a scrub is in progress, it scrubs the block right then.
 */
static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
	ddt_entry_t dde = { 0 };
	int error;
	uint64_t n = 0;

	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
		ddt_t *ddt;

		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
			break;
		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
		    (longlong_t)ddb->ddb_class,
		    (longlong_t)ddb->ddb_type,
		    (longlong_t)ddb->ddb_checksum,
		    (longlong_t)ddb->ddb_cursor);

		/* There should be no pending changes to the dedup table */
		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
		ASSERT(avl_first(&ddt->ddt_tree) == NULL);

		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
		n++;

		if (dsl_scan_check_pause(scn, NULL))
			break;
	}

	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
	    (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
	    (int)scn->scn_pausing);

	ASSERT(error == 0 || error == ENOENT);
	ASSERT(error != ENOENT ||
	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
}
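
/*
 * Concrete illustration of the scheme described above: a deduped block
 * with refcnt == 3 lands in DDT_CLASS_DUPLICATE (or DDT_CLASS_DITTO)
 * and is scrubbed exactly once by the walk above; when the top-down
 * phase later reaches any of its three referencing bps,
 * dsl_scan_visitbp() sees ddt_class_contains() return true and skips
 * the scrub callback for it.
 */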

/* ARGSUSED */
void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
    ddt_entry_t *dde, dmu_tx_t *tx)
{
	const ddt_key_t *ddk = &dde->dde_key;
	ddt_phys_t *ddp = dde->dde_phys;
	blkptr_t bp;
	zbookmark_phys_t zb = { 0 };

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (ddp->ddp_phys_birth == 0 ||
		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
			continue;
		ddt_bp_create(checksum, ddk, ddp, &bp);

		scn->scn_visited_this_txg++;
		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
	}
}

static void
dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	zap_cursor_t zc;
	zap_attribute_t za;

	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
	    scn->scn_phys.scn_ddt_class_max) {
		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
		dsl_scan_ddt(scn, tx);
		if (scn->scn_pausing)
			return;
	}

	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
		/* First do the MOS & ORIGIN */

		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
		dsl_scan_visit_rootbp(scn, NULL,
		    &dp->dp_meta_rootbp, tx);
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
		if (scn->scn_pausing)
			return;

		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
			    enqueue_cb, tx, DS_FIND_CHILDREN));
		} else {
			dsl_scan_visitds(scn,
			    dp->dp_origin_snap->ds_object, tx);
		}
		ASSERT(!scn->scn_pausing);
	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
	    ZB_DESTROYED_OBJSET) {
		/*
		 * If we were paused, continue from here.  Note if the
		 * ds we were paused on was deleted, the zb_objset may
		 * be -1, so we will skip this and find a new objset
		 * below.
		 */
		dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
		if (scn->scn_pausing)
			return;
	}

	/*
	 * In case we were paused right at the end of the ds, zero the
	 * bookmark so we don't think that we're still trying to resume.
	 */
	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));

	/* keep pulling things out of the zap-object-as-queue */
	while (zap_cursor_init(&zc, dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj),
	    zap_cursor_retrieve(&zc, &za) == 0) {
		dsl_dataset_t *ds;
		uint64_t dsobj;

		dsobj = strtonum(za.za_name, NULL);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, dsobj, tx));

		/* Set up min/max txg */
		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		if (za.za_first_integer != 0) {
			scn->scn_phys.scn_cur_min_txg =
			    MAX(scn->scn_phys.scn_min_txg,
			    za.za_first_integer);
		} else {
			scn->scn_phys.scn_cur_min_txg =
			    MAX(scn->scn_phys.scn_min_txg,
			    dsl_dataset_phys(ds)->ds_prev_snap_txg);
		}
		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
		dsl_dataset_rele(ds, FTAG);

		dsl_scan_visitds(scn, dsobj, tx);
		zap_cursor_fini(&zc);
		if (scn->scn_pausing)
			return;
	}
	zap_cursor_fini(&zc);
}

static boolean_t
dsl_scan_free_should_pause(dsl_scan_t *scn)
{
	uint64_t elapsed_nanosecs;

	if (zfs_recover)
		return (B_FALSE);

	if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
		return (B_TRUE);

	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
	    (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
	    txg_sync_waiting(scn->scn_dp)) ||
	    spa_shutting_down(scn->scn_dp->dp_spa));
}

static int
dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg;

	if (!scn->scn_is_bptree ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
		if (dsl_scan_free_should_pause(scn))
			return (SET_ERROR(ERESTART));
	}

	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
	    dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
	scn->scn_visited_this_txg++;
	return (0);
}
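
/*
 * Throttling example for the free callback above (hypothetical value):
 * if vfs.zfs.free_max_blocks were set to 100000, an async-destroy pass
 * would free at most that many blocks in one txg; the callback then
 * returns ERESTART via dsl_scan_free_should_pause() and the remainder
 * is picked up again in the next txg.
 */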
boolean_t
dsl_scan_active(dsl_scan_t *scn)
{
	spa_t *spa = scn->scn_dp->dp_spa;
	uint64_t used = 0, comp, uncomp;

	if (spa->spa_load_state != SPA_LOAD_NONE)
		return (B_FALSE);
	if (spa_shutting_down(spa))
		return (B_FALSE);
	if (scn->scn_phys.scn_state == DSS_SCANNING ||
	    (scn->scn_async_destroying && !scn->scn_async_stalled))
		return (B_TRUE);

	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
		    &used, &comp, &uncomp);
	}
	return (used != 0);
}

void
dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
	dsl_scan_t *scn = dp->dp_scan;
	spa_t *spa = dp->dp_spa;
	int err = 0;

	/*
	 * Check for scn_restart_txg before checking spa_load_state, so
	 * that we can restart an old-style scan while the pool is being
	 * imported (see dsl_scan_init).
	 */
	if (scn->scn_restart_txg != 0 &&
	    scn->scn_restart_txg <= tx->tx_txg) {
		pool_scan_func_t func = POOL_SCAN_SCRUB;
		dsl_scan_done(scn, B_FALSE, tx);
		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
			func = POOL_SCAN_RESILVER;
		zfs_dbgmsg("restarting scan func=%u txg=%llu",
		    func, tx->tx_txg);
		dsl_scan_setup_sync(&func, tx);
	}

	/*
	 * If the scan is inactive due to a stalled async destroy, try again.
	 */
	if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
	    spa_sync_pass(dp->dp_spa) > 1)
		return;

	scn->scn_visited_this_txg = 0;
	scn->scn_pausing = B_FALSE;
	scn->scn_sync_start_time = gethrtime();
	spa->spa_scrub_active = B_TRUE;

	/*
	 * First process the async destroys. If we pause, don't do
	 * any scrubbing or resilvering. This ensures that there are no
	 * async destroys while we are scanning, so the scan code doesn't
	 * have to worry about traversing it. It is also faster to free the
	 * blocks than to scrub them.
	 */
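	/*
	 * dsl_scan_free_block_cb() returns ERESTART once
	 * dsl_scan_free_should_pause() fires, which stops the bpobj and
	 * bptree iterations below; the remaining frees are picked up
	 * again in a later txg.
	 */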
	if (zfs_free_bpobj_enabled &&
	    spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		scn->scn_is_bptree = B_FALSE;
		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
		    NULL, ZIO_FLAG_MUSTSUCCEED);
		err = bpobj_iterate(&dp->dp_free_bpobj,
		    dsl_scan_free_block_cb, scn, tx);
		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));

		if (err != 0 && err != ERESTART)
			zfs_panic_recover("error %u from bpobj_iterate()", err);
	}

	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
		ASSERT(scn->scn_async_destroying);
		scn->scn_is_bptree = B_TRUE;
		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
		    NULL, ZIO_FLAG_MUSTSUCCEED);
		err = bptree_iterate(dp->dp_meta_objset,
		    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
		VERIFY0(zio_wait(scn->scn_zio_root));

		if (err == EIO || err == ECKSUM) {
			err = 0;
		} else if (err != 0 && err != ERESTART) {
			zfs_panic_recover("error %u from "
			    "traverse_dataset_destroyed()", err);
		}

		if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
			/* finished; deactivate async destroy feature */
			spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
			ASSERT(!spa_feature_is_active(spa,
			    SPA_FEATURE_ASYNC_DESTROY));
			VERIFY0(zap_remove(dp->dp_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_BPTREE_OBJ, tx));
			VERIFY0(bptree_free(dp->dp_meta_objset,
			    dp->dp_bptree_obj, tx));
			dp->dp_bptree_obj = 0;
			scn->scn_async_destroying = B_FALSE;
			scn->scn_async_stalled = B_FALSE;
		} else {
			/*
			 * If we didn't make progress, mark the async
			 * destroy as stalled, so that we will not initiate
			 * a spa_sync() on its behalf. Note that we only
			 * check this if we are not finished, because if the
			 * bptree had no blocks for us to visit, we can
			 * finish without "making progress".
			 */
			scn->scn_async_stalled =
			    (scn->scn_visited_this_txg == 0);
		}
	}
	if (scn->scn_visited_this_txg) {
		zfs_dbgmsg("freed %llu blocks in %llums from "
		    "free_bpobj/bptree txg %llu; err=%d",
		    (longlong_t)scn->scn_visited_this_txg,
		    (longlong_t)
		    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
		    (longlong_t)tx->tx_txg, err);
		scn->scn_visited_this_txg = 0;

		/*
		 * Write out changes to the DDT that may be required as a
		 * result of the blocks freed. This ensures that the DDT
		 * is clean when a scrub/resilver runs.
		 */
		ddt_sync(spa, tx->tx_txg);
	}
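	/*
	 * Any remaining err (including ERESTART from a pause) means the
	 * async destroy work is unfinished, so skip the scrub/resilver
	 * work below and try again next sync.
	 */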
	if (err != 0)
		return;
	if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
	    (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
	    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
	    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
		/*
		 * We have finished background destroying, but there is still
		 * some space left in the dp_free_dir. Transfer this leaked
		 * space to the dp_leak_dir.
		 */
		if (dp->dp_leak_dir == NULL) {
			rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
			    LEAK_DIR_NAME, tx);
			VERIFY0(dsl_pool_open_special_dir(dp,
			    LEAK_DIR_NAME, &dp->dp_leak_dir));
			rrw_exit(&dp->dp_config_rwlock, FTAG);
		}
		dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
		    dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
		    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
		    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
		    -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
		    -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
		    -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
	}
	if (!scn->scn_async_destroying) {
		/* finished; verify that space accounting went to zero */
		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
	}

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

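	/*
	 * When traversal finishes, scn_done_txg is set to the following
	 * txg (see below); the scan is declared complete only once that
	 * txg actually syncs.
	 */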
	if (scn->scn_done_txg == tx->tx_txg) {
		ASSERT(!scn->scn_pausing);
		/* finished with scan */
		zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
		dsl_scan_done(scn, B_TRUE, tx);
		ASSERT3U(spa->spa_scrub_inflight, ==, 0);
		dsl_scan_sync_state(scn, tx);
		return;
	}

	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
	    scn->scn_phys.scn_ddt_class_max) {
		zfs_dbgmsg("doing scan sync txg %llu; "
		    "ddt bm=%llu/%llu/%llu/%llx",
		    (longlong_t)tx->tx_txg,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
	} else {
		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
		    (longlong_t)tx->tx_txg,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
	}

	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
	    NULL, ZIO_FLAG_CANFAIL);
	dsl_pool_config_enter(dp, FTAG);
	dsl_scan_visit(scn, tx);
	dsl_pool_config_exit(dp, FTAG);
	(void) zio_wait(scn->scn_zio_root);
	scn->scn_zio_root = NULL;

	zfs_dbgmsg("visited %llu blocks in %llums",
	    (longlong_t)scn->scn_visited_this_txg,
	    (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));

	if (!scn->scn_pausing) {
		scn->scn_done_txg = tx->tx_txg + 1;
		zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
		    tx->tx_txg, scn->scn_done_txg);
	}

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight > 0) {
			cv_wait(&spa->spa_scrub_io_cv,
			    &spa->spa_scrub_lock);
		}
		mutex_exit(&spa->spa_scrub_lock);
	}

	dsl_scan_sync_state(scn, tx);
}

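/*
 * Resilver restarts are requested by recording a txg in scn_restart_txg;
 * dsl_scan_sync() notices the request at the start of a later sync, tears
 * down any scan already in progress, and sets up the new one.
 */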
/*
 * This will start a new scan, or restart an existing one.
 */
void
dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
{
	if (txg == 0) {
		dmu_tx_t *tx;
		tx = dmu_tx_create_dd(dp->dp_mos_dir);
		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));

		txg = dmu_tx_get_txg(tx);
		dp->dp_scan->scn_restart_txg = txg;
		dmu_tx_commit(tx);
	} else {
		dp->dp_scan->scn_restart_txg = txg;
	}
	zfs_dbgmsg("restarting resilver txg=%llu", txg);
}

boolean_t
dsl_scan_resilvering(dsl_pool_t *dp)
{
	return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
}

/*
 * scrub consumers
 */

static void
count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
	int i;

	/*
	 * If we resume after a reboot, zab will be NULL; don't record
	 * incomplete stats in that case.
	 */
	if (zab == NULL)
		return;

	for (i = 0; i < 4; i++) {
		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
		if (t & DMU_OT_NEWTYPE)
			t = DMU_OT_OTHER;
		zfs_blkstat_t *zb = &zab->zab_type[l][t];
		int equal;

		zb->zb_count++;
		zb->zb_asize += BP_GET_ASIZE(bp);
		zb->zb_lsize += BP_GET_LSIZE(bp);
		zb->zb_psize += BP_GET_PSIZE(bp);
		zb->zb_gangs += BP_COUNT_GANG(bp);

		switch (BP_GET_NDVAS(bp)) {
		case 2:
			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1]))
				zb->zb_ditto_2_of_2_samevdev++;
			break;
		case 3:
			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) +
			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2])) +
			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2]));
			if (equal == 1)
				zb->zb_ditto_2_of_3_samevdev++;
			else if (equal == 3)
				zb->zb_ditto_3_of_3_samevdev++;
			break;
		}
	}
}

static void
dsl_scan_scrub_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);

	if (zio->io_error && (zio->io_error != ECKSUM ||
	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
		spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
	}
	mutex_exit(&spa->spa_scrub_lock);
}

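/*
 * Per-block callback for scrubs and resilvers: decide whether the block
 * needs to be read (and thereby repaired on error), throttle against
 * other pool activity, and issue the read without waiting for it.
 */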
static int
dsl_scan_scrub_cb(dsl_pool_t *dp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	dsl_scan_t *scn = dp->dp_scan;
	size_t size = BP_GET_PSIZE(bp);
	spa_t *spa = dp->dp_spa;
	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
	boolean_t needs_io;
	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
	unsigned int scan_delay = 0;

	if (phys_birth <= scn->scn_phys.scn_min_txg ||
	    phys_birth >= scn->scn_phys.scn_max_txg)
		return (0);

	count_block(dp->dp_blkstats, bp);

	if (BP_IS_EMBEDDED(bp))
		return (0);

	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
		zio_flags |= ZIO_FLAG_SCRUB;
		needs_io = B_TRUE;
		scan_delay = zfs_scrub_delay;
	} else {
		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
		zio_flags |= ZIO_FLAG_RESILVER;
		needs_io = B_FALSE;
		scan_delay = zfs_resilver_delay;
	}

	/* If it's an intent log block, failure is expected. */
	if (zb->zb_level == ZB_ZIL_LEVEL)
		zio_flags |= ZIO_FLAG_SPECULATIVE;

	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
		vdev_t *vd = vdev_lookup_top(spa,
		    DVA_GET_VDEV(&bp->blk_dva[d]));

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);

		/* if it's a resilver, this may not be in the target range */
		if (!needs_io) {
			if (DVA_GET_GANG(&bp->blk_dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best estimate we have is the
				 * scrub range, which has already been checked.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that all
				 * gang members reside on the same vdev.
				 */
				needs_io = B_TRUE;
			} else {
				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
				    phys_birth, 1);
			}
		}
	}

	if (needs_io && !zfs_no_scrub_io) {
		vdev_t *rvd = spa->spa_root_vdev;
		uint64_t maxinflight = rvd->vdev_children *
		    MAX(zfs_top_maxinflight, 1);
		void *data = zio_data_buf_alloc(size);

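		/*
		 * Cap in-flight scrub reads at zfs_top_maxinflight per
		 * top-level vdev, sleeping on spa_scrub_io_cv until
		 * dsl_scan_scrub_done() has retired enough I/Os.
		 */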
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight >= maxinflight)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
		spa->spa_scrub_inflight++;
		mutex_exit(&spa->spa_scrub_lock);

		/*
		 * If we're seeing recent (zfs_scan_idle) "important" I/Os
		 * then throttle our workload to limit the impact of a scan.
		 */
		if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
			delay(MAX((int)scan_delay, 0));

		zio_nowait(zio_read(NULL, spa, bp, data, size,
		    dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
		    zio_flags, zb));
	}

	/* do not relocate this block */
	return (0);
}

int
dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
{
	spa_t *spa = dp->dp_spa;

	/*
	 * Purge all vdev caches and probe all devices. We do this here
	 * rather than in sync context because this requires a writer lock
	 * on the spa_config lock, which we can't do from sync context. The
	 * spa_scrub_reopen flag indicates that vdev_open() should not
	 * attempt to start another scrub.
	 */
	spa_vdev_state_enter(spa, SCL_NONE);
	spa->spa_scrub_reopen = B_TRUE;
	vdev_reopen(spa->spa_root_vdev);
	spa->spa_scrub_reopen = B_FALSE;
	(void) spa_vdev_state_exit(spa, NULL, 0);

	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
}