1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 24 * Copyright 2016 Gary Mills 25 * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 26 * Copyright 2017 Joyent, Inc. 27 * Copyright (c) 2017 Datto Inc. 28 */ 29 30#include <sys/dsl_scan.h> 31#include <sys/dsl_pool.h> 32#include <sys/dsl_dataset.h> 33#include <sys/dsl_prop.h> 34#include <sys/dsl_dir.h> 35#include <sys/dsl_synctask.h> 36#include <sys/dnode.h> 37#include <sys/dmu_tx.h> 38#include <sys/dmu_objset.h> 39#include <sys/arc.h> 40#include <sys/zap.h> 41#include <sys/zio.h> 42#include <sys/zfs_context.h> 43#include <sys/fs/zfs.h> 44#include <sys/zfs_znode.h> 45#include <sys/spa_impl.h> 46#include <sys/vdev_impl.h> 47#include <sys/zil_impl.h> 48#include <sys/zio_checksum.h> 49#include <sys/ddt.h> 50#include <sys/sa.h> 51#include <sys/sa_impl.h> 52#include <sys/zfeature.h> 53#include <sys/abd.h>
| 1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 24 * Copyright 2016 Gary Mills 25 * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 26 * Copyright 2017 Joyent, Inc. 27 * Copyright (c) 2017 Datto Inc. 28 */ 29 30#include <sys/dsl_scan.h> 31#include <sys/dsl_pool.h> 32#include <sys/dsl_dataset.h> 33#include <sys/dsl_prop.h> 34#include <sys/dsl_dir.h> 35#include <sys/dsl_synctask.h> 36#include <sys/dnode.h> 37#include <sys/dmu_tx.h> 38#include <sys/dmu_objset.h> 39#include <sys/arc.h> 40#include <sys/zap.h> 41#include <sys/zio.h> 42#include <sys/zfs_context.h> 43#include <sys/fs/zfs.h> 44#include <sys/zfs_znode.h> 45#include <sys/spa_impl.h> 46#include <sys/vdev_impl.h> 47#include <sys/zil_impl.h> 48#include <sys/zio_checksum.h> 49#include <sys/ddt.h> 50#include <sys/sa.h> 51#include <sys/sa_impl.h> 52#include <sys/zfeature.h> 53#include <sys/abd.h>
|
| 54#include <sys/range_tree.h>
|
54#ifdef _KERNEL 55#include <sys/zfs_vfsops.h> 56#endif 57
| 55#ifdef _KERNEL 56#include <sys/zfs_vfsops.h> 57#endif 58
|
| 59/* 60 * Grand theory statement on scan queue sorting 61 * 62 * Scanning is implemented by recursively traversing all indirection levels 63 * in an object and reading all blocks referenced from said objects. This 64 * results in us approximately traversing the object from lowest logical 65 * offset to the highest. For best performance, we would want the logical 66 * blocks to be physically contiguous. However, this is frequently not the 67 * case with pools given the allocation patterns of copy-on-write filesystems. 68 * So instead, we put the I/Os into a reordering queue and issue them in a 69 * way that will most benefit physical disks (LBA-order). 70 * 71 * Queue management: 72 * 73 * Ideally, we would want to scan all metadata and queue up all block I/O 74 * prior to starting to issue it, because that allows us to do an optimal 75 * sorting job. This can however consume large amounts of memory. Therefore 76 * we continuously monitor the size of the queues and constrain them to 5% 77 * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this 78 * limit, we clear out a few of the largest extents at the head of the queues 79 * to make room for more scanning. Hopefully, these extents will be fairly 80 * large and contiguous, allowing us to approach sequential I/O throughput 81 * even without a fully sorted tree. 82 * 83 * Metadata scanning takes place in dsl_scan_visit(), which is called from 84 * dsl_scan_sync() every spa_sync(). If we have either fully scanned all 85 * metadata on the pool, or we need to make room in memory because our 86 * queues are too large, dsl_scan_visit() is postponed and 87 * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies 88 * that metadata scanning and queued I/O issuing are mutually exclusive. This 89 * allows us to provide maximum sequential I/O throughput for the majority of 90 * I/Os issued since sequential I/O performance is significantly negatively 91 * impacted if it is interleaved with random I/O. 92 * 93 * Implementation Notes 94 * 95 * One side effect of the queued scanning algorithm is that the scanning code 96 * needs to be notified whenever a block is freed. This is needed to allow 97 * the scanning code to remove these I/Os from the issuing queue. Additionally, 98 * we do not attempt to queue gang blocks to be issued sequentially since this 99 * is very hard to do and would have an extremely limited performance benefit. 100 * Instead, we simply issue gang I/Os as soon as we find them using the legacy 101 * algorithm. 102 * 103 * Backwards compatibility 104 * 105 * This new algorithm is backwards compatible with the legacy on-disk data 106 * structures (and therefore does not require a new feature flag). 107 * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan 108 * will stop scanning metadata (in logical order) and wait for all outstanding 109 * sorted I/O to complete. Once this is done, we write out a checkpoint 110 * bookmark, indicating that we have scanned everything logically before it. 111 * If the pool is imported on a machine without the new sorting algorithm, 112 * the scan simply resumes from the last checkpoint using the legacy algorithm. 113 */ 114
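A minimal, self-contained sketch of the reordering idea described above (illustration only, not part of the source): block I/Os are discovered in roughly logical order, buffered, and then issued in ascending on-disk offset (LBA) order. The demo_io_t type and demo_io_compare() below are hypothetical simplifications of the real scan_io_t and its sorting comparators.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Hypothetical, simplified stand-in for a queued scan I/O. */
typedef struct {
	uint64_t offset;	/* on-disk (LBA) offset of the block */
	uint64_t size;		/* allocated size of the block */
} demo_io_t;

/* Sort by on-disk offset so the disk sees mostly sequential reads. */
static int
demo_io_compare(const void *a, const void *b)
{
	const demo_io_t *ia = a, *ib = b;

	if (ia->offset < ib->offset)
		return (-1);
	return (ia->offset > ib->offset);
}

int
main(void)
{
	/* Blocks in the order the metadata traversal happened to find them. */
	demo_io_t q[] = {
		{ 1 << 24, 131072 }, { 1 << 20, 131072 },
		{ 3 << 20, 131072 }, { 1 << 22, 131072 },
	};
	size_t n = sizeof (q) / sizeof (q[0]);

	qsort(q, n, sizeof (q[0]), demo_io_compare);

	for (size_t i = 0; i < n; i++)
		printf("issue read at offset %llu, size %llu\n",
		    (unsigned long long)q[i].offset,
		    (unsigned long long)q[i].size);
	return (0);
}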
|
58typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, 59 const zbookmark_phys_t *); 60 61static scan_cb_t dsl_scan_scrub_cb;
| 115typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, 116 const zbookmark_phys_t *); 117 118static scan_cb_t dsl_scan_scrub_cb;
|
62static void dsl_scan_cancel_sync(void *, dmu_tx_t *); 63static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *); 64static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *);
| |
65
| 119
|
66unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ 67unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ 68unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
| 120static int scan_ds_queue_compare(const void *a, const void *b); 121static int scan_prefetch_queue_compare(const void *a, const void *b); 122static void scan_ds_queue_clear(dsl_scan_t *scn); 123static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, 124 uint64_t *txg); 125static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); 126static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); 127static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); 128 129extern int zfs_vdev_async_write_active_min_dirty_percent; 130 131/* 132 * By default zfs will check to ensure it is not over the hard memory 133 * limit before each txg. If finer-grained control of this is needed, 134 * this value can be set to 1 to enable checking before scanning each 135 * block. 136 */ 137int zfs_scan_strict_mem_lim = B_FALSE; 138 139/* 140 * Maximum number of concurrently executing I/Os per top-level vdev. 141 * Tune with care. Very high settings (hundreds) are known to trigger 142 * some firmware bugs and resets on certain SSDs. 143 */ 144int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ 145unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver -- 2 is a good number */ 146unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub -- 4 is a good number */
|
69unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ 70
| 147unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ 148
|
71unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
| 149/* 150 * Maximum number of concurrently issued bytes per leaf vdev. We attempt 151 * to strike a balance here between keeping the vdev queues full of I/Os 152 * at all times and not overflowing the queues to cause long latency, 153 * which would cause long txg sync times. No matter what, we will not 154 * overload the drives with I/O, since that is protected by 155 * zfs_vdev_scrub_max_active. 156 */ 157unsigned long zfs_scan_vdev_limit = 4 << 20; 158 159int zfs_scan_issue_strategy = 0; 160int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ 161uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ 162 163unsigned int zfs_scan_checkpoint_intval = 7200; /* seconds */ 164#define ZFS_SCAN_CHECKPOINT_INTVAL SEC_TO_TICK(zfs_scan_checkpoint_intval) 165 166/* 167 * fill_weight is non-tunable at runtime, so we copy it at module init from 168 * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would 169 * break queue sorting. 170 */ 171uint64_t zfs_scan_fill_weight = 3; 172static uint64_t fill_weight; 173 174/* See dsl_scan_should_clear() for details on the memory limit tunables */ 175uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ 176uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ 177int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ 178int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ 179 180unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
|
72unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ 73unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
| 181unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ 182unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
|
74unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver 75 per txg */
| 183unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
|
76boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ 77boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ 78 79SYSCTL_DECL(_vfs_zfs); 80SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, 81 &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev"); 82SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN, 83 &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); 84SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN, 85 &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); 86SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, 87 &zfs_scan_idle, 0, "Idle scan window in clock ticks"); 88SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN,
| 184boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ 185boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ 186 187SYSCTL_DECL(_vfs_zfs); 188SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, 189 &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev"); 190SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN, 191 &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); 192SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN, 193 &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); 194SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, 195 &zfs_scan_idle, 0, "Idle scan window in clock ticks"); 196SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN,
|
89 &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
| 197 &zfs_scrub_min_time_ms, 0, "Min millisecs to scrub per txg");
|
90SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN, 91 &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); 92SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, 93 &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); 94SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN, 95 &zfs_no_scrub_io, 0, "Disable scrub I/O"); 96SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, 97 &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
| 198SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN, 199 &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); 200SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, 201 &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); 202SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN, 203 &zfs_no_scrub_io, 0, "Disable scrub I/O"); 204SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, 205 &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
|
| 206SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_legacy, CTLFLAG_RWTUN, 207 &zfs_scan_legacy, 0, "Scrub using legacy non-sequential method"); 208SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_checkpoint_interval, CTLFLAG_RWTUN, 209 &zfs_scan_checkpoint_intval, 0, "Scan progress on-disk checkpointing interval");
|
98 99enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; 100/* max number of blocks to free in a single TXG */ 101uint64_t zfs_async_block_max_blocks = UINT64_MAX; 102SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, 103 &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG"); 104
| 210 211enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; 212/* max number of blocks to free in a single TXG */ 213uint64_t zfs_async_block_max_blocks = UINT64_MAX; 214SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, 215 &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG"); 216
|
| 217/* 218 * We wait a few txgs after importing a pool to begin scanning so that 219 * the import / mounting code isn't held up by scrub / resilver I/O. 220 * Unfortunately, it is a bit difficult to determine exactly how long 221 * this will take since userspace will trigger fs mounts asynchronously 222 * and the kernel will create zvol minors asynchronously. As a result, 223 * the value provided here is a bit arbitrary, but represents a 224 * reasonable estimate of how many txgs it will take to finish fully 225 * importing a pool. 226 */ 227#define SCAN_IMPORT_WAIT_TXGS 5
|
105
| 228
|
| 229
|
106#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ 107 ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ 108 (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) 109 110extern int zfs_txg_timeout; 111 112/* 113 * Enable/disable the processing of the free_bpobj object. 114 */ 115boolean_t zfs_free_bpobj_enabled = B_TRUE; 116 117SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN, 118 &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing"); 119 120/* the order has to match pool_scan_type */ 121static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { 122 NULL, 123 dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ 124 dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ 125}; 126
| 230#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ 231 ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ 232 (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) 233 234extern int zfs_txg_timeout; 235 236/* 237 * Enable/disable the processing of the free_bpobj object. 238 */ 239boolean_t zfs_free_bpobj_enabled = B_TRUE; 240 241SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN, 242 &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing"); 243 244/* the order has to match pool_scan_type */ 245static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { 246 NULL, 247 dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ 248 dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ 249}; 250
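The table above is indexed directly by the pool_scan_func_t value, which is why its order has to match the enum. A self-contained sketch of the same idiom follows (the demo_* names are hypothetical, not the ZFS definitions):

#include <stdio.h>

typedef enum { DEMO_SCAN_NONE, DEMO_SCAN_SCRUB, DEMO_SCAN_RESILVER,
    DEMO_SCAN_FUNCS } demo_scan_func_t;

typedef void (demo_cb_t)(void);

static void demo_scrub_cb(void) { printf("scrub/resilver callback\n"); }

/* Order must match demo_scan_func_t: entry [i] handles enum value i. */
static demo_cb_t *demo_funcs[DEMO_SCAN_FUNCS] = {
	NULL,			/* DEMO_SCAN_NONE */
	demo_scrub_cb,		/* DEMO_SCAN_SCRUB */
	demo_scrub_cb,		/* DEMO_SCAN_RESILVER */
};

int
main(void)
{
	demo_scan_func_t f = DEMO_SCAN_SCRUB;

	if (f > DEMO_SCAN_NONE && f < DEMO_SCAN_FUNCS)
		demo_funcs[f]();	/* dispatch by enum value */
	return (0);
}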
|
| 251/* In core node for the scn->scn_queue. Represents a dataset to be scanned */ 252typedef struct { 253 uint64_t sds_dsobj; 254 uint64_t sds_txg; 255 avl_node_t sds_node; 256} scan_ds_t; 257 258/* 259 * This controls what conditions are placed on dsl_scan_sync_state(): 260 * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 261 * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. 262 * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise 263 * write out the scn_phys_cached version. 264 * See dsl_scan_sync_state for details. 265 */ 266typedef enum { 267 SYNC_OPTIONAL, 268 SYNC_MANDATORY, 269 SYNC_CACHED 270} state_sync_type_t; 271 272/* 273 * This struct represents the minimum information needed to reconstruct a 274 * zio for sequential scanning. This is useful because many of these will 275 * accumulate in the sequential IO queues before being issued, so saving 276 * memory matters here. 277 */ 278typedef struct scan_io { 279 /* fields from blkptr_t */ 280 uint64_t sio_offset; 281 uint64_t sio_blk_prop; 282 uint64_t sio_phys_birth; 283 uint64_t sio_birth; 284 zio_cksum_t sio_cksum; 285 uint32_t sio_asize; 286 287 /* fields from zio_t */ 288 int sio_flags; 289 zbookmark_phys_t sio_zb; 290 291 /* members for queue sorting */ 292 union { 293 avl_node_t sio_addr_node; /* link into issueing queue */ 294 list_node_t sio_list_node; /* link for issuing to disk */ 295 } sio_nodes; 296} scan_io_t; 297 298struct dsl_scan_io_queue { 299 dsl_scan_t *q_scn; /* associated dsl_scan_t */ 300 vdev_t *q_vd; /* top-level vdev that this queue represents */ 301 302 /* trees used for sorting I/Os and extents of I/Os */ 303 range_tree_t *q_exts_by_addr; 304 avl_tree_t q_exts_by_size; 305 avl_tree_t q_sios_by_addr; 306 307 /* members for zio rate limiting */ 308 uint64_t q_maxinflight_bytes; 309 uint64_t q_inflight_bytes; 310 kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ 311 312 /* per txg statistics */ 313 uint64_t q_total_seg_size_this_txg; 314 uint64_t q_segs_this_txg; 315 uint64_t q_total_zio_size_this_txg; 316 uint64_t q_zios_this_txg; 317}; 318 319/* private data for dsl_scan_prefetch_cb() */ 320typedef struct scan_prefetch_ctx { 321 refcount_t spc_refcnt; /* refcount for memory management */ 322 dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ 323 boolean_t spc_root; /* is this prefetch for an objset? */ 324 uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ 325 uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */ 326} scan_prefetch_ctx_t; 327 328/* private data for dsl_scan_prefetch() */ 329typedef struct scan_prefetch_issue_ctx { 330 avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ 331 scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ 332 blkptr_t spic_bp; /* bp to prefetch */ 333 zbookmark_phys_t spic_zb; /* bookmark to prefetch */ 334} scan_prefetch_issue_ctx_t; 335 336static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, 337 const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); 338static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, 339 scan_io_t *sio); 340 341static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); 342static void scan_io_queues_destroy(dsl_scan_t *scn); 343 344static kmem_cache_t *sio_cache; 345 346void 347scan_init(void) 348{ 349 /* 350 * This is used in ext_size_compare() to weight segments 351 * based on how sparse they are. 
This cannot be changed 352 * mid-scan and the tree comparison functions don't currently 353 * have a mechanism for passing additional context to the 354 * compare functions. Thus we store this value globally and 355 * we only allow it to be set at module initialization time 356 */ 357 fill_weight = zfs_scan_fill_weight; 358 359 sio_cache = kmem_cache_create("sio_cache", 360 sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 361} 362 363void 364scan_fini(void) 365{ 366 kmem_cache_destroy(sio_cache); 367} 368 369static inline boolean_t 370dsl_scan_is_running(const dsl_scan_t *scn) 371{ 372 return (scn->scn_phys.scn_state == DSS_SCANNING); 373} 374 375boolean_t 376dsl_scan_resilvering(dsl_pool_t *dp) 377{ 378 return (dsl_scan_is_running(dp->dp_scan) && 379 dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); 380} 381 382static inline void 383sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) 384{ 385 bzero(bp, sizeof (*bp)); 386 DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); 387 DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); 388 DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); 389 bp->blk_prop = sio->sio_blk_prop; 390 bp->blk_phys_birth = sio->sio_phys_birth; 391 bp->blk_birth = sio->sio_birth; 392 bp->blk_fill = 1; /* we always only work with data pointers */ 393 bp->blk_cksum = sio->sio_cksum; 394} 395 396static inline void 397bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) 398{ 399 /* we discard the vdev id, since we can deduce it from the queue */ 400 sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); 401 sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); 402 sio->sio_blk_prop = bp->blk_prop; 403 sio->sio_phys_birth = bp->blk_phys_birth; 404 sio->sio_birth = bp->blk_birth; 405 sio->sio_cksum = bp->blk_cksum; 406} 407 408void 409dsl_scan_global_init(void) 410{ 411 /* 412 * This is used in ext_size_compare() to weight segments 413 * based on how sparse they are. This cannot be changed 414 * mid-scan and the tree comparison functions don't currently 415 * have a mechanism for passing additional context to the 416 * compare functions. Thus we store this value globally and 417 * we only allow it to be set at module initialization time 418 */ 419 fill_weight = zfs_scan_fill_weight; 420} 421
|
127int 128dsl_scan_init(dsl_pool_t *dp, uint64_t txg) 129{ 130 int err; 131 dsl_scan_t *scn; 132 spa_t *spa = dp->dp_spa; 133 uint64_t f; 134 135 scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); 136 scn->scn_dp = dp; 137 138 /* 139 * It's possible that we're resuming a scan after a reboot so 140 * make sure that the scan_async_destroying flag is initialized 141 * appropriately. 142 */ 143 ASSERT(!scn->scn_async_destroying); 144 scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, 145 SPA_FEATURE_ASYNC_DESTROY); 146
| 422int 423dsl_scan_init(dsl_pool_t *dp, uint64_t txg) 424{ 425 int err; 426 dsl_scan_t *scn; 427 spa_t *spa = dp->dp_spa; 428 uint64_t f; 429 430 scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); 431 scn->scn_dp = dp; 432 433 /* 434 * It's possible that we're resuming a scan after a reboot so 435 * make sure that the scan_async_destroying flag is initialized 436 * appropriately. 437 */ 438 ASSERT(!scn->scn_async_destroying); 439 scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, 440 SPA_FEATURE_ASYNC_DESTROY); 441
|
| 442 bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); 443 avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), 444 offsetof(scan_ds_t, sds_node)); 445 avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, 446 sizeof (scan_prefetch_issue_ctx_t), 447 offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); 448
|
147 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 148 "scrub_func", sizeof (uint64_t), 1, &f); 149 if (err == 0) { 150 /* 151 * There was an old-style scrub in progress. Restart a 152 * new-style scrub from the beginning. 153 */ 154 scn->scn_restart_txg = txg; 155 zfs_dbgmsg("old-style scrub was in progress; " 156 "restarting new-style scrub in txg %llu",
| 449 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 450 "scrub_func", sizeof (uint64_t), 1, &f); 451 if (err == 0) { 452 /* 453 * There was an old-style scrub in progress. Restart a 454 * new-style scrub from the beginning. 455 */ 456 scn->scn_restart_txg = txg; 457 zfs_dbgmsg("old-style scrub was in progress; " 458 "restarting new-style scrub in txg %llu",
|
157 scn->scn_restart_txg);
| 459 (longlong_t)scn->scn_restart_txg);
|
158 159 /* 160 * Load the queue obj from the old location so that it 161 * can be freed by dsl_scan_done(). 162 */ 163 (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 164 "scrub_queue", sizeof (uint64_t), 1, 165 &scn->scn_phys.scn_queue_obj); 166 } else { 167 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 168 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, 169 &scn->scn_phys); 170 if (err == ENOENT) 171 return (0); 172 else if (err) 173 return (err); 174
| 460 461 /* 462 * Load the queue obj from the old location so that it 463 * can be freed by dsl_scan_done(). 464 */ 465 (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 466 "scrub_queue", sizeof (uint64_t), 1, 467 &scn->scn_phys.scn_queue_obj); 468 } else { 469 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 470 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, 471 &scn->scn_phys); 472 if (err == ENOENT) 473 return (0); 474 else if (err) 475 return (err); 476
|
175 if (scn->scn_phys.scn_state == DSS_SCANNING &&
| 477 /* 478 * We might be restarting after a reboot, so jump the issued 479 * counter to how far we've scanned. We know we're consistent 480 * up to here. 481 */ 482 scn->scn_issued_before_pass = scn->scn_phys.scn_examined; 483 484 if (dsl_scan_is_running(scn) &&
|
176 spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { 177 /* 178 * A new-type scrub was in progress on an old 179 * pool, and the pool was accessed by old 180 * software. Restart from the beginning, since 181 * the old software may have changed the pool in 182 * the meantime. 183 */ 184 scn->scn_restart_txg = txg; 185 zfs_dbgmsg("new-style scrub was modified " 186 "by old software; restarting in txg %llu",
| 485 spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { 486 /* 487 * A new-type scrub was in progress on an old 488 * pool, and the pool was accessed by old 489 * software. Restart from the beginning, since 490 * the old software may have changed the pool in 491 * the meantime. 492 */ 493 scn->scn_restart_txg = txg; 494 zfs_dbgmsg("new-style scrub was modified " 495 "by old software; restarting in txg %llu",
|
187 scn->scn_restart_txg);
| 496 (longlong_t)scn->scn_restart_txg);
|
188 } 189 } 190
| 497 } 498 } 499
|
| 500 /* reload the queue into the in-core state */ 501 if (scn->scn_phys.scn_queue_obj != 0) { 502 zap_cursor_t zc; 503 zap_attribute_t za; 504 505 for (zap_cursor_init(&zc, dp->dp_meta_objset, 506 scn->scn_phys.scn_queue_obj); 507 zap_cursor_retrieve(&zc, &za) == 0; 508 (void) zap_cursor_advance(&zc)) { 509 scan_ds_queue_insert(scn, 510 zfs_strtonum(za.za_name, NULL), 511 za.za_first_integer); 512 } 513 zap_cursor_fini(&zc); 514 } 515
|
191 spa_scan_stat_init(spa); 192 return (0); 193} 194 195void 196dsl_scan_fini(dsl_pool_t *dp) 197{
| 516 spa_scan_stat_init(spa); 517 return (0); 518} 519 520void 521dsl_scan_fini(dsl_pool_t *dp) 522{
|
198 if (dp->dp_scan) {
| 523 if (dp->dp_scan != NULL) { 524 dsl_scan_t *scn = dp->dp_scan; 525 526 if (scn->scn_taskq != NULL) 527 taskq_destroy(scn->scn_taskq); 528 scan_ds_queue_clear(scn); 529 avl_destroy(&scn->scn_queue); 530 avl_destroy(&scn->scn_prefetch_queue); 531
|
199 kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); 200 dp->dp_scan = NULL; 201 } 202} 203
| 532 kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); 533 dp->dp_scan = NULL; 534 } 535} 536
|
| 537static boolean_t 538dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) 539{ 540 return (scn->scn_restart_txg != 0 && 541 scn->scn_restart_txg <= tx->tx_txg); 542} 543 544boolean_t 545dsl_scan_scrubbing(const dsl_pool_t *dp) 546{ 547 dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; 548 549 return (scn_phys->scn_state == DSS_SCANNING && 550 scn_phys->scn_func == POOL_SCAN_SCRUB); 551} 552 553boolean_t 554dsl_scan_is_paused_scrub(const dsl_scan_t *scn) 555{ 556 return (dsl_scan_scrubbing(scn->scn_dp) && 557 scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); 558} 559 560/* 561 * Writes out a persistent dsl_scan_phys_t record to the pool directory. 562 * Because we can be running in the block sorting algorithm, we do not always 563 * want to write out the record, only when it is "safe" to do so. This safety 564 * condition is achieved by making sure that the sorting queues are empty 565 * (scn_bytes_pending == 0). When this condition is not true, the sync'd state 566 * is inconsistent with how much actual scanning progress has been made. The 567 * kind of sync to be performed is specified by the sync_type argument. If the 568 * sync is optional, we only sync if the queues are empty. If the sync is 569 * mandatory, we do a hard ASSERT to make sure that the queues are empty. The 570 * third possible state is a "cached" sync. This is done in response to: 571 * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been 572 * destroyed, so we wouldn't be able to restart scanning from it. 573 * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been 574 * superseded by a newer snapshot. 575 * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been 576 * swapped with its clone. 577 * In all cases, a cached sync simply rewrites the last record we've written, 578 * just slightly modified. For the modifications that are performed to the 579 * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, 580 * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. 581 */ 582static void 583dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) 584{ 585 int i; 586 spa_t *spa = scn->scn_dp->dp_spa; 587 588 ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); 589 if (scn->scn_bytes_pending == 0) { 590 for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { 591 vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; 592 dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; 593 594 if (q == NULL) 595 continue; 596 597 mutex_enter(&vd->vdev_scan_io_queue_lock); 598 ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); 599 ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); 600 ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); 601 mutex_exit(&vd->vdev_scan_io_queue_lock); 602 } 603 604 if (scn->scn_phys.scn_queue_obj != 0) 605 scan_ds_queue_sync(scn, tx); 606 VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, 607 DMU_POOL_DIRECTORY_OBJECT, 608 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, 609 &scn->scn_phys, tx)); 610 bcopy(&scn->scn_phys, &scn->scn_phys_cached, 611 sizeof (scn->scn_phys)); 612 613 if (scn->scn_checkpointing) 614 zfs_dbgmsg("finish scan checkpoint"); 615 616 scn->scn_checkpointing = B_FALSE; 617 scn->scn_last_checkpoint = ddi_get_lbolt(); 618 } else if (sync_type == SYNC_CACHED) { 619 VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, 620 DMU_POOL_DIRECTORY_OBJECT, 621 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, 622 &scn->scn_phys_cached, tx)); 623 } 624} 625
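To summarize the three sync types documented above, the following hypothetical helper maps (sync_type, scn_bytes_pending) to which copy of the scan state gets persisted. It is an illustration only; the real dsl_scan_sync_state() above additionally verifies that the per-vdev queues are empty and performs the actual ZAP update.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef enum { SYNC_OPTIONAL, SYNC_MANDATORY, SYNC_CACHED } demo_sync_t;
typedef enum { WRITE_NONE, WRITE_LIVE, WRITE_CACHED } demo_write_t;

/* Which copy of the scan state (if any) is persisted this txg. */
static demo_write_t
demo_sync_choice(demo_sync_t type, uint64_t bytes_pending)
{
	if (type == SYNC_MANDATORY)
		assert(bytes_pending == 0);
	if (bytes_pending == 0)
		return (WRITE_LIVE);	/* queues drained: safe to persist */
	if (type == SYNC_CACHED)
		return (WRITE_CACHED);	/* rewrite the last consistent record */
	return (WRITE_NONE);		/* optional sync while still inconsistent */
}

int
main(void)
{
	printf("%d %d %d\n",
	    demo_sync_choice(SYNC_OPTIONAL, 4096),	/* 0: WRITE_NONE */
	    demo_sync_choice(SYNC_CACHED, 4096),	/* 2: WRITE_CACHED */
	    demo_sync_choice(SYNC_OPTIONAL, 0));	/* 1: WRITE_LIVE */
	return (0);
}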
|
204/* ARGSUSED */ 205static int 206dsl_scan_setup_check(void *arg, dmu_tx_t *tx) 207{ 208 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 209
| 626/* ARGSUSED */ 627static int 628dsl_scan_setup_check(void *arg, dmu_tx_t *tx) 629{ 630 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 631
|
210 if (scn->scn_phys.scn_state == DSS_SCANNING)
| 632 if (dsl_scan_is_running(scn))
|
211 return (SET_ERROR(EBUSY)); 212 213 return (0); 214} 215 216static void 217dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) 218{ 219 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 220 pool_scan_func_t *funcp = arg; 221 dmu_object_type_t ot = 0; 222 dsl_pool_t *dp = scn->scn_dp; 223 spa_t *spa = dp->dp_spa; 224
| 633 return (SET_ERROR(EBUSY)); 634 635 return (0); 636} 637 638static void 639dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) 640{ 641 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 642 pool_scan_func_t *funcp = arg; 643 dmu_object_type_t ot = 0; 644 dsl_pool_t *dp = scn->scn_dp; 645 spa_t *spa = dp->dp_spa; 646
|
225 ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
| 647 ASSERT(!dsl_scan_is_running(scn));
|
226 ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); 227 bzero(&scn->scn_phys, sizeof (scn->scn_phys)); 228 scn->scn_phys.scn_func = *funcp; 229 scn->scn_phys.scn_state = DSS_SCANNING; 230 scn->scn_phys.scn_min_txg = 0; 231 scn->scn_phys.scn_max_txg = tx->tx_txg; 232 scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ 233 scn->scn_phys.scn_start_time = gethrestime_sec(); 234 scn->scn_phys.scn_errors = 0; 235 scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
| 648 ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); 649 bzero(&scn->scn_phys, sizeof (scn->scn_phys)); 650 scn->scn_phys.scn_func = *funcp; 651 scn->scn_phys.scn_state = DSS_SCANNING; 652 scn->scn_phys.scn_min_txg = 0; 653 scn->scn_phys.scn_max_txg = tx->tx_txg; 654 scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ 655 scn->scn_phys.scn_start_time = gethrestime_sec(); 656 scn->scn_phys.scn_errors = 0; 657 scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
|
| 658 scn->scn_issued_before_pass = 0;
|
236 scn->scn_restart_txg = 0; 237 scn->scn_done_txg = 0;
| 659 scn->scn_restart_txg = 0; 660 scn->scn_done_txg = 0;
|
| 661 scn->scn_last_checkpoint = 0; 662 scn->scn_checkpointing = B_FALSE;
|
238 spa_scan_stat_init(spa); 239 240 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { 241 scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; 242 243 /* rewrite all disk labels */ 244 vdev_config_dirty(spa->spa_root_vdev); 245 246 if (vdev_resilver_needed(spa->spa_root_vdev, 247 &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { 248 spa_event_notify(spa, NULL, NULL, 249 ESC_ZFS_RESILVER_START); 250 } else { 251 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); 252 } 253 254 spa->spa_scrub_started = B_TRUE; 255 /* 256 * If this is an incremental scrub, limit the DDT scrub phase 257 * to just the auto-ditto class (for correctness); the rest 258 * of the scrub should go faster using top-down pruning. 259 */ 260 if (scn->scn_phys.scn_min_txg > TXG_INITIAL) 261 scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; 262 263 } 264 265 /* back to the generic stuff */ 266 267 if (dp->dp_blkstats == NULL) { 268 dp->dp_blkstats = 269 kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
| 663 spa_scan_stat_init(spa); 664 665 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { 666 scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; 667 668 /* rewrite all disk labels */ 669 vdev_config_dirty(spa->spa_root_vdev); 670 671 if (vdev_resilver_needed(spa->spa_root_vdev, 672 &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { 673 spa_event_notify(spa, NULL, NULL, 674 ESC_ZFS_RESILVER_START); 675 } else { 676 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); 677 } 678 679 spa->spa_scrub_started = B_TRUE; 680 /* 681 * If this is an incremental scrub, limit the DDT scrub phase 682 * to just the auto-ditto class (for correctness); the rest 683 * of the scrub should go faster using top-down pruning. 684 */ 685 if (scn->scn_phys.scn_min_txg > TXG_INITIAL) 686 scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; 687 688 } 689 690 /* back to the generic stuff */ 691 692 if (dp->dp_blkstats == NULL) { 693 dp->dp_blkstats = 694 kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
|
| 695 mutex_init(&dp->dp_blkstats->zab_lock, NULL, 696 MUTEX_DEFAULT, NULL);
|
270 }
| 697 }
|
271 bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
| 698 bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type));
|
272 273 if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) 274 ot = DMU_OT_ZAP_OTHER; 275 276 scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, 277 ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); 278
| 699 700 if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) 701 ot = DMU_OT_ZAP_OTHER; 702 703 scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, 704 ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); 705
|
279 dsl_scan_sync_state(scn, tx);
| 706 bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
|
280
| 707
|
| 708 dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); 709
|
281 spa_history_log_internal(spa, "scan setup", tx, 282 "func=%u mintxg=%llu maxtxg=%llu", 283 *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); 284} 285
| 710 spa_history_log_internal(spa, "scan setup", tx, 711 "func=%u mintxg=%llu maxtxg=%llu", 712 *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); 713} 714
|
| 715/* 716 * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. 717 * Can also be called to resume a paused scrub. 718 */ 719int 720dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) 721{ 722 spa_t *spa = dp->dp_spa; 723 dsl_scan_t *scn = dp->dp_scan; 724 725 /* 726 * Purge all vdev caches and probe all devices. We do this here 727 * rather than in sync context because this requires a writer lock 728 * on the spa_config lock, which we can't do from sync context. The 729 * spa_scrub_reopen flag indicates that vdev_open() should not 730 * attempt to start another scrub. 731 */ 732 spa_vdev_state_enter(spa, SCL_NONE); 733 spa->spa_scrub_reopen = B_TRUE; 734 vdev_reopen(spa->spa_root_vdev); 735 spa->spa_scrub_reopen = B_FALSE; 736 (void) spa_vdev_state_exit(spa, NULL, 0); 737 738 if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { 739 /* got scrub start cmd, resume paused scrub */ 740 int err = dsl_scrub_set_pause_resume(scn->scn_dp, 741 POOL_SCRUB_NORMAL); 742 if (err == 0) { 743 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); 744 return (ECANCELED); 745 } 746 return (SET_ERROR(err)); 747 } 748 749 return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, 750 dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); 751} 752
|
286/* ARGSUSED */ 287static void 288dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) 289{ 290 static const char *old_names[] = { 291 "scrub_bookmark", 292 "scrub_ddt_bookmark", 293 "scrub_ddt_class_max", 294 "scrub_queue", 295 "scrub_min_txg", 296 "scrub_max_txg", 297 "scrub_func", 298 "scrub_errors", 299 NULL 300 }; 301 302 dsl_pool_t *dp = scn->scn_dp; 303 spa_t *spa = dp->dp_spa; 304 int i; 305 306 /* Remove any remnants of an old-style scrub. */ 307 for (i = 0; old_names[i]; i++) { 308 (void) zap_remove(dp->dp_meta_objset, 309 DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); 310 } 311 312 if (scn->scn_phys.scn_queue_obj != 0) {
| 753/* ARGSUSED */ 754static void 755dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) 756{ 757 static const char *old_names[] = { 758 "scrub_bookmark", 759 "scrub_ddt_bookmark", 760 "scrub_ddt_class_max", 761 "scrub_queue", 762 "scrub_min_txg", 763 "scrub_max_txg", 764 "scrub_func", 765 "scrub_errors", 766 NULL 767 }; 768 769 dsl_pool_t *dp = scn->scn_dp; 770 spa_t *spa = dp->dp_spa; 771 int i; 772 773 /* Remove any remnants of an old-style scrub. */ 774 for (i = 0; old_names[i]; i++) { 775 (void) zap_remove(dp->dp_meta_objset, 776 DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); 777 } 778 779 if (scn->scn_phys.scn_queue_obj != 0) {
|
313 VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
| 780 VERIFY0(dmu_object_free(dp->dp_meta_objset,
|
314 scn->scn_phys.scn_queue_obj, tx)); 315 scn->scn_phys.scn_queue_obj = 0; 316 }
| 781 scn->scn_phys.scn_queue_obj, tx)); 782 scn->scn_phys.scn_queue_obj = 0; 783 }
|
| 784 scan_ds_queue_clear(scn);
|
317 318 scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; 319 320 /* 321 * If we were "restarted" from a stopped state, don't bother 322 * with anything else. 323 */
| 785 786 scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; 787 788 /* 789 * If we were "restarted" from a stopped state, don't bother 790 * with anything else. 791 */
|
324 if (scn->scn_phys.scn_state != DSS_SCANNING)
| 792 if (!dsl_scan_is_running(scn)) { 793 ASSERT(!scn->scn_is_sorted);
|
325 return;
| 794 return;
|
| 795 }
|
326
| 796
|
327 if (complete) 328 scn->scn_phys.scn_state = DSS_FINISHED; 329 else 330 scn->scn_phys.scn_state = DSS_CANCELED;
| 797 if (scn->scn_is_sorted) { 798 scan_io_queues_destroy(scn); 799 scn->scn_is_sorted = B_FALSE;
|
331
| 800
|
| 801 if (scn->scn_taskq != NULL) { 802 taskq_destroy(scn->scn_taskq); 803 scn->scn_taskq = NULL; 804 } 805 } 806 807 scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED; 808
|
332 if (dsl_scan_restarting(scn, tx)) 333 spa_history_log_internal(spa, "scan aborted, restarting", tx, 334 "errors=%llu", spa_get_errlog_size(spa)); 335 else if (!complete) 336 spa_history_log_internal(spa, "scan cancelled", tx, 337 "errors=%llu", spa_get_errlog_size(spa)); 338 else 339 spa_history_log_internal(spa, "scan done", tx, 340 "errors=%llu", spa_get_errlog_size(spa)); 341 342 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
| 809 if (dsl_scan_restarting(scn, tx)) 810 spa_history_log_internal(spa, "scan aborted, restarting", tx, 811 "errors=%llu", spa_get_errlog_size(spa)); 812 else if (!complete) 813 spa_history_log_internal(spa, "scan cancelled", tx, 814 "errors=%llu", spa_get_errlog_size(spa)); 815 else 816 spa_history_log_internal(spa, "scan done", tx, 817 "errors=%llu", spa_get_errlog_size(spa)); 818 819 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
|
343 mutex_enter(&spa->spa_scrub_lock); 344 while (spa->spa_scrub_inflight > 0) { 345 cv_wait(&spa->spa_scrub_io_cv, 346 &spa->spa_scrub_lock); 347 } 348 mutex_exit(&spa->spa_scrub_lock);
| |
349 spa->spa_scrub_started = B_FALSE; 350 spa->spa_scrub_active = B_FALSE; 351 352 /* 353 * If the scrub/resilver completed, update all DTLs to 354 * reflect this. Whether it succeeded or not, vacate 355 * all temporary scrub DTLs. 356 * 357 * As the scrub does not currently support traversing 358 * data that have been freed but are part of a checkpoint, 359 * we don't mark the scrub as done in the DTLs as faults 360 * may still exist in those vdevs. 361 */ 362 if (complete && 363 !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 364 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 365 scn->scn_phys.scn_max_txg, B_TRUE); 366 367 spa_event_notify(spa, NULL, NULL, 368 scn->scn_phys.scn_min_txg ? 369 ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); 370 } else { 371 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 372 0, B_TRUE); 373 } 374 spa_errlog_rotate(spa); 375 376 /* 377 * We may have finished replacing a device. 378 * Let the async thread assess this and handle the detach. 379 */ 380 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 381 } 382 383 scn->scn_phys.scn_end_time = gethrestime_sec();
| 820 spa->spa_scrub_started = B_FALSE; 821 spa->spa_scrub_active = B_FALSE; 822 823 /* 824 * If the scrub/resilver completed, update all DTLs to 825 * reflect this. Whether it succeeded or not, vacate 826 * all temporary scrub DTLs. 827 * 828 * As the scrub does not currently support traversing 829 * data that have been freed but are part of a checkpoint, 830 * we don't mark the scrub as done in the DTLs as faults 831 * may still exist in those vdevs. 832 */ 833 if (complete && 834 !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 835 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 836 scn->scn_phys.scn_max_txg, B_TRUE); 837 838 spa_event_notify(spa, NULL, NULL, 839 scn->scn_phys.scn_min_txg ? 840 ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); 841 } else { 842 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 843 0, B_TRUE); 844 } 845 spa_errlog_rotate(spa); 846 847 /* 848 * We may have finished replacing a device. 849 * Let the async thread assess this and handle the detach. 850 */ 851 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 852 } 853 854 scn->scn_phys.scn_end_time = gethrestime_sec();
|
| 855 856 ASSERT(!dsl_scan_is_running(scn));
|
384} 385 386/* ARGSUSED */ 387static int 388dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) 389{ 390 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 391
| 857} 858 859/* ARGSUSED */ 860static int 861dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) 862{ 863 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 864
|
392 if (scn->scn_phys.scn_state != DSS_SCANNING)
| 865 if (!dsl_scan_is_running(scn))
|
393 return (SET_ERROR(ENOENT)); 394 return (0); 395} 396 397/* ARGSUSED */ 398static void 399dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) 400{ 401 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 402 403 dsl_scan_done(scn, B_FALSE, tx);
| 866 return (SET_ERROR(ENOENT)); 867 return (0); 868} 869 870/* ARGSUSED */ 871static void 872dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) 873{ 874 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 875 876 dsl_scan_done(scn, B_FALSE, tx);
|
404 dsl_scan_sync_state(scn, tx);
| 877 dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
|
405 spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); 406} 407 408int 409dsl_scan_cancel(dsl_pool_t *dp) 410{ 411 return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, 412 dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); 413} 414
| 878 spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); 879} 880 881int 882dsl_scan_cancel(dsl_pool_t *dp) 883{ 884 return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, 885 dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); 886} 887
|
415boolean_t 416dsl_scan_is_paused_scrub(const dsl_scan_t *scn) 417{ 418 if (dsl_scan_scrubbing(scn->scn_dp) && 419 scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED) 420 return (B_TRUE); 421 422 return (B_FALSE); 423} 424
| |
425static int 426dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) 427{ 428 pool_scrub_cmd_t *cmd = arg; 429 dsl_pool_t *dp = dmu_tx_pool(tx); 430 dsl_scan_t *scn = dp->dp_scan; 431 432 if (*cmd == POOL_SCRUB_PAUSE) { 433 /* can't pause a scrub when there is no in-progress scrub */ 434 if (!dsl_scan_scrubbing(dp)) 435 return (SET_ERROR(ENOENT)); 436 437 /* can't pause a paused scrub */ 438 if (dsl_scan_is_paused_scrub(scn)) 439 return (SET_ERROR(EBUSY)); 440 } else if (*cmd != POOL_SCRUB_NORMAL) { 441 return (SET_ERROR(ENOTSUP)); 442 } 443 444 return (0); 445} 446 447static void 448dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) 449{ 450 pool_scrub_cmd_t *cmd = arg; 451 dsl_pool_t *dp = dmu_tx_pool(tx); 452 spa_t *spa = dp->dp_spa; 453 dsl_scan_t *scn = dp->dp_scan; 454 455 if (*cmd == POOL_SCRUB_PAUSE) { 456 /* can't pause a scrub when there is no in-progress scrub */ 457 spa->spa_scan_pass_scrub_pause = gethrestime_sec(); 458 scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
| 888static int 889dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) 890{ 891 pool_scrub_cmd_t *cmd = arg; 892 dsl_pool_t *dp = dmu_tx_pool(tx); 893 dsl_scan_t *scn = dp->dp_scan; 894 895 if (*cmd == POOL_SCRUB_PAUSE) { 896 /* can't pause a scrub when there is no in-progress scrub */ 897 if (!dsl_scan_scrubbing(dp)) 898 return (SET_ERROR(ENOENT)); 899 900 /* can't pause a paused scrub */ 901 if (dsl_scan_is_paused_scrub(scn)) 902 return (SET_ERROR(EBUSY)); 903 } else if (*cmd != POOL_SCRUB_NORMAL) { 904 return (SET_ERROR(ENOTSUP)); 905 } 906 907 return (0); 908} 909 910static void 911dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) 912{ 913 pool_scrub_cmd_t *cmd = arg; 914 dsl_pool_t *dp = dmu_tx_pool(tx); 915 spa_t *spa = dp->dp_spa; 916 dsl_scan_t *scn = dp->dp_scan; 917 918 if (*cmd == POOL_SCRUB_PAUSE) { 919 /* can't pause a scrub when there is no in-progress scrub */ 920 spa->spa_scan_pass_scrub_pause = gethrestime_sec(); 921 scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
|
459 dsl_scan_sync_state(scn, tx);
| 922 dsl_scan_sync_state(scn, tx, SYNC_CACHED);
|
460 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); 461 } else { 462 ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); 463 if (dsl_scan_is_paused_scrub(scn)) { 464 /* 465 * We need to keep track of how much time we spend 466 * paused per pass so that we can adjust the scrub rate 467 * shown in the output of 'zpool status' 468 */ 469 spa->spa_scan_pass_scrub_spent_paused += 470 gethrestime_sec() - spa->spa_scan_pass_scrub_pause; 471 spa->spa_scan_pass_scrub_pause = 0; 472 scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
| 923 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); 924 } else { 925 ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); 926 if (dsl_scan_is_paused_scrub(scn)) { 927 /* 928 * We need to keep track of how much time we spend 929 * paused per pass so that we can adjust the scrub rate 930 * shown in the output of 'zpool status' 931 */ 932 spa->spa_scan_pass_scrub_spent_paused += 933 gethrestime_sec() - spa->spa_scan_pass_scrub_pause; 934 spa->spa_scan_pass_scrub_pause = 0; 935 scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
|
473 dsl_scan_sync_state(scn, tx);
| 936 dsl_scan_sync_state(scn, tx, SYNC_CACHED);
|
474 } 475 } 476} 477 478/* 479 * Set scrub pause/resume state if it makes sense to do so 480 */ 481int 482dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) 483{ 484 return (dsl_sync_task(spa_name(dp->dp_spa), 485 dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, 486 ZFS_SPACE_CHECK_RESERVED)); 487} 488
| 937 } 938 } 939} 940 941/* 942 * Set scrub pause/resume state if it makes sense to do so 943 */ 944int 945dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) 946{ 947 return (dsl_sync_task(spa_name(dp->dp_spa), 948 dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, 949 ZFS_SPACE_CHECK_RESERVED)); 950} 951
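As a worked example of the pause accounting above (illustrative numbers only): the time spent paused is subtracted from wall-clock time so that the scrub rate reported by 'zpool status' reflects only the time the scan was actually running.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Assumed pass statistics; all values are illustrative. */
	uint64_t examined = 600ULL << 30;	/* 600 GiB examined so far */
	uint64_t pass_start = 1000;		/* seconds, as from gethrestime_sec() */
	uint64_t now = 11000;			/* 10000 s of wall clock elapsed */
	uint64_t spent_paused = 4000;		/* of which 4000 s were paused */

	/* Counting paused time makes the rate look misleadingly low. */
	uint64_t naive = examined / (now - pass_start);
	/* Rate over the time the scan was actually running. */
	uint64_t effective = examined / (now - pass_start - spent_paused);

	printf("naive rate:     %llu MiB/s\n", (unsigned long long)(naive >> 20));
	printf("effective rate: %llu MiB/s\n", (unsigned long long)(effective >> 20));
	return (0);
}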
|
489boolean_t 490dsl_scan_scrubbing(const dsl_pool_t *dp)
| 952 953/* start a new scan, or restart an existing one. */ 954void 955dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
|
491{
| 956{
|
492 dsl_scan_t *scn = dp->dp_scan;
| 957 if (txg == 0) { 958 dmu_tx_t *tx; 959 tx = dmu_tx_create_dd(dp->dp_mos_dir); 960 VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
|
493
| 961
|
494 if (scn->scn_phys.scn_state == DSS_SCANNING && 495 scn->scn_phys.scn_func == POOL_SCAN_SCRUB) 496 return (B_TRUE); 497 498 return (B_FALSE);
| 962 txg = dmu_tx_get_txg(tx); 963 dp->dp_scan->scn_restart_txg = txg; 964 dmu_tx_commit(tx); 965 } else { 966 dp->dp_scan->scn_restart_txg = txg; 967 } 968 zfs_dbgmsg("restarting resilver txg=%llu", txg);
|
499} 500
| 969} 970
|
501static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, 502 dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, 503 dmu_objset_type_t ostype, dmu_tx_t *tx); 504static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, 505 dmu_objset_type_t ostype, 506 dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); 507
| |
508void 509dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) 510{ 511 zio_free(dp->dp_spa, txg, bp); 512} 513 514void 515dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) 516{ 517 ASSERT(dsl_pool_sync_context(dp)); 518 zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), 519 pio->io_flags)); 520} 521
| 971void 972dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) 973{ 974 zio_free(dp->dp_spa, txg, bp); 975} 976 977void 978dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) 979{ 980 ASSERT(dsl_pool_sync_context(dp)); 981 zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), 982 pio->io_flags)); 983} 984
|
522static uint64_t 523dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
| 985static int 986scan_ds_queue_compare(const void *a, const void *b)
|
524{
| 987{
|
525 uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; 526 if (ds->ds_is_snapshot) 527 return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); 528 return (smt);
| 988 const scan_ds_t *sds_a = a, *sds_b = b; 989 990 if (sds_a->sds_dsobj < sds_b->sds_dsobj) 991 return (-1); 992 if (sds_a->sds_dsobj == sds_b->sds_dsobj) 993 return (0); 994 return (1);
|
529} 530 531static void
| 995} 996 997static void
|
532dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
| 998scan_ds_queue_clear(dsl_scan_t *scn)
|
533{
| 999{
|
534 VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, 535 DMU_POOL_DIRECTORY_OBJECT, 536 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, 537 &scn->scn_phys, tx));
| 1000 void *cookie = NULL; 1001 scan_ds_t *sds; 1002 while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { 1003 kmem_free(sds, sizeof (*sds)); 1004 }
|
538} 539
| 1005} 1006
|
540extern int zfs_vdev_async_write_active_min_dirty_percent;
| 1007static boolean_t 1008scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) 1009{ 1010 scan_ds_t srch, *sds;
|
541
| 1011
|
| 1012 srch.sds_dsobj = dsobj; 1013 sds = avl_find(&scn->scn_queue, &srch, NULL); 1014 if (sds != NULL && txg != NULL) 1015 *txg = sds->sds_txg; 1016 return (sds != NULL); 1017} 1018 1019static void 1020scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) 1021{ 1022 scan_ds_t *sds; 1023 avl_index_t where; 1024 1025 sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); 1026 sds->sds_dsobj = dsobj; 1027 sds->sds_txg = txg; 1028 1029 VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); 1030 avl_insert(&scn->scn_queue, sds, where); 1031} 1032 1033static void 1034scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) 1035{ 1036 scan_ds_t srch, *sds; 1037 1038 srch.sds_dsobj = dsobj; 1039 1040 sds = avl_find(&scn->scn_queue, &srch, NULL); 1041 VERIFY(sds != NULL); 1042 avl_remove(&scn->scn_queue, sds); 1043 kmem_free(sds, sizeof (*sds)); 1044} 1045 1046static void 1047scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) 1048{ 1049 dsl_pool_t *dp = scn->scn_dp; 1050 spa_t *spa = dp->dp_spa; 1051 dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? 1052 DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; 1053 1054 ASSERT0(scn->scn_bytes_pending); 1055 ASSERT(scn->scn_phys.scn_queue_obj != 0); 1056 1057 VERIFY0(dmu_object_free(dp->dp_meta_objset, 1058 scn->scn_phys.scn_queue_obj, tx)); 1059 scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, 1060 DMU_OT_NONE, 0, tx); 1061 for (scan_ds_t *sds = avl_first(&scn->scn_queue); 1062 sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { 1063 VERIFY0(zap_add_int_key(dp->dp_meta_objset, 1064 scn->scn_phys.scn_queue_obj, sds->sds_dsobj, 1065 sds->sds_txg, tx)); 1066 } 1067} 1068 1069/* 1070 * Computes the memory limit state that we're currently in. A sorted scan 1071 * needs quite a bit of memory to hold the sorting queue, so we need to 1072 * reasonably constrain the size so it doesn't impact overall system 1073 * performance. We compute two limits: 1074 * 1) Hard memory limit: if the amount of memory used by the sorting 1075 * queues on a pool gets above this value, we stop the metadata 1076 * scanning portion and start issuing the queued up and sorted 1077 * I/Os to reduce memory usage. 1078 * This limit is calculated as a fraction of physmem (by default 5%). 1079 * We constrain the lower bound of the hard limit to an absolute 1080 * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain 1081 * the upper bound to 5% of the total pool size - no chance we'll 1082 * ever need that much memory, but just to keep the value in check. 1083 * 2) Soft memory limit: once we hit the hard memory limit, we start 1084 * issuing I/O to reduce queue memory usage, but we don't want to 1085 * completely empty out the queues, since we might be able to find I/Os 1086 * that will fill in the gaps of our non-sequential IOs at some point 1087 * in the future. So we stop the issuing of I/Os once the amount of 1088 * memory used drops below the soft limit (at which point we stop issuing 1089 * I/O and start scanning metadata again). 1090 * 1091 * This limit is calculated by subtracting a fraction of the hard 1092 * limit from the hard limit. By default this fraction is 5%, so 1093 * the soft limit is 95% of the hard limit. 
We cap the size of the 1094 * difference between the hard and soft limits at an absolute 1095 * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is 1096 * sufficient to not cause too frequent switching between the 1097 * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's 1098 * worth of queues is about 1.2 GiB of on-pool data, so scanning 1099 * that should take at least a decent fraction of a second). 1100 */
|
542static boolean_t
| 1101static boolean_t
|
| 1102dsl_scan_should_clear(dsl_scan_t *scn) 1103{ 1104 vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; 1105 uint64_t mlim_hard, mlim_soft, mused; 1106 uint64_t alloc = metaslab_class_get_alloc(spa_normal_class( 1107 scn->scn_dp->dp_spa)); 1108 1109 mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, 1110 zfs_scan_mem_lim_min); 1111 mlim_hard = MIN(mlim_hard, alloc / 20); 1112 mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, 1113 zfs_scan_mem_lim_soft_max); 1114 mused = 0; 1115 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 1116 vdev_t *tvd = rvd->vdev_child[i]; 1117 dsl_scan_io_queue_t *queue; 1118 1119 mutex_enter(&tvd->vdev_scan_io_queue_lock); 1120 queue = tvd->vdev_scan_io_queue; 1121 if (queue != NULL) { 1122 /* #extents in exts_by_size = # in exts_by_addr */ 1123 mused += avl_numnodes(&queue->q_exts_by_size) * 1124 sizeof (range_seg_t) + 1125 avl_numnodes(&queue->q_sios_by_addr) * 1126 sizeof (scan_io_t); 1127 } 1128 mutex_exit(&tvd->vdev_scan_io_queue_lock); 1129 } 1130 1131 dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); 1132 1133 if (mused == 0) 1134 ASSERT0(scn->scn_bytes_pending); 1135 1136 /* 1137 * If we are above our hard limit, we need to clear out memory. 1138 * If we are below our soft limit, we need to accumulate sequential IOs. 1139 * Otherwise, we should keep doing whatever we are currently doing. 1140 */ 1141 if (mused >= mlim_hard) 1142 return (B_TRUE); 1143 else if (mused < mlim_soft) 1144 return (B_FALSE); 1145 else 1146 return (scn->scn_clearing); 1147} 1148 1149static boolean_t
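A worked example of the hard and soft limits computed above, as a standalone sketch. The 16 GiB of RAM, 4 KiB page size, 2 TiB of allocated pool space, and the local MiB/MAX/MIN helpers are assumptions; the tunable defaults come from the declarations earlier in this file.

#include <stdio.h>
#include <stdint.h>

#define	MiB(x)		((uint64_t)(x) << 20)
#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	/* Assumed machine/pool: 16 GiB RAM, 4 KiB pages, 2 TiB allocated. */
	uint64_t pagesize = 4096;
	uint64_t physmem = MiB(16 * 1024) / pagesize;		/* in pages */
	uint64_t alloc = MiB((uint64_t)2 * 1024 * 1024);	/* in bytes */

	/* Defaults from the tunables declared earlier in this file. */
	uint64_t mem_lim_min = MiB(16);
	uint64_t mem_lim_soft_max = MiB(128);
	uint64_t mem_lim_fact = 20;		/* hard limit: 1/20 of physmem */
	uint64_t mem_lim_soft_fact = 20;	/* soft gap: 1/20 of hard limit */

	uint64_t mlim_hard = MAX((physmem / mem_lim_fact) * pagesize,
	    mem_lim_min);
	mlim_hard = MIN(mlim_hard, alloc / 20);
	uint64_t mlim_soft = mlim_hard -
	    MIN(mlim_hard / mem_lim_soft_fact, mem_lim_soft_max);

	/* Prints roughly 819 MiB hard and 778 MiB soft for these inputs. */
	printf("hard limit: %llu MiB\n", (unsigned long long)(mlim_hard >> 20));
	printf("soft limit: %llu MiB\n", (unsigned long long)(mlim_soft >> 20));
	return (0);
}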
| 1150dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) 1151{ 1152 /* we never skip user/group accounting objects */ 1153 if (zb && (int64_t)zb->zb_object < 0) 1154 return (B_FALSE); 1155 1156 if (scn->scn_suspending) 1157 return (B_TRUE); /* we're already suspending */ 1158 1159 if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) 1160 return (B_FALSE); /* we're resuming */ 1161 1162 /* We only know how to resume from level-0 blocks. */ 1163 if (zb && zb->zb_level != 0) 1164 return (B_FALSE); 1165 1166 /* 1167 * We suspend if:
| 1168 * - we have scanned for at least the minimum time (default 1 sec 1169 * for scrub, 3 sec for resilver), and either we have sufficient 1170 * dirty data that we are starting to write more quickly 1171 * (default 30%), or someone is explicitly waiting for this txg 1172 * to complete. 1173 * or 1174 * - the spa is shutting down because this pool is being exported 1175 * or the machine is rebooting.
| 1176 * or 1177 * - the scan queue has reached its memory use limit
| 1178 */
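	/*
	 * A concrete (purely illustrative) reading of the rules above: a
	 * scrub that has already scanned for 1.4 seconds in this txg has
	 * passed its 1 second minimum, so it suspends as soon as dirty data
	 * crosses the 30% threshold, someone is waiting on the txg, or the
	 * txg sync has been running longer than zfs_txg_timeout. A resilver
	 * at the same 1.4 second mark is still under its 3 second minimum
	 * and keeps scanning, unless the pool is shutting down or the
	 * strict memory limit check fires.
	 */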
| 1179 uint64_t elapsed_nanosecs = gethrtime(); 1180 uint64_t curr_time_ns = gethrtime(); 1181 uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; 1182 uint64_t sync_time_ns = curr_time_ns - 1183 scn->scn_dp->dp_spa->spa_sync_starttime; 1184
| 1185 int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
| 1186 int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 1187 zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; 1188 1189 if ((NSEC2MSEC(scan_time_ns) > mintime && 1190 (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || 1191 txg_sync_waiting(scn->scn_dp) || 1192 NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || 1193 spa_shutting_down(scn->scn_dp->dp_spa) || 1194 (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
| 1195 if (zb) { 1196 dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", 1197 (longlong_t)zb->zb_objset, 1198 (longlong_t)zb->zb_object, 1199 (longlong_t)zb->zb_level, 1200 (longlong_t)zb->zb_blkid); 1201 scn->scn_phys.scn_bookmark = *zb;
| 1202 } else { 1203 dsl_scan_phys_t *scnp = &scn->scn_phys; 1204 1205 dprintf("suspending at at DDT bookmark " 1206 "%llx/%llx/%llx/%llx\n", 1207 (longlong_t)scnp->scn_ddt_bookmark.ddb_class, 1208 (longlong_t)scnp->scn_ddt_bookmark.ddb_type, 1209 (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, 1210 (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
| 1211 }
| 1212 scn->scn_suspending = B_TRUE; 1213 return (B_TRUE); 1214 } 1215 return (B_FALSE); 1216} 1217 1218typedef struct zil_scan_arg { 1219 dsl_pool_t *zsa_dp; 1220 zil_header_t *zsa_zh; 1221} zil_scan_arg_t; 1222 1223/* ARGSUSED */ 1224static int 1225dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 1226{ 1227 zil_scan_arg_t *zsa = arg; 1228 dsl_pool_t *dp = zsa->zsa_dp; 1229 dsl_scan_t *scn = dp->dp_scan; 1230 zil_header_t *zh = zsa->zsa_zh; 1231 zbookmark_phys_t zb; 1232 1233 if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) 1234 return (0); 1235 1236 /* 1237 * One block ("stubby") can be allocated a long time ago; we 1238 * want to visit that one because it has been allocated 1239 * (on-disk) even if it hasn't been claimed (even though for 1240 * scrub there's nothing to do to it). 1241 */ 1242 if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) 1243 return (0); 1244 1245 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 1246 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 1247 1248 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); 1249 return (0); 1250} 1251 1252/* ARGSUSED */ 1253static int 1254dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) 1255{ 1256 if (lrc->lrc_txtype == TX_WRITE) { 1257 zil_scan_arg_t *zsa = arg; 1258 dsl_pool_t *dp = zsa->zsa_dp; 1259 dsl_scan_t *scn = dp->dp_scan; 1260 zil_header_t *zh = zsa->zsa_zh; 1261 lr_write_t *lr = (lr_write_t *)lrc; 1262 blkptr_t *bp = &lr->lr_blkptr; 1263 zbookmark_phys_t zb; 1264 1265 if (BP_IS_HOLE(bp) || 1266 bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) 1267 return (0); 1268 1269 /* 1270 * birth can be < claim_txg if this record's txg is 1271 * already txg sync'ed (but this log block contains 1272 * other records that are not synced) 1273 */ 1274 if (claim_txg == 0 || bp->blk_birth < claim_txg) 1275 return (0); 1276 1277 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 1278 lr->lr_foid, ZB_ZIL_LEVEL, 1279 lr->lr_offset / BP_GET_LSIZE(bp)); 1280 1281 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); 1282 } 1283 return (0); 1284} 1285 1286static void 1287dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) 1288{ 1289 uint64_t claim_txg = zh->zh_claim_txg; 1290 zil_scan_arg_t zsa = { dp, zh }; 1291 zilog_t *zilog; 1292 1293 ASSERT(spa_writeable(dp->dp_spa)); 1294 1295 /* 1296 * We only want to visit blocks that have been claimed 1297 * but not yet replayed. 1298 */ 1299 if (claim_txg == 0) 1300 return; 1301 1302 zilog = zil_alloc(dp->dp_meta_objset, zh); 1303 1304 (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, 1305 claim_txg); 1306 1307 zil_free(zilog); 1308} 1309
| 1310/* 1311 * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea 1312 * here is to sort the AVL tree by the order each block will be needed. 1313 */ 1314static int 1315scan_prefetch_queue_compare(const void *a, const void *b) 1316{ 1317 const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; 1318 const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; 1319 const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; 1320 1321 return (zbookmark_compare(spc_a->spc_datablkszsec, 1322 spc_a->spc_indblkshift, spc_b->spc_datablkszsec, 1323 spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); 1324} 1325
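/*
 * For example, in a hypothetical object whose indirect blocks each hold 128
 * block pointers, the level-1 block with blkid 1 covers level-0 blkids
 * 128-255, so it sorts after the level-0 block at blkid 5 but before the
 * level-0 block at blkid 200; prefetches are therefore issued in roughly
 * the order the main traversal will consume the blocks.
 */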
| 1326static void
| 1327scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
| 1328{
| 1329 if (refcount_remove(&spc->spc_refcnt, tag) == 0) { 1330 refcount_destroy(&spc->spc_refcnt); 1331 kmem_free(spc, sizeof (scan_prefetch_ctx_t)); 1332 } 1333}
| 1334
| 1335static scan_prefetch_ctx_t * 1336scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) 1337{ 1338 scan_prefetch_ctx_t *spc; 1339 1340 spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); 1341 refcount_create(&spc->spc_refcnt); 1342 refcount_add(&spc->spc_refcnt, tag); 1343 spc->spc_scn = scn; 1344 if (dnp != NULL) { 1345 spc->spc_datablkszsec = dnp->dn_datablkszsec; 1346 spc->spc_indblkshift = dnp->dn_indblkshift; 1347 spc->spc_root = B_FALSE; 1348 } else { 1349 spc->spc_datablkszsec = 0; 1350 spc->spc_indblkshift = 0; 1351 spc->spc_root = B_TRUE; 1352 } 1353 1354 return (spc); 1355} 1356 1357static void 1358scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) 1359{ 1360 refcount_add(&spc->spc_refcnt, tag); 1361} 1362 1363static boolean_t 1364dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, 1365 const zbookmark_phys_t *zb) 1366{ 1367 zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; 1368 dnode_phys_t tmp_dnp; 1369 dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp; 1370 1371 if (zb->zb_objset != last_zb->zb_objset) 1372 return (B_TRUE); 1373 if ((int64_t)zb->zb_object < 0) 1374 return (B_FALSE); 1375 1376 tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; 1377 tmp_dnp.dn_indblkshift = spc->spc_indblkshift; 1378 1379 if (zbookmark_subtree_completed(dnp, zb, last_zb)) 1380 return (B_TRUE); 1381 1382 return (B_FALSE); 1383} 1384 1385static void 1386dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) 1387{ 1388 avl_index_t idx; 1389 dsl_scan_t *scn = spc->spc_scn; 1390 spa_t *spa = scn->scn_dp->dp_spa; 1391 scan_prefetch_issue_ctx_t *spic; 1392
| 1393 if (zfs_no_scrub_prefetch) 1394 return; 1395
| 1396 if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || 1397 (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && 1398 BP_GET_TYPE(bp) != DMU_OT_OBJSET))
| 1399 return; 1400
| 1401 if (dsl_scan_check_prefetch_resume(spc, zb)) 1402 return;
| 1403
| 1404 scan_prefetch_ctx_add_ref(spc, scn); 1405 spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); 1406 spic->spic_spc = spc; 1407 spic->spic_bp = *bp; 1408 spic->spic_zb = *zb; 1409 1410 /* 1411 * Add the IO to the queue of blocks to prefetch. This allows us to 1412 * prioritize blocks that we will need first for the main traversal 1413 * thread. 1414 */ 1415 mutex_enter(&spa->spa_scrub_lock); 1416 if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { 1417 /* this block is already queued for prefetch */ 1418 kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); 1419 scan_prefetch_ctx_rele(spc, scn); 1420 mutex_exit(&spa->spa_scrub_lock); 1421 return; 1422 } 1423 1424 avl_insert(&scn->scn_prefetch_queue, spic, idx); 1425 cv_broadcast(&spa->spa_scrub_io_cv); 1426 mutex_exit(&spa->spa_scrub_lock);
| 1427} 1428
| 1429static void 1430dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, 1431 uint64_t objset, uint64_t object) 1432{ 1433 int i; 1434 zbookmark_phys_t zb; 1435 scan_prefetch_ctx_t *spc; 1436 1437 if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 1438 return; 1439 1440 SET_BOOKMARK(&zb, objset, object, 0, 0); 1441 1442 spc = scan_prefetch_ctx_create(scn, dnp, FTAG); 1443 1444 for (i = 0; i < dnp->dn_nblkptr; i++) { 1445 zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); 1446 zb.zb_blkid = i; 1447 dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); 1448 } 1449 1450 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 1451 zb.zb_level = 0; 1452 zb.zb_blkid = DMU_SPILL_BLKID; 1453 dsl_scan_prefetch(spc, &dnp->dn_spill, &zb); 1454 } 1455 1456 scan_prefetch_ctx_rele(spc, FTAG); 1457} 1458 1459void 1460dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, 1461 arc_buf_t *buf, void *private) 1462{ 1463 scan_prefetch_ctx_t *spc = private; 1464 dsl_scan_t *scn = spc->spc_scn; 1465 spa_t *spa = scn->scn_dp->dp_spa; 1466 1467 /* broadcast that the IO has completed for rate limitting purposes */ 1468 mutex_enter(&spa->spa_scrub_lock); 1469 ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); 1470 spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); 1471 cv_broadcast(&spa->spa_scrub_io_cv); 1472 mutex_exit(&spa->spa_scrub_lock); 1473 1474 /* if there was an error or we are done prefetching, just cleanup */ 1475 if (buf == NULL || scn->scn_suspending) 1476 goto out; 1477 1478 if (BP_GET_LEVEL(bp) > 0) { 1479 int i; 1480 blkptr_t *cbp; 1481 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 1482 zbookmark_phys_t czb; 1483 1484 for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { 1485 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 1486 zb->zb_level - 1, zb->zb_blkid * epb + i); 1487 dsl_scan_prefetch(spc, cbp, &czb); 1488 } 1489 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 1490 dnode_phys_t *cdnp = buf->b_data; 1491 int i; 1492 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 1493 1494 for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { 1495 dsl_scan_prefetch_dnode(scn, cdnp, 1496 zb->zb_objset, zb->zb_blkid * epb + i); 1497 } 1498 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 1499 objset_phys_t *osp = buf->b_data; 1500 1501 dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, 1502 zb->zb_objset, DMU_META_DNODE_OBJECT); 1503 1504 if (OBJSET_BUF_HAS_USERUSED(buf)) { 1505 dsl_scan_prefetch_dnode(scn, 1506 &osp->os_groupused_dnode, zb->zb_objset, 1507 DMU_GROUPUSED_OBJECT); 1508 dsl_scan_prefetch_dnode(scn, 1509 &osp->os_userused_dnode, zb->zb_objset, 1510 DMU_USERUSED_OBJECT); 1511 } 1512 } 1513 1514out: 1515 if (buf != NULL) 1516 arc_buf_destroy(buf, private); 1517 scan_prefetch_ctx_rele(spc, scn); 1518} 1519 1520/* ARGSUSED */ 1521static void 1522dsl_scan_prefetch_thread(void *arg) 1523{ 1524 dsl_scan_t *scn = arg; 1525 spa_t *spa = scn->scn_dp->dp_spa; 1526 vdev_t *rvd = spa->spa_root_vdev; 1527 uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; 1528 scan_prefetch_issue_ctx_t *spic; 1529 1530 /* loop until we are told to stop */ 1531 while (!scn->scn_prefetch_stop) { 1532 arc_flags_t flags = ARC_FLAG_NOWAIT | 1533 ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; 1534 int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; 1535 1536 mutex_enter(&spa->spa_scrub_lock); 1537 1538 /* 1539 * Wait until we have an IO to issue and are not above our 1540 * maximum in flight limit. 
1541 */ 1542 while (!scn->scn_prefetch_stop && 1543 (avl_numnodes(&scn->scn_prefetch_queue) == 0 || 1544 spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { 1545 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1546 } 1547 1548 /* recheck if we should stop since we waited for the cv */ 1549 if (scn->scn_prefetch_stop) { 1550 mutex_exit(&spa->spa_scrub_lock); 1551 break; 1552 } 1553 1554 /* remove the prefetch IO from the tree */ 1555 spic = avl_first(&scn->scn_prefetch_queue); 1556 spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); 1557 avl_remove(&scn->scn_prefetch_queue, spic); 1558 1559 mutex_exit(&spa->spa_scrub_lock); 1560 1561 /* issue the prefetch asynchronously */ 1562 (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, 1563 &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, 1564 ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); 1565 1566 kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); 1567 } 1568 1569 ASSERT(scn->scn_prefetch_stop); 1570 1571 /* free any prefetches we didn't get to complete */ 1572 mutex_enter(&spa->spa_scrub_lock); 1573 while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { 1574 avl_remove(&scn->scn_prefetch_queue, spic); 1575 scan_prefetch_ctx_rele(spic->spic_spc, scn); 1576 kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); 1577 } 1578 ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); 1579 mutex_exit(&spa->spa_scrub_lock); 1580} 1581
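/*
 * A sketch of one block's path through the prefetcher (a summary of the
 * functions above, not an additional code path): dsl_scan_visit_rootbp()
 * creates a root scan_prefetch_ctx_t and calls dsl_scan_prefetch(), which
 * wraps the bp and bookmark in a scan_prefetch_issue_ctx_t, inserts it into
 * scn_prefetch_queue and signals spa_scrub_io_cv. dsl_scan_prefetch_thread()
 * pops the lowest bookmark, charges the block against spa_scrub_inflight
 * and issues an asynchronous arc_read() whose callback,
 * dsl_scan_prefetch_cb(), queues the children of indirect, dnode and objset
 * blocks in the same way. The context refcount keeps the dnode geometry
 * alive until the last prefetch that references it completes or the scan
 * suspends.
 */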
| 1582static boolean_t 1583dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, 1584 const zbookmark_phys_t *zb) 1585{ 1586 /* 1587 * We never skip over user/group accounting objects (obj<0) 1588 */ 1589 if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && 1590 (int64_t)zb->zb_object >= 0) { 1591 /* 1592 * If we already visited this bp & everything below (in 1593 * a prior txg sync), don't bother doing it again. 1594 */ 1595 if (zbookmark_subtree_completed(dnp, zb, 1596 &scn->scn_phys.scn_bookmark)) 1597 return (B_TRUE); 1598 1599 /* 1600 * If we found the block we're trying to resume from, or 1601 * we went past it to a different object, zero it out to 1602 * indicate that it's OK to start checking for suspending 1603 * again. 1604 */ 1605 if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || 1606 zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { 1607 dprintf("resuming at %llx/%llx/%llx/%llx\n", 1608 (longlong_t)zb->zb_objset, 1609 (longlong_t)zb->zb_object, 1610 (longlong_t)zb->zb_level, 1611 (longlong_t)zb->zb_blkid); 1612 bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); 1613 } 1614 } 1615 return (B_FALSE); 1616} 1617
| 1618static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, 1619 dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, 1620 dmu_objset_type_t ostype, dmu_tx_t *tx); 1621static void dsl_scan_visitdnode( 1622 dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, 1623 dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); 1624
| 1625/* 1626 * Return nonzero on i/o error. 1627 * Return new buf to write out in *bufp. 1628 */ 1629static int 1630dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, 1631 dnode_phys_t *dnp, const blkptr_t *bp, 1632 const zbookmark_phys_t *zb, dmu_tx_t *tx) 1633{ 1634 dsl_pool_t *dp = scn->scn_dp; 1635 int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; 1636 int err; 1637 1638 if (BP_GET_LEVEL(bp) > 0) { 1639 arc_flags_t flags = ARC_FLAG_WAIT; 1640 int i; 1641 blkptr_t *cbp; 1642 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 1643 arc_buf_t *buf; 1644 1645 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
| 1646 ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
| 1647 if (err) { 1648 scn->scn_phys.scn_errors++; 1649 return (err); 1650 } 1651 for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
| 1652 zbookmark_phys_t czb; 1653 1654 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 1655 zb->zb_level - 1, 1656 zb->zb_blkid * epb + i); 1657 dsl_scan_visitbp(cbp, &czb, dnp, 1658 ds, scn, ostype, tx); 1659 } 1660 arc_buf_destroy(buf, &buf); 1661 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 1662 arc_flags_t flags = ARC_FLAG_WAIT; 1663 dnode_phys_t *cdnp;
| 1664 int i;
| 1665 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 1666 arc_buf_t *buf; 1667 1668 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
| 1669 ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
| 1670 if (err) { 1671 scn->scn_phys.scn_errors++; 1672 return (err); 1673 } 1674 for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
| 1675 dsl_scan_visitdnode(scn, ds, ostype, 1676 cdnp, zb->zb_blkid * epb + i, tx); 1677 } 1678 1679 arc_buf_destroy(buf, &buf); 1680 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 1681 arc_flags_t flags = ARC_FLAG_WAIT; 1682 objset_phys_t *osp; 1683 arc_buf_t *buf; 1684 1685 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
| 1686 ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
| 1687 if (err) { 1688 scn->scn_phys.scn_errors++; 1689 return (err); 1690 } 1691 1692 osp = buf->b_data; 1693 1694 dsl_scan_visitdnode(scn, ds, osp->os_type, 1695 &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); 1696 1697 if (OBJSET_BUF_HAS_USERUSED(buf)) { 1698 /* 1699 * We also always visit user/group accounting 1700 * objects, and never skip them, even if we are 1701 * suspending. This is necessary so that the space 1702 * deltas from this txg get integrated. 1703 */ 1704 dsl_scan_visitdnode(scn, ds, osp->os_type, 1705 &osp->os_groupused_dnode, 1706 DMU_GROUPUSED_OBJECT, tx); 1707 dsl_scan_visitdnode(scn, ds, osp->os_type, 1708 &osp->os_userused_dnode, 1709 DMU_USERUSED_OBJECT, tx); 1710 } 1711 arc_buf_destroy(buf, &buf); 1712 } 1713 1714 return (0); 1715} 1716 1717static void 1718dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, 1719 dmu_objset_type_t ostype, dnode_phys_t *dnp, 1720 uint64_t object, dmu_tx_t *tx) 1721{ 1722 int j; 1723 1724 for (j = 0; j < dnp->dn_nblkptr; j++) { 1725 zbookmark_phys_t czb; 1726 1727 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 1728 dnp->dn_nlevels - 1, j); 1729 dsl_scan_visitbp(&dnp->dn_blkptr[j], 1730 &czb, dnp, ds, scn, ostype, tx); 1731 } 1732 1733 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 1734 zbookmark_phys_t czb; 1735 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 1736 0, DMU_SPILL_BLKID); 1737 dsl_scan_visitbp(&dnp->dn_spill, 1738 &czb, dnp, ds, scn, ostype, tx); 1739 } 1740} 1741 1742/* 1743 * The arguments are in this order because mdb can only print the 1744 * first 5; we want them to be useful. 1745 */ 1746static void 1747dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, 1748 dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, 1749 dmu_objset_type_t ostype, dmu_tx_t *tx) 1750{ 1751 dsl_pool_t *dp = scn->scn_dp;
| 1752 blkptr_t *bp_toread = NULL;
| 1753
| 1754 if (dsl_scan_check_suspend(scn, zb)) 1755 return; 1756 1757 if (dsl_scan_check_resume(scn, dnp, zb)) 1758 return; 1759
| 1760 scn->scn_visited_this_txg++; 1761 1762 dprintf_bp(bp, 1763 "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", 1764 ds, ds ? ds->ds_object : 0, 1765 zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, 1766 bp); 1767
| 1768 if (BP_IS_HOLE(bp)) { 1769 scn->scn_holes_this_txg++;
| 1770 return;
| 1771 }
| 1772
| 1773 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { 1774 scn->scn_lt_min_this_txg++;
| 1775 return;
| 1776 }
| 1777
| 1778 bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); 1779 *bp_toread = *bp; 1780 1781 if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) 1782 goto out; 1783
| 1784 /* 1785 * If dsl_scan_ddt() has already visited this block, it will have 1786 * already done any translations or scrubbing, so don't call the 1787 * callback again. 1788 */ 1789 if (ddt_class_contains(dp->dp_spa, 1790 scn->scn_phys.scn_ddt_class_max, bp)) {
| 1791 scn->scn_ddt_contained_this_txg++; 1792 goto out;
| 1793 } 1794 1795 /* 1796 * If this block is from the future (after cur_max_txg), then we 1797 * are doing this on behalf of a deleted snapshot, and we will 1798 * revisit the future block on the next pass of this dataset. 1799 * Don't scan it now unless we need to because something 1800 * under it was modified. 1801 */
| 1802 if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { 1803 scn->scn_gt_max_this_txg++; 1804 goto out;
| 1805 }
| 1806 1807 scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); 1808out: 1809 kmem_free(bp_toread, sizeof (blkptr_t));
| 1810} 1811 1812static void 1813dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, 1814 dmu_tx_t *tx) 1815{ 1816 zbookmark_phys_t zb;
| 1817 scan_prefetch_ctx_t *spc;
| 1818 1819 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 1820 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
| 1821
| 1822 if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { 1823 SET_BOOKMARK(&scn->scn_prefetch_bookmark, 1824 zb.zb_objset, 0, 0, 0); 1825 } else { 1826 scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; 1827 } 1828 1829 scn->scn_objsets_visited_this_txg++; 1830 1831 spc = scan_prefetch_ctx_create(scn, NULL, FTAG); 1832 dsl_scan_prefetch(spc, bp, &zb); 1833 scan_prefetch_ctx_rele(spc, FTAG); 1834 1835 dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); 1836
| 1837 dprintf_ds(ds, "finished scan%s", ""); 1838} 1839
| 1840static void 1841ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
| 1842{
| 1843 if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
| 1844 if (ds->ds_is_snapshot) { 1845 /* 1846 * Note: 1847 * - scn_cur_{min,max}_txg stays the same. 1848 * - Setting the flag is not really necessary if 1849 * scn_cur_max_txg == scn_max_txg, because there 1850 * is nothing after this snapshot that we care 1851 * about. However, we set it anyway and then 1852 * ignore it when we retraverse it in 1853 * dsl_scan_visitds(). 1854 */
| 1855 scn_phys->scn_bookmark.zb_objset =
| 1856 dsl_dataset_phys(ds)->ds_next_snap_obj; 1857 zfs_dbgmsg("destroying ds %llu; currently traversing; " 1858 "reset zb_objset to %llu", 1859 (u_longlong_t)ds->ds_object, 1860 (u_longlong_t)dsl_dataset_phys(ds)-> 1861 ds_next_snap_obj);
| 1862 scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
| 1863 } else {
| 1864 SET_BOOKMARK(&scn_phys->scn_bookmark,
| 1865 ZB_DESTROYED_OBJSET, 0, 0, 0); 1866 zfs_dbgmsg("destroying ds %llu; currently traversing; " 1867 "reset bookmark to -1,0,0,0", 1868 (u_longlong_t)ds->ds_object); 1869 }
| 1870 } 1871} 1872 1873/* 1874 * Invoked when a dataset is destroyed. We need to make sure that: 1875 * 1876 * 1) If it is the dataset that was currently being scanned, we write 1877 * a new dsl_scan_phys_t, marking the objset reference in it 1878 * as destroyed. 1879 * 2) Remove it from the work queue, if it was present. 1880 * 1881 * If the dataset was actually a snapshot, instead of marking the dataset 1882 * as destroyed, we substitute the next snapshot in line. 1883 */ 1884void 1885dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) 1886{ 1887 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1888 dsl_scan_t *scn = dp->dp_scan; 1889 uint64_t mintxg; 1890 1891 if (!dsl_scan_is_running(scn)) 1892 return; 1893 1894 ds_destroyed_scn_phys(ds, &scn->scn_phys); 1895 ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); 1896 1897 if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { 1898 scan_ds_queue_remove(scn, ds->ds_object); 1899 if (ds->ds_is_snapshot) 1900 scan_ds_queue_insert(scn, 1901 dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); 1902 } 1903 1904 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, 1905 ds->ds_object, &mintxg) == 0) {
| 1906 ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); 1907 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 1908 scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); 1909 if (ds->ds_is_snapshot) { 1910 /* 1911 * We keep the same mintxg; it could be > 1912 * ds_creation_txg if the previous snapshot was 1913 * deleted too. 1914 */ 1915 VERIFY(zap_add_int_key(dp->dp_meta_objset, 1916 scn->scn_phys.scn_queue_obj, 1917 dsl_dataset_phys(ds)->ds_next_snap_obj, 1918 mintxg, tx) == 0); 1919 zfs_dbgmsg("destroying ds %llu; in queue; " 1920 "replacing with %llu", 1921 (u_longlong_t)ds->ds_object, 1922 (u_longlong_t)dsl_dataset_phys(ds)-> 1923 ds_next_snap_obj); 1924 } else { 1925 zfs_dbgmsg("destroying ds %llu; in queue; removing", 1926 (u_longlong_t)ds->ds_object); 1927 } 1928 } 1929 1930 /* 1931 * dsl_scan_sync() should be called after this, and should sync 1932 * out our changed state, but just to be safe, do it here. 1933 */
| 1934 dsl_scan_sync_state(scn, tx, SYNC_CACHED);
| 1935} 1936
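/*
 * Worked example (object numbers are invented): if snapshot 183 sits in the
 * work queue with mintxg 50 when it is destroyed, and its ds_next_snap_obj
 * is 211, then the entry (183, 50) is replaced by (211, 50) in both the
 * in-memory queue and the on-disk queue ZAP, so the scan later picks up at
 * the next snapshot with the same txg range. If the destroyed dataset was
 * instead the one currently being traversed and is not a snapshot, the
 * bookmark's objset is set to ZB_DESTROYED_OBJSET and the traversal simply
 * moves on.
 */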
| 1937static void 1938ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) 1939{ 1940 if (scn_bookmark->zb_objset == ds->ds_object) { 1941 scn_bookmark->zb_objset = 1942 dsl_dataset_phys(ds)->ds_prev_snap_obj; 1943 zfs_dbgmsg("snapshotting ds %llu; currently traversing; " 1944 "reset zb_objset to %llu", 1945 (u_longlong_t)ds->ds_object, 1946 (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); 1947 } 1948} 1949 1950/* 1951 * Called when a dataset is snapshotted. If we were currently traversing 1952 * this snapshot, we reset our bookmark to point at the newly created 1953 * snapshot. We also modify our work queue to remove the old snapshot and 1954 * replace with the new one. 1955 */
| 1956void 1957dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) 1958{ 1959 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1960 dsl_scan_t *scn = dp->dp_scan; 1961 uint64_t mintxg; 1962
| 1963 if (!dsl_scan_is_running(scn))
| 1964 return; 1965 1966 ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); 1967
| 1968 ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); 1969 ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); 1970 1971 if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { 1972 scan_ds_queue_remove(scn, ds->ds_object); 1973 scan_ds_queue_insert(scn, 1974 dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); 1975 } 1976 1977 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, 1978 ds->ds_object, &mintxg) == 0) {
| 1979 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 1980 scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); 1981 VERIFY(zap_add_int_key(dp->dp_meta_objset, 1982 scn->scn_phys.scn_queue_obj, 1983 dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); 1984 zfs_dbgmsg("snapshotting ds %llu; in queue; " 1985 "replacing with %llu", 1986 (u_longlong_t)ds->ds_object, 1987 (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); 1988 }
| 1989 1990 dsl_scan_sync_state(scn, tx, SYNC_CACHED);
| 1991} 1992
| 1993static void 1994ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, 1995 zbookmark_phys_t *scn_bookmark)
| 1996{
| 1997 if (scn_bookmark->zb_objset == ds1->ds_object) { 1998 scn_bookmark->zb_objset = ds2->ds_object;
| 1999 zfs_dbgmsg("clone_swap ds %llu; currently traversing; " 2000 "reset zb_objset to %llu", 2001 (u_longlong_t)ds1->ds_object, 2002 (u_longlong_t)ds2->ds_object);
| 2003 } else if (scn_bookmark->zb_objset == ds2->ds_object) { 2004 scn_bookmark->zb_objset = ds1->ds_object;
| 2005 zfs_dbgmsg("clone_swap ds %llu; currently traversing; " 2006 "reset zb_objset to %llu", 2007 (u_longlong_t)ds2->ds_object, 2008 (u_longlong_t)ds1->ds_object); 2009 }
| 2010}
| 2011
| 2012/* 2013 * Called when a parent dataset and its clone are swapped. If we were 2014 * currently traversing the dataset, we need to switch to traversing the 2015 * newly promoted parent. 2016 */ 2017void 2018dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) 2019{ 2020 dsl_pool_t *dp = ds1->ds_dir->dd_pool; 2021 dsl_scan_t *scn = dp->dp_scan; 2022 uint64_t mintxg; 2023 2024 if (!dsl_scan_is_running(scn)) 2025 return; 2026 2027 ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); 2028 ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); 2029 2030 if (scan_ds_queue_contains(scn, ds1->ds_object, &mintxg)) { 2031 scan_ds_queue_remove(scn, ds1->ds_object); 2032 scan_ds_queue_insert(scn, ds2->ds_object, mintxg); 2033 } 2034 if (scan_ds_queue_contains(scn, ds2->ds_object, &mintxg)) { 2035 scan_ds_queue_remove(scn, ds2->ds_object); 2036 scan_ds_queue_insert(scn, ds1->ds_object, mintxg); 2037 } 2038
| 2039 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, 2040 ds1->ds_object, &mintxg) == 0) { 2041 int err;
| 2042 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); 2043 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); 2044 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2045 scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); 2046 err = zap_add_int_key(dp->dp_meta_objset, 2047 scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); 2048 VERIFY(err == 0 || err == EEXIST); 2049 if (err == EEXIST) { 2050 /* Both were there to begin with */ 2051 VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, 2052 scn->scn_phys.scn_queue_obj, 2053 ds1->ds_object, mintxg, tx)); 2054 } 2055 zfs_dbgmsg("clone_swap ds %llu; in queue; " 2056 "replacing with %llu", 2057 (u_longlong_t)ds1->ds_object, 2058 (u_longlong_t)ds2->ds_object);
| 2059 } 2060 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, 2061 ds2->ds_object, &mintxg) == 0) {
| 2062 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); 2063 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); 2064 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2065 scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); 2066 VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, 2067 scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); 2068 zfs_dbgmsg("clone_swap ds %llu; in queue; " 2069 "replacing with %llu", 2070 (u_longlong_t)ds2->ds_object, 2071 (u_longlong_t)ds1->ds_object); 2072 } 2073
| 2074 dsl_scan_sync_state(scn, tx, SYNC_CACHED);
| 2075} 2076
| 2077/* ARGSUSED */ 2078static int 2079enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) 2080{
| 2081 uint64_t originobj = *(uint64_t *)arg;
| 2082 dsl_dataset_t *ds; 2083 int err; 2084 dsl_scan_t *scn = dp->dp_scan; 2085
| 2086 if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
| 2087 return (0); 2088 2089 err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); 2090 if (err) 2091 return (err); 2092
| 2093 while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) {
| 2094 dsl_dataset_t *prev; 2095 err = dsl_dataset_hold_obj(dp, 2096 dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); 2097 2098 dsl_dataset_rele(ds, FTAG); 2099 if (err) 2100 return (err); 2101 ds = prev; 2102 }
| 2103 scan_ds_queue_insert(scn, ds->ds_object, 2104 dsl_dataset_phys(ds)->ds_prev_snap_txg);
| 2105 dsl_dataset_rele(ds, FTAG); 2106 return (0); 2107} 2108 2109static void 2110dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) 2111{ 2112 dsl_pool_t *dp = scn->scn_dp; 2113 dsl_dataset_t *ds; 2114 2115 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 2116 2117 if (scn->scn_phys.scn_cur_min_txg >= 2118 scn->scn_phys.scn_max_txg) { 2119 /* 2120 * This can happen if this snapshot was created after the 2121 * scan started, and we already completed a previous snapshot 2122 * that was created after the scan started. This snapshot 2123 * only references blocks with: 2124 * 2125 * birth < our ds_creation_txg 2126 * cur_min_txg is no less than ds_creation_txg. 2127 * We have already visited these blocks. 2128 * or 2129 * birth > scn_max_txg 2130 * The scan requested not to visit these blocks. 2131 * 2132 * Subsequent snapshots (and clones) can reference our 2133 * blocks, or blocks with even higher birth times. 2134 * Therefore we do not need to visit them either, 2135 * so we do not add them to the work queue. 2136 * 2137 * Note that checking for cur_min_txg >= cur_max_txg 2138 * is not sufficient, because in that case we may need to 2139 * visit subsequent snapshots. This happens when min_txg > 0, 2140 * which raises cur_min_txg. In this case we will visit 2141 * this dataset but skip all of its blocks, because the 2142 * rootbp's birth time is < cur_min_txg. Then we will 2143 * add the next snapshots/clones to the work queue. 2144 */ 2145 char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); 2146 dsl_dataset_name(ds, dsname); 2147 zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " 2148 "cur_min_txg (%llu) >= max_txg (%llu)",
| 2149 (longlong_t)dsobj, dsname, 2150 (longlong_t)scn->scn_phys.scn_cur_min_txg, 2151 (longlong_t)scn->scn_phys.scn_max_txg);
| 2152 kmem_free(dsname, MAXNAMELEN); 2153 2154 goto out; 2155 } 2156 2157 /* 2158 * Only the ZIL in the head (non-snapshot) is valid. Even though 2159 * snapshots can have ZIL block pointers (which may be the same 2160 * BP as in the head), they must be ignored. In addition, $ORIGIN 2161 * doesn't have a objset (i.e. its ds_bp is a hole) so we don't 2162 * need to look for a ZIL in it either. So we traverse the ZIL here, 2163 * rather than in scan_recurse(), because the regular snapshot 2164 * block-sharing rules don't apply to it. 2165 */ 2166 if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) && 2167 ds->ds_dir != dp->dp_origin_snap->ds_dir) { 2168 objset_t *os; 2169 if (dmu_objset_from_ds(ds, &os) != 0) { 2170 goto out; 2171 } 2172 dsl_scan_zil(dp, &os->os_zil_header); 2173 } 2174 2175 /* 2176 * Iterate over the bps in this ds. 2177 */ 2178 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2179 rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); 2180 dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); 2181 rrw_exit(&ds->ds_bp_rwlock, FTAG); 2182 2183 char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 2184 dsl_dataset_name(ds, dsname); 2185 zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " 2186 "suspending=%u", 2187 (longlong_t)dsobj, dsname, 2188 (longlong_t)scn->scn_phys.scn_cur_min_txg, 2189 (longlong_t)scn->scn_phys.scn_cur_max_txg, 2190 (int)scn->scn_suspending); 2191 kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); 2192 2193 if (scn->scn_suspending) 2194 goto out; 2195 2196 /* 2197 * We've finished this pass over this dataset. 2198 */ 2199 2200 /* 2201 * If we did not completely visit this dataset, do another pass. 2202 */ 2203 if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { 2204 zfs_dbgmsg("incomplete pass; visiting again"); 2205 scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
|
1255 VERIFY(zap_add_int_key(dp->dp_meta_objset, 1256 scn->scn_phys.scn_queue_obj, ds->ds_object, 1257 scn->scn_phys.scn_cur_max_txg, tx) == 0);
| 2206 scan_ds_queue_insert(scn, ds->ds_object, 2207 scn->scn_phys.scn_cur_max_txg);
|
1258 goto out; 1259 } 1260 1261 /* 1262 * Add descendent datasets to work queue. 1263 */ 1264 if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
| 2208 goto out; 2209 } 2210 2211 /* 2212 * Add descendent datasets to work queue. 2213 */ 2214 if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
|
1265 VERIFY(zap_add_int_key(dp->dp_meta_objset, 1266 scn->scn_phys.scn_queue_obj,
| 2215 scan_ds_queue_insert(scn,
|
1267 dsl_dataset_phys(ds)->ds_next_snap_obj,
| 2216 dsl_dataset_phys(ds)->ds_next_snap_obj,
|
1268 dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
| 2217 dsl_dataset_phys(ds)->ds_creation_txg);
|
1269 } 1270 if (dsl_dataset_phys(ds)->ds_num_children > 1) { 1271 boolean_t usenext = B_FALSE; 1272 if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { 1273 uint64_t count; 1274 /* 1275 * A bug in a previous version of the code could 1276 * cause upgrade_clones_cb() to not set 1277 * ds_next_snap_obj when it should, leading to a 1278 * missing entry. Therefore we can only use the 1279 * next_clones_obj when its count is correct. 1280 */ 1281 int err = zap_count(dp->dp_meta_objset, 1282 dsl_dataset_phys(ds)->ds_next_clones_obj, &count); 1283 if (err == 0 && 1284 count == dsl_dataset_phys(ds)->ds_num_children - 1) 1285 usenext = B_TRUE; 1286 } 1287 1288 if (usenext) {
| 2218 } 2219 if (dsl_dataset_phys(ds)->ds_num_children > 1) { 2220 boolean_t usenext = B_FALSE; 2221 if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { 2222 uint64_t count; 2223 /* 2224 * A bug in a previous version of the code could 2225 * cause upgrade_clones_cb() to not set 2226 * ds_next_snap_obj when it should, leading to a 2227 * missing entry. Therefore we can only use the 2228 * next_clones_obj when its count is correct. 2229 */ 2230 int err = zap_count(dp->dp_meta_objset, 2231 dsl_dataset_phys(ds)->ds_next_clones_obj, &count); 2232 if (err == 0 && 2233 count == dsl_dataset_phys(ds)->ds_num_children - 1) 2234 usenext = B_TRUE; 2235 } 2236 2237 if (usenext) {
|
1289 VERIFY0(zap_join_key(dp->dp_meta_objset, 1290 dsl_dataset_phys(ds)->ds_next_clones_obj, 1291 scn->scn_phys.scn_queue_obj, 1292 dsl_dataset_phys(ds)->ds_creation_txg, tx));
| 2238 zap_cursor_t zc; 2239 zap_attribute_t za; 2240 for (zap_cursor_init(&zc, dp->dp_meta_objset, 2241 dsl_dataset_phys(ds)->ds_next_clones_obj); 2242 zap_cursor_retrieve(&zc, &za) == 0; 2243 (void) zap_cursor_advance(&zc)) { 2244 scan_ds_queue_insert(scn, 2245 zfs_strtonum(za.za_name, NULL), 2246 dsl_dataset_phys(ds)->ds_creation_txg); 2247 } 2248 zap_cursor_fini(&zc);
|
1293 } else {
| 2249 } else {
|
1294 struct enqueue_clones_arg eca; 1295 eca.tx = tx; 1296 eca.originobj = ds->ds_object; 1297
| |
1298 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
| 2250 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
|
1299 enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
| 2251 enqueue_clones_cb, &ds->ds_object, 2252 DS_FIND_CHILDREN));
|
1300 } 1301 } 1302 1303out: 1304 dsl_dataset_rele(ds, FTAG); 1305} 1306 1307/* ARGSUSED */ 1308static int 1309enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) 1310{
| 2253 } 2254 } 2255 2256out: 2257 dsl_dataset_rele(ds, FTAG); 2258} 2259 2260/* ARGSUSED */ 2261static int 2262enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) 2263{
|
1311 dmu_tx_t *tx = arg;
| |
1312 dsl_dataset_t *ds; 1313 int err; 1314 dsl_scan_t *scn = dp->dp_scan; 1315 1316 err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); 1317 if (err) 1318 return (err); 1319 1320 while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { 1321 dsl_dataset_t *prev; 1322 err = dsl_dataset_hold_obj(dp, 1323 dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); 1324 if (err) { 1325 dsl_dataset_rele(ds, FTAG); 1326 return (err); 1327 } 1328 1329 /* 1330 * If this is a clone, we don't need to worry about it for now. 1331 */ 1332 if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { 1333 dsl_dataset_rele(ds, FTAG); 1334 dsl_dataset_rele(prev, FTAG); 1335 return (0); 1336 } 1337 dsl_dataset_rele(ds, FTAG); 1338 ds = prev; 1339 } 1340
| 2264 dsl_dataset_t *ds; 2265 int err; 2266 dsl_scan_t *scn = dp->dp_scan; 2267 2268 err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); 2269 if (err) 2270 return (err); 2271 2272 while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { 2273 dsl_dataset_t *prev; 2274 err = dsl_dataset_hold_obj(dp, 2275 dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); 2276 if (err) { 2277 dsl_dataset_rele(ds, FTAG); 2278 return (err); 2279 } 2280 2281 /* 2282 * If this is a clone, we don't need to worry about it for now. 2283 */ 2284 if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { 2285 dsl_dataset_rele(ds, FTAG); 2286 dsl_dataset_rele(prev, FTAG); 2287 return (0); 2288 } 2289 dsl_dataset_rele(ds, FTAG); 2290 ds = prev; 2291 } 2292
|
1341 VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, 1342 ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
| 2293 scan_ds_queue_insert(scn, ds->ds_object, 2294 dsl_dataset_phys(ds)->ds_prev_snap_txg);
|
1343 dsl_dataset_rele(ds, FTAG); 1344 return (0); 1345} 1346
| 2295 dsl_dataset_rele(ds, FTAG); 2296 return (0); 2297} 2298
|
| 2299/* ARGSUSED */ 2300void 2301dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, 2302 ddt_entry_t *dde, dmu_tx_t *tx) 2303{ 2304 const ddt_key_t *ddk = &dde->dde_key; 2305 ddt_phys_t *ddp = dde->dde_phys; 2306 blkptr_t bp; 2307 zbookmark_phys_t zb = { 0 }; 2308 int p; 2309 2310 if (scn->scn_phys.scn_state != DSS_SCANNING) 2311 return; 2312 2313 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2314 if (ddp->ddp_phys_birth == 0 || 2315 ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) 2316 continue; 2317 ddt_bp_create(checksum, ddk, ddp, &bp); 2318 2319 scn->scn_visited_this_txg++; 2320 scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); 2321 } 2322} 2323
|
1347/* 1348 * Scrub/dedup interaction. 1349 * 1350 * If there are N references to a deduped block, we don't want to scrub it 1351 * N times -- ideally, we should scrub it exactly once. 1352 * 1353 * We leverage the fact that the dde's replication class (enum ddt_class) 1354 * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest 1355 * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. 1356 * 1357 * To prevent excess scrubbing, the scrub begins by walking the DDT 1358 * to find all blocks with refcnt > 1, and scrubs each of these once. 1359 * Since there are two replication classes which contain blocks with 1360 * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. 1361 * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. 1362 * 1363 * There would be nothing more to say if a block's refcnt couldn't change 1364 * during a scrub, but of course it can so we must account for changes 1365 * in a block's replication class. 1366 * 1367 * Here's an example of what can occur: 1368 * 1369 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 1370 * when visited during the top-down scrub phase, it will be scrubbed twice. 1371 * This negates our scrub optimization, but is otherwise harmless. 1372 * 1373 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 1374 * on each visit during the top-down scrub phase, it will never be scrubbed. 1375 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's 1376 * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to 1377 * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 1378 * while a scrub is in progress, it scrubs the block right then. 1379 */ 1380static void 1381dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) 1382{ 1383 ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; 1384 ddt_entry_t dde = { 0 }; 1385 int error; 1386 uint64_t n = 0; 1387 1388 while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { 1389 ddt_t *ddt; 1390 1391 if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) 1392 break; 1393 dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", 1394 (longlong_t)ddb->ddb_class, 1395 (longlong_t)ddb->ddb_type, 1396 (longlong_t)ddb->ddb_checksum, 1397 (longlong_t)ddb->ddb_cursor); 1398 1399 /* There should be no pending changes to the dedup table */ 1400 ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; 1401 ASSERT(avl_first(&ddt->ddt_tree) == NULL); 1402 1403 dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); 1404 n++; 1405 1406 if (dsl_scan_check_suspend(scn, NULL)) 1407 break; 1408 } 1409 1410 zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; " 1411 "suspending=%u", (longlong_t)n, 1412 (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); 1413 1414 ASSERT(error == 0 || error == ENOENT); 1415 ASSERT(error != ENOENT || 1416 ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); 1417} 1418
| 2324/* 2325 * Scrub/dedup interaction. 2326 * 2327 * If there are N references to a deduped block, we don't want to scrub it 2328 * N times -- ideally, we should scrub it exactly once. 2329 * 2330 * We leverage the fact that the dde's replication class (enum ddt_class) 2331 * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest 2332 * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. 2333 * 2334 * To prevent excess scrubbing, the scrub begins by walking the DDT 2335 * to find all blocks with refcnt > 1, and scrubs each of these once. 2336 * Since there are two replication classes which contain blocks with 2337 * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. 2338 * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. 2339 * 2340 * There would be nothing more to say if a block's refcnt couldn't change 2341 * during a scrub, but of course it can so we must account for changes 2342 * in a block's replication class. 2343 * 2344 * Here's an example of what can occur: 2345 * 2346 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 2347 * when visited during the top-down scrub phase, it will be scrubbed twice. 2348 * This negates our scrub optimization, but is otherwise harmless. 2349 * 2350 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 2351 * on each visit during the top-down scrub phase, it will never be scrubbed. 2352 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's 2353 * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to 2354 * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 2355 * while a scrub is in progress, it scrubs the block right then. 2356 */ 2357static void 2358dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) 2359{ 2360 ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; 2361 ddt_entry_t dde = { 0 }; 2362 int error; 2363 uint64_t n = 0; 2364 2365 while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { 2366 ddt_t *ddt; 2367 2368 if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) 2369 break; 2370 dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", 2371 (longlong_t)ddb->ddb_class, 2372 (longlong_t)ddb->ddb_type, 2373 (longlong_t)ddb->ddb_checksum, 2374 (longlong_t)ddb->ddb_cursor); 2375 2376 /* There should be no pending changes to the dedup table */ 2377 ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; 2378 ASSERT(avl_first(&ddt->ddt_tree) == NULL); 2379 2380 dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); 2381 n++; 2382 2383 if (dsl_scan_check_suspend(scn, NULL)) 2384 break; 2385 } 2386 2387 zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; " 2388 "suspending=%u", (longlong_t)n, 2389 (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); 2390 2391 ASSERT(error == 0 || error == ENOENT); 2392 ASSERT(error != ENOENT || 2393 ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); 2394} 2395
|
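The block comment above describes a two-phase, dedup-aware ordering: blocks with refcnt > 1 are visited once while walking the DDT, and the later top-down pass only issues I/O for blocks with refcnt == 1. A minimal standalone sketch of that ordering follows; every type and name in it is made up for illustration and none of it is ZFS code.

/*
 * Toy illustration of the two-phase dedup-aware scrub described in the
 * comment above.  All names and types here are hypothetical; this is not
 * ZFS code.
 */
#include <stdio.h>

struct toy_block {
	int refcnt;	/* dedup reference count */
	int scrubbed;	/* times we issued I/O for this block */
};

int
main(void)
{
	struct toy_block blk[] = {
		{ .refcnt = 3 }, { .refcnt = 1 }, { .refcnt = 2 }, { .refcnt = 1 },
	};
	int nblk = sizeof (blk) / sizeof (blk[0]);
	int i;

	/* Phase 1: the "DDT walk" scrubs every deduped block exactly once. */
	for (i = 0; i < nblk; i++) {
		if (blk[i].refcnt > 1)
			blk[i].scrubbed++;
	}

	/* Phase 2: the top-down pass only visits blocks the DDT phase skipped. */
	for (i = 0; i < nblk; i++) {
		if (blk[i].refcnt == 1)
			blk[i].scrubbed++;
	}

	for (i = 0; i < nblk; i++) {
		printf("block %d: refcnt=%d scrubbed %d time(s)\n",
		    i, blk[i].refcnt, blk[i].scrubbed);
	}
	return (0);
}

In this idealized run each block is scrubbed exactly once; the comment above explains the two ways that invariant can be bent when a block's refcount changes while the scan is in progress.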
1419/* ARGSUSED */ 1420void 1421dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, 1422 ddt_entry_t *dde, dmu_tx_t *tx)
| 2396static uint64_t 2397dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
|
1423{
| 2398{
|
1424 const ddt_key_t *ddk = &dde->dde_key; 1425 ddt_phys_t *ddp = dde->dde_phys; 1426 blkptr_t bp; 1427 zbookmark_phys_t zb = { 0 }; 1428 1429 if (scn->scn_phys.scn_state != DSS_SCANNING) 1430 return; 1431 1432 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1433 if (ddp->ddp_phys_birth == 0 || 1434 ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) 1435 continue; 1436 ddt_bp_create(checksum, ddk, ddp, &bp); 1437 1438 scn->scn_visited_this_txg++; 1439 scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); 1440 }
| 2399 uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; 2400 if (ds->ds_is_snapshot) 2401 return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); 2402 return (smt);
|
1441} 1442 1443static void 1444dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) 1445{
| 2403} 2404 2405static void 2406dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) 2407{
|
| 2408 scan_ds_t *sds;
|
1446 dsl_pool_t *dp = scn->scn_dp;
| 2409 dsl_pool_t *dp = scn->scn_dp;
|
1447 zap_cursor_t zc; 1448 zap_attribute_t za;
| |
1449 1450 if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= 1451 scn->scn_phys.scn_ddt_class_max) { 1452 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; 1453 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; 1454 dsl_scan_ddt(scn, tx); 1455 if (scn->scn_suspending) 1456 return; 1457 } 1458 1459 if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { 1460 /* First do the MOS & ORIGIN */ 1461 1462 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; 1463 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; 1464 dsl_scan_visit_rootbp(scn, NULL, 1465 &dp->dp_meta_rootbp, tx); 1466 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); 1467 if (scn->scn_suspending) 1468 return; 1469 1470 if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { 1471 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
| 2410 2411 if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= 2412 scn->scn_phys.scn_ddt_class_max) { 2413 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; 2414 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; 2415 dsl_scan_ddt(scn, tx); 2416 if (scn->scn_suspending) 2417 return; 2418 } 2419 2420 if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { 2421 /* First do the MOS & ORIGIN */ 2422 2423 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; 2424 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; 2425 dsl_scan_visit_rootbp(scn, NULL, 2426 &dp->dp_meta_rootbp, tx); 2427 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); 2428 if (scn->scn_suspending) 2429 return; 2430 2431 if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { 2432 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
|
1472 enqueue_cb, tx, DS_FIND_CHILDREN));
| 2433 enqueue_cb, NULL, DS_FIND_CHILDREN));
|
1473 } else { 1474 dsl_scan_visitds(scn, 1475 dp->dp_origin_snap->ds_object, tx); 1476 } 1477 ASSERT(!scn->scn_suspending); 1478 } else if (scn->scn_phys.scn_bookmark.zb_objset != 1479 ZB_DESTROYED_OBJSET) {
| 2434 } else { 2435 dsl_scan_visitds(scn, 2436 dp->dp_origin_snap->ds_object, tx); 2437 } 2438 ASSERT(!scn->scn_suspending); 2439 } else if (scn->scn_phys.scn_bookmark.zb_objset != 2440 ZB_DESTROYED_OBJSET) {
|
| 2441 uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
|
1480 /*
| 2442 /*
|
1481 * If we were suspended, continue from here. Note if the
| 2443 * If we were suspended, continue from here. Note if the
|
1482 * ds we were suspended on was deleted, the zb_objset may 1483 * be -1, so we will skip this and find a new objset 1484 * below. 1485 */
| 2444 * ds we were suspended on was deleted, the zb_objset may 2445 * be -1, so we will skip this and find a new objset 2446 * below. 2447 */
|
1486 dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
| 2448 dsl_scan_visitds(scn, dsobj, tx);
|
1487 if (scn->scn_suspending) 1488 return; 1489 } 1490 1491 /*
| 2449 if (scn->scn_suspending) 2450 return; 2451 } 2452 2453 /*
|
1492 * In case we were suspended right at the end of the ds, zero the
| 2454 * In case we suspended right at the end of the ds, zero the
|
1493 * bookmark so we don't think that we're still trying to resume. 1494 */ 1495 bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); 1496
| 2455 * bookmark so we don't think that we're still trying to resume. 2456 */ 2457 bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); 2458
|
1497 /* keep pulling things out of the zap-object-as-queue */ 1498 while (zap_cursor_init(&zc, dp->dp_meta_objset, 1499 scn->scn_phys.scn_queue_obj), 1500 zap_cursor_retrieve(&zc, &za) == 0) {
| 2459 /* 2460 * Keep pulling things out of the dataset avl queue. Updates to the 2461 * persistent zap-object-as-queue happen only at checkpoints. 2462 */ 2463 while ((sds = avl_first(&scn->scn_queue)) != NULL) {
|
1501 dsl_dataset_t *ds;
| 2464 dsl_dataset_t *ds;
|
1502 uint64_t dsobj;
| 2465 uint64_t dsobj = sds->sds_dsobj; 2466 uint64_t txg = sds->sds_txg;
|
1503
| 2467
|
1504 dsobj = zfs_strtonum(za.za_name, NULL); 1505 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 1506 scn->scn_phys.scn_queue_obj, dsobj, tx));
| 2468 /* dequeue and free the ds from the queue */ 2469 scan_ds_queue_remove(scn, dsobj); 2470 sds = NULL; /* must not be touched after removal */
|
1507
| 2471
|
1508 /* Set up min/max txg */
| 2472 /* Set up min / max txg */
|
1509 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
| 2473 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
|
1510 if (za.za_first_integer != 0) {
| 2474 if (txg != 0) {
|
1511 scn->scn_phys.scn_cur_min_txg =
| 2475 scn->scn_phys.scn_cur_min_txg =
|
1512 MAX(scn->scn_phys.scn_min_txg, 1513 za.za_first_integer);
| 2476 MAX(scn->scn_phys.scn_min_txg, txg);
|
1514 } else { 1515 scn->scn_phys.scn_cur_min_txg = 1516 MAX(scn->scn_phys.scn_min_txg, 1517 dsl_dataset_phys(ds)->ds_prev_snap_txg); 1518 } 1519 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); 1520 dsl_dataset_rele(ds, FTAG); 1521 1522 dsl_scan_visitds(scn, dsobj, tx);
| 2477 } else { 2478 scn->scn_phys.scn_cur_min_txg = 2479 MAX(scn->scn_phys.scn_min_txg, 2480 dsl_dataset_phys(ds)->ds_prev_snap_txg); 2481 } 2482 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); 2483 dsl_dataset_rele(ds, FTAG); 2484 2485 dsl_scan_visitds(scn, dsobj, tx);
|
1523 zap_cursor_fini(&zc);
| |
1524 if (scn->scn_suspending) 1525 return; 1526 }
| 2486 if (scn->scn_suspending) 2487 return; 2488 }
|
1527 zap_cursor_fini(&zc);
| 2489 /* No more objsets to fetch, we're done */ 2490 scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; 2491 ASSERT0(scn->scn_suspending);
|
1528} 1529
| 2492} 2493
|
| 2494static uint64_t 2495dsl_scan_count_leaves(vdev_t *vd) 2496{ 2497 uint64_t i, leaves = 0; 2498 2499 /* we only count leaves that belong to the main pool and are readable */ 2500 if (vd->vdev_islog || vd->vdev_isspare || 2501 vd->vdev_isl2cache || !vdev_readable(vd)) 2502 return (0); 2503 2504 if (vd->vdev_ops->vdev_op_leaf) 2505 return (1); 2506 2507 for (i = 0; i < vd->vdev_children; i++) { 2508 leaves += dsl_scan_count_leaves(vd->vdev_child[i]); 2509 } 2510 2511 return (leaves); 2512} 2513 2514 2515static void 2516scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) 2517{ 2518 int i; 2519 uint64_t cur_size = 0; 2520 2521 for (i = 0; i < BP_GET_NDVAS(bp); i++) { 2522 cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); 2523 } 2524 2525 q->q_total_zio_size_this_txg += cur_size; 2526 q->q_zios_this_txg++; 2527} 2528 2529static void 2530scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, 2531 uint64_t end) 2532{ 2533 q->q_total_seg_size_this_txg += end - start; 2534 q->q_segs_this_txg++; 2535} 2536
|
1530static boolean_t
| 2537static boolean_t
|
| 2538scan_io_queue_check_suspend(dsl_scan_t *scn) 2539{ 2540 /* See comment in dsl_scan_check_suspend() */ 2541 uint64_t curr_time_ns = gethrtime(); 2542 uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; 2543 uint64_t sync_time_ns = curr_time_ns - 2544 scn->scn_dp->dp_spa->spa_sync_starttime; 2545 int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; 2546 int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 2547 zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; 2548 2549 return ((NSEC2MSEC(scan_time_ns) > mintime && 2550 (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || 2551 txg_sync_waiting(scn->scn_dp) || 2552 NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || 2553 spa_shutting_down(scn->scn_dp->dp_spa)); 2554} 2555 2556/* 2557 * Given a list of scan_io_t's in io_list, this issues the io's out to 2558 * disk. This consumes the io_list and frees the scan_io_t's. This is 2559 * called when emptying queues, either when we're up against the memory 2560 * limit or when we have finished scanning. Returns B_TRUE if we stopped 2561 * processing the list before we finished. Any zios that were not issued 2562 * will remain in the io_list. 2563 */ 2564static boolean_t 2565scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) 2566{ 2567 dsl_scan_t *scn = queue->q_scn; 2568 scan_io_t *sio; 2569 int64_t bytes_issued = 0; 2570 boolean_t suspended = B_FALSE; 2571 2572 while ((sio = list_head(io_list)) != NULL) { 2573 blkptr_t bp; 2574 2575 if (scan_io_queue_check_suspend(scn)) { 2576 suspended = B_TRUE; 2577 break; 2578 } 2579 2580 sio2bp(sio, &bp, queue->q_vd->vdev_id); 2581 bytes_issued += sio->sio_asize; 2582 scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, 2583 &sio->sio_zb, queue); 2584 (void) list_remove_head(io_list); 2585 scan_io_queues_update_zio_stats(queue, &bp); 2586 kmem_free(sio, sizeof (*sio)); 2587 } 2588 2589 atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); 2590 2591 return (suspended); 2592} 2593 2594/* 2595 * Given a range_seg_t (extent) and a list, this function passes over a 2596 * scan queue and gathers up the appropriate ios which fit into that 2597 * scan seg (starting from lowest LBA). At the end, we remove the segment 2598 * from the q_exts_by_addr range tree. 2599 */ 2600static boolean_t 2601scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) 2602{ 2603 scan_io_t srch_sio, *sio, *next_sio; 2604 avl_index_t idx; 2605 uint_t num_sios = 0; 2606 int64_t bytes_issued = 0; 2607 2608 ASSERT(rs != NULL); 2609 ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); 2610 2611 srch_sio.sio_offset = rs->rs_start; 2612 2613 /* 2614 * The exact start of the extent might not contain any matching zios, 2615 * so if that's the case, examine the next one in the tree. 2616 */ 2617 sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); 2618 if (sio == NULL) 2619 sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); 2620 2621 while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { 2622 ASSERT3U(sio->sio_offset, >=, rs->rs_start); 2623 ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); 2624 2625 next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); 2626 avl_remove(&queue->q_sios_by_addr, sio); 2627 2628 bytes_issued += sio->sio_asize; 2629 num_sios++; 2630 list_insert_tail(list, sio); 2631 sio = next_sio; 2632 } 2633 2634 /* 2635 * We limit the number of sios we process at once to 32 to avoid 2636 * biting off more than we can chew. 
If we didn't take everything 2637 * in the segment we update it to reflect the work we were able to 2638 * complete. Otherwise, we remove it from the range tree entirely. 2639 */ 2640 if (sio != NULL && sio->sio_offset < rs->rs_end) { 2641 range_tree_adjust_fill(queue->q_exts_by_addr, rs, 2642 -bytes_issued); 2643 range_tree_resize_segment(queue->q_exts_by_addr, rs, 2644 sio->sio_offset, rs->rs_end - sio->sio_offset); 2645 2646 return (B_TRUE); 2647 } else { 2648 range_tree_remove(queue->q_exts_by_addr, rs->rs_start, 2649 rs->rs_end - rs->rs_start); 2650 return (B_FALSE); 2651 } 2652} 2653 2654 2655/* 2656 * This is called from the queue emptying thread and selects the next 2657 * extent from which we are to issue io's. The behavior of this function 2658 * depends on the state of the scan, the current memory consumption and 2659 * whether or not we are performing a scan shutdown. 2660 * 1) We select extents in an elevator algorithm (LBA-order) if the scan 2661 * needs to perform a checkpoint 2662 * 2) We select the largest available extent if we are up against the 2663 * memory limit. 2664 * 3) Otherwise we don't select any extents. 2665 */ 2666static const range_seg_t * 2667scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) 2668{ 2669 dsl_scan_t *scn = queue->q_scn; 2670 2671 ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); 2672 ASSERT(scn->scn_is_sorted); 2673 2674 /* handle tunable overrides */ 2675 if (scn->scn_checkpointing || scn->scn_clearing) { 2676 if (zfs_scan_issue_strategy == 1) { 2677 return (range_tree_first(queue->q_exts_by_addr)); 2678 } else if (zfs_scan_issue_strategy == 2) { 2679 return (avl_first(&queue->q_exts_by_size)); 2680 } 2681 } 2682 2683 /* 2684 * During normal clearing, we want to issue our largest segments 2685 * first, keeping IO as sequential as possible, and leaving the 2686 * smaller extents for later with the hope that they might eventually 2687 * grow to larger sequential segments. However, when the scan is 2688 * checkpointing, no new extents will be added to the sorting queue, 2689 * so the way we are sorted now is as good as it will ever get. 2690 * In this case, we instead switch to issuing extents in LBA order. 
2691 */ 2692 if (scn->scn_checkpointing) { 2693 return (range_tree_first(queue->q_exts_by_addr)); 2694 } else if (scn->scn_clearing) { 2695 return (avl_first(&queue->q_exts_by_size)); 2696 } else { 2697 return (NULL); 2698 } 2699} 2700 2701static void 2702scan_io_queues_run_one(void *arg) 2703{ 2704 dsl_scan_io_queue_t *queue = arg; 2705 kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; 2706 boolean_t suspended = B_FALSE; 2707 range_seg_t *rs = NULL; 2708 scan_io_t *sio = NULL; 2709 list_t sio_list; 2710 uint64_t bytes_per_leaf = zfs_scan_vdev_limit; 2711 uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd); 2712 2713 ASSERT(queue->q_scn->scn_is_sorted); 2714 2715 list_create(&sio_list, sizeof (scan_io_t), 2716 offsetof(scan_io_t, sio_nodes.sio_list_node)); 2717 mutex_enter(q_lock); 2718 2719 /* calculate maximum in-flight bytes for this txg (min 1MB) */ 2720 queue->q_maxinflight_bytes = 2721 MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); 2722 2723 /* reset per-queue scan statistics for this txg */ 2724 queue->q_total_seg_size_this_txg = 0; 2725 queue->q_segs_this_txg = 0; 2726 queue->q_total_zio_size_this_txg = 0; 2727 queue->q_zios_this_txg = 0; 2728 2729 /* loop until we have run out of time or sios */ 2730 while ((rs = (range_seg_t*)scan_io_queue_fetch_ext(queue)) != NULL) { 2731 uint64_t seg_start = 0, seg_end = 0; 2732 boolean_t more_left = B_TRUE; 2733 2734 ASSERT(list_is_empty(&sio_list)); 2735 2736 /* loop while we still have sios left to process in this rs */ 2737 while (more_left) { 2738 scan_io_t *first_sio, *last_sio; 2739 2740 /* 2741 * We have selected which extent needs to be 2742 * processed next. Gather up the corresponding sios. 2743 */ 2744 more_left = scan_io_queue_gather(queue, rs, &sio_list); 2745 ASSERT(!list_is_empty(&sio_list)); 2746 first_sio = list_head(&sio_list); 2747 last_sio = list_tail(&sio_list); 2748 2749 seg_end = last_sio->sio_offset + last_sio->sio_asize; 2750 if (seg_start == 0) 2751 seg_start = first_sio->sio_offset; 2752 2753 /* 2754 * Issuing sios can take a long time so drop the 2755 * queue lock. The sio queue won't be updated by 2756 * other threads since we're in syncing context so 2757 * we can be sure that our trees will remain exactly 2758 * as we left them. 2759 */ 2760 mutex_exit(q_lock); 2761 suspended = scan_io_queue_issue(queue, &sio_list); 2762 mutex_enter(q_lock); 2763 2764 if (suspended) 2765 break; 2766 } 2767 /* update statistics for debugging purposes */ 2768 scan_io_queues_update_seg_stats(queue, seg_start, seg_end); 2769 2770 if (suspended) 2771 break; 2772 } 2773 2774 2775 /* If we were suspended in the middle of processing, 2776 * requeue any unfinished sios and exit. 2777 */ 2778 while ((sio = list_head(&sio_list)) != NULL) { 2779 list_remove(&sio_list, sio); 2780 scan_io_queue_insert_impl(queue, sio); 2781 } 2782 2783 mutex_exit(q_lock); 2784 list_destroy(&sio_list); 2785} 2786 2787/* 2788 * Performs an emptying run on all scan queues in the pool. This just 2789 * punches out one thread per top-level vdev, each of which processes 2790 * only that vdev's scan queue. We can parallelize the I/O here because 2791 * we know that each queue's io's only affect its own top-level vdev. 2792 * 2793 * This function waits for the queue runs to complete, and must be 2794 * called from dsl_scan_sync (or in general, syncing context). 
2795 */ 2796static void 2797scan_io_queues_run(dsl_scan_t *scn) 2798{ 2799 spa_t *spa = scn->scn_dp->dp_spa; 2800 2801 ASSERT(scn->scn_is_sorted); 2802 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2803 2804 if (scn->scn_bytes_pending == 0) 2805 return; 2806 2807 if (scn->scn_taskq == NULL) { 2808 char *tq_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN + 16, 2809 KM_SLEEP); 2810 int nthreads = spa->spa_root_vdev->vdev_children; 2811 2812 /* 2813 * We need to make this taskq *always* execute as many 2814 * threads in parallel as we have top-level vdevs and no 2815 * less, otherwise strange serialization of the calls to 2816 * scan_io_queues_run_one can occur during spa_sync runs 2817 * and that significantly impacts performance. 2818 */ 2819 (void) snprintf(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16, 2820 "dsl_scan_tq_%s", spa->spa_name); 2821 scn->scn_taskq = taskq_create(tq_name, nthreads, minclsyspri, 2822 nthreads, nthreads, TASKQ_PREPOPULATE); 2823 kmem_free(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16); 2824 } 2825 2826 for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { 2827 vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; 2828 2829 mutex_enter(&vd->vdev_scan_io_queue_lock); 2830 if (vd->vdev_scan_io_queue != NULL) { 2831 VERIFY(taskq_dispatch(scn->scn_taskq, 2832 scan_io_queues_run_one, vd->vdev_scan_io_queue, 2833 TQ_SLEEP) != TASKQID_INVALID); 2834 } 2835 mutex_exit(&vd->vdev_scan_io_queue_lock); 2836 } 2837 2838 /* 2839 * Wait for the queues to finish issuing their IOs for this run 2840 * before we return. There may still be IOs in flight at this 2841 * point. 2842 */ 2843 taskq_wait(scn->scn_taskq); 2844} 2845 2846static boolean_t
|
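scan_io_queue_gather() above caps each gathering pass at 32 sios and, when a segment is only partially consumed, resizes it so the unissued tail stays queued. The standalone sketch below models just that bookkeeping with a plain sorted array instead of the AVL tree and range tree used by the real code; all names are invented and nothing here is ZFS code.

/*
 * Toy model of the gathering step above: take at most max_take I/Os from
 * a sorted list that fall inside an extent, then shrink the extent to
 * cover only the work that was not issued.  Toy types only; not ZFS code.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_extent {
	uint64_t start;
	uint64_t end;
};

/* Returns the number of I/Os issued; updates *ext to the remaining work. */
static int
toy_gather(const uint64_t *offsets, int n, struct toy_extent *ext, int max_take)
{
	int taken = 0;
	int i;

	for (i = 0; i < n && taken < max_take; i++) {
		if (offsets[i] >= ext->start && offsets[i] < ext->end)
			taken++;
	}

	if (i < n && offsets[i] < ext->end) {
		/* partial pass: the extent now begins at the first unissued I/O */
		ext->start = offsets[i];
	} else {
		/* everything in range was issued; the extent is fully consumed */
		ext->start = ext->end;
	}
	return (taken);
}

int
main(void)
{
	uint64_t offs[] = { 0, 8192, 16384, 24576, 32768 };
	struct toy_extent ext = { .start = 0, .end = 40960 };
	int issued = toy_gather(offs, 5, &ext, 3);

	printf("issued %d I/Os, remaining extent [%llu, %llu)\n", issued,
	    (unsigned long long)ext.start, (unsigned long long)ext.end);
	return (0);
}

With a cap of 3, the example issues three I/Os and leaves [24576, 40960) queued, mirroring how the real gather loop hands the remainder back to the range tree.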
1531dsl_scan_async_block_should_pause(dsl_scan_t *scn) 1532{ 1533 uint64_t elapsed_nanosecs; 1534 1535 if (zfs_recover) 1536 return (B_FALSE); 1537 1538 if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks) 1539 return (B_TRUE); 1540 1541 elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; 1542 return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || 1543 (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && 1544 txg_sync_waiting(scn->scn_dp)) || 1545 spa_shutting_down(scn->scn_dp->dp_spa)); 1546} 1547 1548static int 1549dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 1550{ 1551 dsl_scan_t *scn = arg; 1552 1553 if (!scn->scn_is_bptree || 1554 (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { 1555 if (dsl_scan_async_block_should_pause(scn)) 1556 return (SET_ERROR(ERESTART)); 1557 } 1558 1559 zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, 1560 dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); 1561 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 1562 -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), 1563 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 1564 scn->scn_visited_this_txg++; 1565 return (0); 1566} 1567
| 2847dsl_scan_async_block_should_pause(dsl_scan_t *scn) 2848{ 2849 uint64_t elapsed_nanosecs; 2850 2851 if (zfs_recover) 2852 return (B_FALSE); 2853 2854 if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks) 2855 return (B_TRUE); 2856 2857 elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; 2858 return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || 2859 (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && 2860 txg_sync_waiting(scn->scn_dp)) || 2861 spa_shutting_down(scn->scn_dp->dp_spa)); 2862} 2863 2864static int 2865dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2866{ 2867 dsl_scan_t *scn = arg; 2868 2869 if (!scn->scn_is_bptree || 2870 (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { 2871 if (dsl_scan_async_block_should_pause(scn)) 2872 return (SET_ERROR(ERESTART)); 2873 } 2874 2875 zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, 2876 dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); 2877 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2878 -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), 2879 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2880 scn->scn_visited_this_txg++; 2881 return (0); 2882} 2883
|
| 2884static void 2885dsl_scan_update_stats(dsl_scan_t *scn) 2886{ 2887 spa_t *spa = scn->scn_dp->dp_spa; 2888 uint64_t i; 2889 uint64_t seg_size_total = 0, zio_size_total = 0; 2890 uint64_t seg_count_total = 0, zio_count_total = 0; 2891 2892 for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { 2893 vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; 2894 dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; 2895 2896 if (queue == NULL) 2897 continue; 2898 2899 seg_size_total += queue->q_total_seg_size_this_txg; 2900 zio_size_total += queue->q_total_zio_size_this_txg; 2901 seg_count_total += queue->q_segs_this_txg; 2902 zio_count_total += queue->q_zios_this_txg; 2903 } 2904 2905 if (seg_count_total == 0 || zio_count_total == 0) { 2906 scn->scn_avg_seg_size_this_txg = 0; 2907 scn->scn_avg_zio_size_this_txg = 0; 2908 scn->scn_segs_this_txg = 0; 2909 scn->scn_zios_this_txg = 0; 2910 return; 2911 } 2912 2913 scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; 2914 scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total; 2915 scn->scn_segs_this_txg = seg_count_total; 2916 scn->scn_zios_this_txg = zio_count_total; 2917} 2918
|
1568static int 1569dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 1570{ 1571 dsl_scan_t *scn = arg; 1572 const dva_t *dva = &bp->blk_dva[0]; 1573 1574 if (dsl_scan_async_block_should_pause(scn)) 1575 return (SET_ERROR(ERESTART)); 1576 1577 spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, 1578 DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), 1579 DVA_GET_ASIZE(dva), tx); 1580 scn->scn_visited_this_txg++; 1581 return (0); 1582} 1583 1584boolean_t 1585dsl_scan_active(dsl_scan_t *scn) 1586{ 1587 spa_t *spa = scn->scn_dp->dp_spa; 1588 uint64_t used = 0, comp, uncomp; 1589 1590 if (spa->spa_load_state != SPA_LOAD_NONE) 1591 return (B_FALSE); 1592 if (spa_shutting_down(spa)) 1593 return (B_FALSE);
| 2919static int 2920dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2921{ 2922 dsl_scan_t *scn = arg; 2923 const dva_t *dva = &bp->blk_dva[0]; 2924 2925 if (dsl_scan_async_block_should_pause(scn)) 2926 return (SET_ERROR(ERESTART)); 2927 2928 spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, 2929 DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), 2930 DVA_GET_ASIZE(dva), tx); 2931 scn->scn_visited_this_txg++; 2932 return (0); 2933} 2934 2935boolean_t 2936dsl_scan_active(dsl_scan_t *scn) 2937{ 2938 spa_t *spa = scn->scn_dp->dp_spa; 2939 uint64_t used = 0, comp, uncomp; 2940 2941 if (spa->spa_load_state != SPA_LOAD_NONE) 2942 return (B_FALSE); 2943 if (spa_shutting_down(spa)) 2944 return (B_FALSE);
|
1594 if ((scn->scn_phys.scn_state == DSS_SCANNING && 1595 !dsl_scan_is_paused_scrub(scn)) ||
| 2945 if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) ||
|
1596 (scn->scn_async_destroying && !scn->scn_async_stalled)) 1597 return (B_TRUE); 1598 1599 if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { 1600 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, 1601 &used, &comp, &uncomp); 1602 } 1603 return (used != 0); 1604} 1605
| 2946 (scn->scn_async_destroying && !scn->scn_async_stalled)) 2947 return (B_TRUE); 2948 2949 if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { 2950 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, 2951 &used, &comp, &uncomp); 2952 } 2953 return (used != 0); 2954} 2955
|
| 2956static boolean_t 2957dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, 2958 uint64_t phys_birth) 2959{ 2960 vdev_t *vd; 2961 2962 if (DVA_GET_GANG(dva)) { 2963 /* 2964 * Gang members may be spread across multiple 2965 * vdevs, so the best estimate we have is the 2966 * scrub range, which has already been checked. 2967 * XXX -- it would be better to change our 2968 * allocation policy to ensure that all 2969 * gang members reside on the same vdev. 2970 */ 2971 return (B_TRUE); 2972 } 2973 2974 vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 2975 2976 /* 2977 * Check if the txg falls within the range which must be 2978 * resilvered. DVAs outside this range can always be skipped. 2979 */ 2980 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) 2981 return (B_FALSE); 2982 2983 /* 2984 * Check if the top-level vdev must resilver this offset. 2985 * When the offset does not intersect with a dirty leaf DTL 2986 * then it may be possible to skip the resilver IO. The psize 2987 * is provided instead of asize to simplify the check for RAIDZ. 2988 */ 2989 if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) 2990 return (B_FALSE); 2991 2992 return (B_TRUE); 2993} 2994
|
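dsl_scan_need_resilver() above skips I/Os whose physical birth txg does not intersect any dirty time log (DTL) range of the top-level vdev. The sketch below shows only that interval test on a toy DTL; the type names and ranges are invented for illustration and are not ZFS structures.

/*
 * Toy sketch of the birth-txg test above: a resilver only needs to touch
 * a block if its physical birth txg falls inside one of the vdev's dirty
 * (DTL) ranges.  Illustrative only; not ZFS code.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_dtl {
	uint64_t start_txg;	/* inclusive */
	uint64_t end_txg;	/* inclusive */
};

static int
toy_dtl_contains(const struct toy_dtl *dtl, int n, uint64_t phys_birth)
{
	for (int i = 0; i < n; i++) {
		if (phys_birth >= dtl[i].start_txg &&
		    phys_birth <= dtl[i].end_txg)
			return (1);
	}
	return (0);
}

int
main(void)
{
	/* pretend the vdev was missing during txgs 100-180 and 220-230 */
	struct toy_dtl dtl[] = { { 100, 180 }, { 220, 230 } };

	printf("birth 150 needs resilver: %d\n", toy_dtl_contains(dtl, 2, 150));
	printf("birth 200 needs resilver: %d\n", toy_dtl_contains(dtl, 2, 200));
	return (0);
}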
1606static int 1607dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) 1608{
| 2995static int 2996dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) 2997{
|
| 2998 int err = 0;
|
1609 dsl_scan_t *scn = dp->dp_scan; 1610 spa_t *spa = dp->dp_spa;
| 2999 dsl_scan_t *scn = dp->dp_scan; 3000 spa_t *spa = dp->dp_spa;
|
1611 int err = 0;
| |
1612 1613 if (spa_suspend_async_destroy(spa)) 1614 return (0); 1615 1616 if (zfs_free_bpobj_enabled &&
| 3001 3002 if (spa_suspend_async_destroy(spa)) 3003 return (0); 3004 3005 if (zfs_free_bpobj_enabled &&
|
1617 spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
| 3006 spa_version(spa) >= SPA_VERSION_DEADLISTS) {
|
1618 scn->scn_is_bptree = B_FALSE; 1619 scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
| 3007 scn->scn_is_bptree = B_FALSE; 3008 scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
|
1620 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
| 3009 scn->scn_zio_root = zio_root(spa, NULL,
|
1621 NULL, ZIO_FLAG_MUSTSUCCEED); 1622 err = bpobj_iterate(&dp->dp_free_bpobj, 1623 dsl_scan_free_block_cb, scn, tx);
| 3010 NULL, ZIO_FLAG_MUSTSUCCEED); 3011 err = bpobj_iterate(&dp->dp_free_bpobj, 3012 dsl_scan_free_block_cb, scn, tx);
|
1624 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
| 3013 VERIFY0(zio_wait(scn->scn_zio_root)); 3014 scn->scn_zio_root = NULL;
|
1625 1626 if (err != 0 && err != ERESTART) 1627 zfs_panic_recover("error %u from bpobj_iterate()", err); 1628 } 1629 1630 if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { 1631 ASSERT(scn->scn_async_destroying); 1632 scn->scn_is_bptree = B_TRUE;
| 3015 3016 if (err != 0 && err != ERESTART) 3017 zfs_panic_recover("error %u from bpobj_iterate()", err); 3018 } 3019 3020 if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { 3021 ASSERT(scn->scn_async_destroying); 3022 scn->scn_is_bptree = B_TRUE;
|
1633 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
| 3023 scn->scn_zio_root = zio_root(spa, NULL,
|
1634 NULL, ZIO_FLAG_MUSTSUCCEED); 1635 err = bptree_iterate(dp->dp_meta_objset, 1636 dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); 1637 VERIFY0(zio_wait(scn->scn_zio_root));
| 3024 NULL, ZIO_FLAG_MUSTSUCCEED); 3025 err = bptree_iterate(dp->dp_meta_objset, 3026 dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); 3027 VERIFY0(zio_wait(scn->scn_zio_root));
|
| 3028 scn->scn_zio_root = NULL;
|
1638 1639 if (err == EIO || err == ECKSUM) { 1640 err = 0; 1641 } else if (err != 0 && err != ERESTART) { 1642 zfs_panic_recover("error %u from " 1643 "traverse_dataset_destroyed()", err); 1644 } 1645 1646 if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { 1647 /* finished; deactivate async destroy feature */ 1648 spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); 1649 ASSERT(!spa_feature_is_active(spa, 1650 SPA_FEATURE_ASYNC_DESTROY)); 1651 VERIFY0(zap_remove(dp->dp_meta_objset, 1652 DMU_POOL_DIRECTORY_OBJECT, 1653 DMU_POOL_BPTREE_OBJ, tx)); 1654 VERIFY0(bptree_free(dp->dp_meta_objset, 1655 dp->dp_bptree_obj, tx)); 1656 dp->dp_bptree_obj = 0; 1657 scn->scn_async_destroying = B_FALSE; 1658 scn->scn_async_stalled = B_FALSE; 1659 } else { 1660 /* 1661 * If we didn't make progress, mark the async 1662 * destroy as stalled, so that we will not initiate 1663 * a spa_sync() on its behalf. Note that we only 1664 * check this if we are not finished, because if the 1665 * bptree had no blocks for us to visit, we can 1666 * finish without "making progress". 1667 */ 1668 scn->scn_async_stalled = 1669 (scn->scn_visited_this_txg == 0); 1670 } 1671 } 1672 if (scn->scn_visited_this_txg) { 1673 zfs_dbgmsg("freed %llu blocks in %llums from " 1674 "free_bpobj/bptree txg %llu; err=%d", 1675 (longlong_t)scn->scn_visited_this_txg, 1676 (longlong_t) 1677 NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), 1678 (longlong_t)tx->tx_txg, err); 1679 scn->scn_visited_this_txg = 0; 1680 1681 /* 1682 * Write out changes to the DDT that may be required as a 1683 * result of the blocks freed. This ensures that the DDT 1684 * is clean when a scrub/resilver runs. 1685 */ 1686 ddt_sync(spa, tx->tx_txg); 1687 } 1688 if (err != 0) 1689 return (err); 1690 if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && 1691 zfs_free_leak_on_eio && 1692 (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || 1693 dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || 1694 dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { 1695 /* 1696 * We have finished background destroying, but there is still 1697 * some space left in the dp_free_dir. Transfer this leaked 1698 * space to the dp_leak_dir. 
1699 */ 1700 if (dp->dp_leak_dir == NULL) { 1701 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 1702 (void) dsl_dir_create_sync(dp, dp->dp_root_dir, 1703 LEAK_DIR_NAME, tx); 1704 VERIFY0(dsl_pool_open_special_dir(dp, 1705 LEAK_DIR_NAME, &dp->dp_leak_dir)); 1706 rrw_exit(&dp->dp_config_rwlock, FTAG); 1707 } 1708 dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, 1709 dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, 1710 dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, 1711 dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); 1712 dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, 1713 -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, 1714 -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, 1715 -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); 1716 } 1717 1718 if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { 1719 /* finished; verify that space accounting went to zero */ 1720 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); 1721 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); 1722 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); 1723 } 1724 1725 EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), 1726 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1727 DMU_POOL_OBSOLETE_BPOBJ)); 1728 if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { 1729 ASSERT(spa_feature_is_active(dp->dp_spa, 1730 SPA_FEATURE_OBSOLETE_COUNTS)); 1731 1732 scn->scn_is_bptree = B_FALSE; 1733 scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; 1734 err = bpobj_iterate(&dp->dp_obsolete_bpobj, 1735 dsl_scan_obsolete_block_cb, scn, tx); 1736 if (err != 0 && err != ERESTART) 1737 zfs_panic_recover("error %u from bpobj_iterate()", err); 1738 1739 if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) 1740 dsl_pool_destroy_obsolete_bpobj(dp, tx); 1741 } 1742 1743 return (0); 1744} 1745
| 3029 3030 if (err == EIO || err == ECKSUM) { 3031 err = 0; 3032 } else if (err != 0 && err != ERESTART) { 3033 zfs_panic_recover("error %u from " 3034 "traverse_dataset_destroyed()", err); 3035 } 3036 3037 if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { 3038 /* finished; deactivate async destroy feature */ 3039 spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); 3040 ASSERT(!spa_feature_is_active(spa, 3041 SPA_FEATURE_ASYNC_DESTROY)); 3042 VERIFY0(zap_remove(dp->dp_meta_objset, 3043 DMU_POOL_DIRECTORY_OBJECT, 3044 DMU_POOL_BPTREE_OBJ, tx)); 3045 VERIFY0(bptree_free(dp->dp_meta_objset, 3046 dp->dp_bptree_obj, tx)); 3047 dp->dp_bptree_obj = 0; 3048 scn->scn_async_destroying = B_FALSE; 3049 scn->scn_async_stalled = B_FALSE; 3050 } else { 3051 /* 3052 * If we didn't make progress, mark the async 3053 * destroy as stalled, so that we will not initiate 3054 * a spa_sync() on its behalf. Note that we only 3055 * check this if we are not finished, because if the 3056 * bptree had no blocks for us to visit, we can 3057 * finish without "making progress". 3058 */ 3059 scn->scn_async_stalled = 3060 (scn->scn_visited_this_txg == 0); 3061 } 3062 } 3063 if (scn->scn_visited_this_txg) { 3064 zfs_dbgmsg("freed %llu blocks in %llums from " 3065 "free_bpobj/bptree txg %llu; err=%d", 3066 (longlong_t)scn->scn_visited_this_txg, 3067 (longlong_t) 3068 NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), 3069 (longlong_t)tx->tx_txg, err); 3070 scn->scn_visited_this_txg = 0; 3071 3072 /* 3073 * Write out changes to the DDT that may be required as a 3074 * result of the blocks freed. This ensures that the DDT 3075 * is clean when a scrub/resilver runs. 3076 */ 3077 ddt_sync(spa, tx->tx_txg); 3078 } 3079 if (err != 0) 3080 return (err); 3081 if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && 3082 zfs_free_leak_on_eio && 3083 (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || 3084 dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || 3085 dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { 3086 /* 3087 * We have finished background destroying, but there is still 3088 * some space left in the dp_free_dir. Transfer this leaked 3089 * space to the dp_leak_dir. 
3090 */ 3091 if (dp->dp_leak_dir == NULL) { 3092 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 3093 (void) dsl_dir_create_sync(dp, dp->dp_root_dir, 3094 LEAK_DIR_NAME, tx); 3095 VERIFY0(dsl_pool_open_special_dir(dp, 3096 LEAK_DIR_NAME, &dp->dp_leak_dir)); 3097 rrw_exit(&dp->dp_config_rwlock, FTAG); 3098 } 3099 dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, 3100 dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, 3101 dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, 3102 dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); 3103 dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, 3104 -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, 3105 -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, 3106 -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); 3107 } 3108 3109 if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { 3110 /* finished; verify that space accounting went to zero */ 3111 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); 3112 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); 3113 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); 3114 } 3115 3116 EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), 3117 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 3118 DMU_POOL_OBSOLETE_BPOBJ)); 3119 if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { 3120 ASSERT(spa_feature_is_active(dp->dp_spa, 3121 SPA_FEATURE_OBSOLETE_COUNTS)); 3122 3123 scn->scn_is_bptree = B_FALSE; 3124 scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; 3125 err = bpobj_iterate(&dp->dp_obsolete_bpobj, 3126 dsl_scan_obsolete_block_cb, scn, tx); 3127 if (err != 0 && err != ERESTART) 3128 zfs_panic_recover("error %u from bpobj_iterate()", err); 3129 3130 if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) 3131 dsl_pool_destroy_obsolete_bpobj(dp, tx); 3132 } 3133 3134 return (0); 3135} 3136
|
| 3137/* 3138 * This is the primary entry point for scans that is called from syncing 3139 * context. Scans must happen entirely during syncing context so that we 3140 * can guarantee that blocks we are currently scanning will not change out 3141 * from under us. While a scan is active, this function controls how quickly 3142 * transaction groups proceed, instead of the normal handling provided by 3143 * txg_sync_thread(). 3144 */
|
1746void 1747dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) 1748{ 1749 dsl_scan_t *scn = dp->dp_scan; 1750 spa_t *spa = dp->dp_spa; 1751 int err = 0;
| 3145void 3146dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) 3147{ 3148 dsl_scan_t *scn = dp->dp_scan; 3149 spa_t *spa = dp->dp_spa; 3150 int err = 0;
|
| 3151 state_sync_type_t sync_type = SYNC_OPTIONAL;
|
1752 1753 /* 1754 * Check for scn_restart_txg before checking spa_load_state, so 1755 * that we can restart an old-style scan while the pool is being 1756 * imported (see dsl_scan_init). 1757 */ 1758 if (dsl_scan_restarting(scn, tx)) { 1759 pool_scan_func_t func = POOL_SCAN_SCRUB; 1760 dsl_scan_done(scn, B_FALSE, tx); 1761 if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) 1762 func = POOL_SCAN_RESILVER; 1763 zfs_dbgmsg("restarting scan func=%u txg=%llu",
| 3152 3153 /* 3154 * Check for scn_restart_txg before checking spa_load_state, so 3155 * that we can restart an old-style scan while the pool is being 3156 * imported (see dsl_scan_init). 3157 */ 3158 if (dsl_scan_restarting(scn, tx)) { 3159 pool_scan_func_t func = POOL_SCAN_SCRUB; 3160 dsl_scan_done(scn, B_FALSE, tx); 3161 if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) 3162 func = POOL_SCAN_RESILVER; 3163 zfs_dbgmsg("restarting scan func=%u txg=%llu",
|
1764 func, tx->tx_txg);
| 3164 func, (longlong_t)tx->tx_txg);
|
1765 dsl_scan_setup_sync(&func, tx); 1766 } 1767 1768 /* 1769 * Only process scans in sync pass 1. 1770 */ 1771 if (spa_sync_pass(dp->dp_spa) > 1) 1772 return; 1773 1774 /* 1775 * If the spa is shutting down, then stop scanning. This will 1776 * ensure that the scan does not dirty any new data during the 1777 * shutdown phase. 1778 */ 1779 if (spa_shutting_down(spa)) 1780 return; 1781 1782 /* 1783 * If the scan is inactive due to a stalled async destroy, try again. 1784 */ 1785 if (!scn->scn_async_stalled && !dsl_scan_active(scn)) 1786 return; 1787
| 3165 dsl_scan_setup_sync(&func, tx); 3166 } 3167 3168 /* 3169 * Only process scans in sync pass 1. 3170 */ 3171 if (spa_sync_pass(dp->dp_spa) > 1) 3172 return; 3173 3174 /* 3175 * If the spa is shutting down, then stop scanning. This will 3176 * ensure that the scan does not dirty any new data during the 3177 * shutdown phase. 3178 */ 3179 if (spa_shutting_down(spa)) 3180 return; 3181 3182 /* 3183 * If the scan is inactive due to a stalled async destroy, try again. 3184 */ 3185 if (!scn->scn_async_stalled && !dsl_scan_active(scn)) 3186 return; 3187
|
| 3188 /* reset scan statistics */
|
1788 scn->scn_visited_this_txg = 0;
| 3189 scn->scn_visited_this_txg = 0;
|
| 3190 scn->scn_holes_this_txg = 0; 3191 scn->scn_lt_min_this_txg = 0; 3192 scn->scn_gt_max_this_txg = 0; 3193 scn->scn_ddt_contained_this_txg = 0; 3194 scn->scn_objsets_visited_this_txg = 0; 3195 scn->scn_avg_seg_size_this_txg = 0; 3196 scn->scn_segs_this_txg = 0; 3197 scn->scn_avg_zio_size_this_txg = 0; 3198 scn->scn_zios_this_txg = 0;
|
1789 scn->scn_suspending = B_FALSE; 1790 scn->scn_sync_start_time = gethrtime(); 1791 spa->spa_scrub_active = B_TRUE; 1792 1793 /* 1794 * First process the async destroys. If we pause, don't do 1795 * any scrubbing or resilvering. This ensures that there are no 1796 * async destroys while we are scanning, so the scan code doesn't 1797 * have to worry about traversing it. It is also faster to free the 1798 * blocks than to scrub them. 1799 */ 1800 err = dsl_process_async_destroys(dp, tx); 1801 if (err != 0) 1802 return; 1803
| 3199 scn->scn_suspending = B_FALSE; 3200 scn->scn_sync_start_time = gethrtime(); 3201 spa->spa_scrub_active = B_TRUE; 3202 3203 /* 3204 * First process the async destroys. If we pause, don't do 3205 * any scrubbing or resilvering. This ensures that there are no 3206 * async destroys while we are scanning, so the scan code doesn't 3207 * have to worry about traversing it. It is also faster to free the 3208 * blocks than to scrub them. 3209 */ 3210 err = dsl_process_async_destroys(dp, tx); 3211 if (err != 0) 3212 return; 3213
|
1804 if (scn->scn_phys.scn_state != DSS_SCANNING)
| 3214 if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
|
1805 return; 1806
| 3215 return; 3216
|
1807 if (scn->scn_done_txg == tx->tx_txg) { 1808 ASSERT(!scn->scn_suspending); 1809 /* finished with scan. */ 1810 zfs_dbgmsg("txg %llu scan complete", tx->tx_txg); 1811 dsl_scan_done(scn, B_TRUE, tx); 1812 ASSERT3U(spa->spa_scrub_inflight, ==, 0); 1813 dsl_scan_sync_state(scn, tx);
| 3217 /* 3218 * Wait a few txgs after importing to begin scanning so that 3219 * we can get the pool imported quickly. 3220 */ 3221 if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
|
1814 return;
| 3222 return;
|
| 3223 3224 /* 3225 * It is possible to switch from unsorted to sorted at any time, 3226 * but afterwards the scan will remain sorted unless reloaded from 3227 * a checkpoint after a reboot. 3228 */ 3229 if (!zfs_scan_legacy) { 3230 scn->scn_is_sorted = B_TRUE; 3231 if (scn->scn_last_checkpoint == 0) 3232 scn->scn_last_checkpoint = ddi_get_lbolt();
|
1815 } 1816
| 3233 } 3234
|
1817 if (dsl_scan_is_paused_scrub(scn)) 1818 return;
| 3235 /* 3236 * For sorted scans, determine what kind of work we will be doing 3237 * this txg based on our memory limitations and whether or not we 3238 * need to perform a checkpoint. 3239 */ 3240 if (scn->scn_is_sorted) { 3241 /* 3242 * If we are over our checkpoint interval, set scn_clearing 3243 * so that we can begin checkpointing immediately. The 3244 * checkpoint allows us to save a consistent bookmark 3245 * representing how much data we have scrubbed so far. 3246 * Otherwise, use the memory limit to determine if we should 3247 * scan for metadata or start issuing scrub IOs. We accumulate 3248 * metadata until we hit our hard memory limit at which point 3249 * we issue scrub IOs until we are at our soft memory limit. 3250 */ 3251 if (scn->scn_checkpointing || 3252 ddi_get_lbolt() - scn->scn_last_checkpoint > 3253 SEC_TO_TICK(zfs_scan_checkpoint_intval)) { 3254 if (!scn->scn_checkpointing) 3255 zfs_dbgmsg("begin scan checkpoint");
|
1819
| 3256
|
1820 if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= 1821 scn->scn_phys.scn_ddt_class_max) { 1822 zfs_dbgmsg("doing scan sync txg %llu; " 1823 "ddt bm=%llu/%llu/%llu/%llx", 1824 (longlong_t)tx->tx_txg, 1825 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, 1826 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, 1827 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, 1828 (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); 1829 ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); 1830 ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); 1831 ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); 1832 ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
| 3257 scn->scn_checkpointing = B_TRUE; 3258 scn->scn_clearing = B_TRUE; 3259 } else { 3260 boolean_t should_clear = dsl_scan_should_clear(scn); 3261 if (should_clear && !scn->scn_clearing) { 3262 zfs_dbgmsg("begin scan clearing"); 3263 scn->scn_clearing = B_TRUE; 3264 } else if (!should_clear && scn->scn_clearing) { 3265 zfs_dbgmsg("finish scan clearing"); 3266 scn->scn_clearing = B_FALSE; 3267 } 3268 }
|
1833 } else {
| 3269 } else {
|
1834 zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", 1835 (longlong_t)tx->tx_txg, 1836 (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, 1837 (longlong_t)scn->scn_phys.scn_bookmark.zb_object, 1838 (longlong_t)scn->scn_phys.scn_bookmark.zb_level, 1839 (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
| 3270 ASSERT0(scn->scn_checkpointing); 3271 ASSERT0(scn->scn_clearing);
|
1840 } 1841
| 3272 } 3273
|
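The sorted-scan branch above alternates between gathering metadata and issuing queued I/O using a hard/soft memory limit, as its comment describes: keep gathering until memory use crosses the hard limit, then keep issuing until it falls back below the soft limit. A minimal sketch of that hysteresis follows; the thresholds and names are assumptions made up for the example, not the actual tunables.

/*
 * Toy sketch of the hard/soft memory-limit hysteresis described in the
 * comment above.  Thresholds and names are invented; not ZFS code.
 */
#include <stdio.h>
#include <stdint.h>

#define	TOY_HARD_LIMIT	(512ULL * 1024 * 1024)	/* start issuing ("clearing") */
#define	TOY_SOFT_LIMIT	(128ULL * 1024 * 1024)	/* stop issuing, resume gathering */

/*
 * Given current queue memory use and whether we are already clearing,
 * decide whether the next pass should issue I/O or keep gathering metadata.
 */
static int
toy_should_clear(uint64_t mem_used, int clearing)
{
	if (!clearing && mem_used >= TOY_HARD_LIMIT)
		return (1);
	if (clearing && mem_used > TOY_SOFT_LIMIT)
		return (1);	/* keep draining until we drop below the soft limit */
	return (0);
}

int
main(void)
{
	uint64_t mem = 600ULL * 1024 * 1024;
	int clearing = 0;

	clearing = toy_should_clear(mem, clearing);	/* 1: crossed the hard limit */
	mem = 200ULL * 1024 * 1024;
	clearing = toy_should_clear(mem, clearing);	/* still 1: above the soft limit */
	mem = 100ULL * 1024 * 1024;
	clearing = toy_should_clear(mem, clearing);	/* 0: fell below the soft limit */
	printf("clearing=%d\n", clearing);
	return (0);
}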
1842 scn->scn_zio_root = zio_root(dp->dp_spa, NULL, 1843 NULL, ZIO_FLAG_CANFAIL); 1844 dsl_pool_config_enter(dp, FTAG); 1845 dsl_scan_visit(scn, tx); 1846 dsl_pool_config_exit(dp, FTAG); 1847 (void) zio_wait(scn->scn_zio_root); 1848 scn->scn_zio_root = NULL;
| 3274 if (!scn->scn_clearing && scn->scn_done_txg == 0) { 3275 /* Need to scan metadata for more blocks to scrub */ 3276 dsl_scan_phys_t *scnp = &scn->scn_phys; 3277 taskqid_t prefetch_tqid; 3278 uint64_t bytes_per_leaf = zfs_scan_vdev_limit; 3279 uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);
|
1849
| 3280
|
1850 zfs_dbgmsg("visited %llu blocks in %llums", 1851 (longlong_t)scn->scn_visited_this_txg, 1852 (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
		/*
		 * Calculate the max number of in-flight bytes for pool-wide
		 * scanning operations (minimum 1MB). Limits for the issuing
		 * phase are done per top-level vdev and are handled
		 * separately.
		 */
		scn->scn_maxinflight_bytes =
		    MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
1853
| 3288
|
1854 if (!scn->scn_suspending) { 1855 scn->scn_done_txg = tx->tx_txg + 1; 1856 zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu", 1857 tx->tx_txg, scn->scn_done_txg); 1858 } 1859 1860 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { 1861 mutex_enter(&spa->spa_scrub_lock); 1862 while (spa->spa_scrub_inflight > 0) { 1863 cv_wait(&spa->spa_scrub_io_cv, 1864 &spa->spa_scrub_lock);
		if (scnp->scn_ddt_bookmark.ddb_class <=
		    scnp->scn_ddt_class_max) {
			ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
			zfs_dbgmsg("doing scan sync txg %llu; "
			    "ddt bm=%llu/%llu/%llu/%llx",
			    (longlong_t)tx->tx_txg,
			    (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
			    (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
			    (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
			    (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
		} else {
			zfs_dbgmsg("doing scan sync txg %llu; "
			    "bm=%llu/%llu/%llu/%llu",
			    (longlong_t)tx->tx_txg,
			    (longlong_t)scnp->scn_bookmark.zb_objset,
			    (longlong_t)scnp->scn_bookmark.zb_object,
			    (longlong_t)scnp->scn_bookmark.zb_level,
			    (longlong_t)scnp->scn_bookmark.zb_blkid);
1865 }
		}
1866 mutex_exit(&spa->spa_scrub_lock); 1867 }
1868
| 3308
|
1869 dsl_scan_sync_state(scn, tx); 1870}
		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
		    NULL, ZIO_FLAG_CANFAIL);
1871
| 3311
|
1872/* 1873 * This will start a new scan, or restart an existing one. 1874 */ 1875void 1876dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) 1877{ 1878 if (txg == 0) { 1879 dmu_tx_t *tx; 1880 tx = dmu_tx_create_dd(dp->dp_mos_dir); 1881 VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
		scn->scn_prefetch_stop = B_FALSE;
		prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
		    dsl_scan_prefetch_thread, scn, TQ_SLEEP);
		ASSERT(prefetch_tqid != TASKQID_INVALID);
1882
| 3316
|
1883 txg = dmu_tx_get_txg(tx); 1884 dp->dp_scan->scn_restart_txg = txg; 1885 dmu_tx_commit(tx); 1886 } else { 1887 dp->dp_scan->scn_restart_txg = txg;
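		/*
		 * The prefetch thread dispatched above runs alongside the
		 * traversal below; once dsl_scan_visit() returns, it is
		 * signalled to stop and then waited for.
		 */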
		dsl_pool_config_enter(dp, FTAG);
		dsl_scan_visit(scn, tx);
		dsl_pool_config_exit(dp, FTAG);

		mutex_enter(&dp->dp_spa->spa_scrub_lock);
		scn->scn_prefetch_stop = B_TRUE;
		cv_broadcast(&spa->spa_scrub_io_cv);
		mutex_exit(&dp->dp_spa->spa_scrub_lock);

		taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
		(void) zio_wait(scn->scn_zio_root);
		scn->scn_zio_root = NULL;

		zfs_dbgmsg("scan visited %llu blocks in %llums "
		    "(%llu os's, %llu holes, %llu < mintxg, "
		    "%llu in ddt, %llu > maxtxg)",
		    (longlong_t)scn->scn_visited_this_txg,
		    (longlong_t)NSEC2MSEC(gethrtime() -
		    scn->scn_sync_start_time),
		    (longlong_t)scn->scn_objsets_visited_this_txg,
		    (longlong_t)scn->scn_holes_this_txg,
		    (longlong_t)scn->scn_lt_min_this_txg,
		    (longlong_t)scn->scn_ddt_contained_this_txg,
		    (longlong_t)scn->scn_gt_max_this_txg);

		if (!scn->scn_suspending) {
			ASSERT0(avl_numnodes(&scn->scn_queue));
			scn->scn_done_txg = tx->tx_txg + 1;
			if (scn->scn_is_sorted) {
				scn->scn_checkpointing = B_TRUE;
				scn->scn_clearing = B_TRUE;
			}
			zfs_dbgmsg("scan complete txg %llu",
			    (longlong_t)tx->tx_txg);
		}
	} else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
		/* need to issue scrubbing IOs from per-vdev queues */
		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
		    NULL, ZIO_FLAG_CANFAIL);
		scan_io_queues_run(scn);
		(void) zio_wait(scn->scn_zio_root);
		scn->scn_zio_root = NULL;

		/* calculate and dprintf the current memory usage */
		(void) dsl_scan_should_clear(scn);
		dsl_scan_update_stats(scn);

		zfs_dbgmsg("scrubbed %llu blocks (%llu segs) in %llums "
		    "(avg_block_size = %llu, avg_seg_size = %llu)",
		    (longlong_t)scn->scn_zios_this_txg,
		    (longlong_t)scn->scn_segs_this_txg,
		    (longlong_t)NSEC2MSEC(gethrtime() -
		    scn->scn_sync_start_time),
		    (longlong_t)scn->scn_avg_zio_size_this_txg,
		    (longlong_t)scn->scn_avg_seg_size_this_txg);
	} else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
		/* Finished with everything. Mark the scrub as complete */
		zfs_dbgmsg("scan issuing complete txg %llu",
		    (longlong_t)tx->tx_txg);
		ASSERT3U(scn->scn_done_txg, !=, 0);
		ASSERT0(spa->spa_scrub_inflight);
		ASSERT0(scn->scn_bytes_pending);
		dsl_scan_done(scn, B_TRUE, tx);
		sync_type = SYNC_MANDATORY;
1888 }
	}
1889 zfs_dbgmsg("restarting resilver txg=%llu", txg); 1890}
1891
| 3382
|
1892boolean_t 1893dsl_scan_resilvering(dsl_pool_t *dp) 1894{ 1895 return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && 1896 dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
	dsl_scan_sync_state(scn, tx, sync_type);
1897} 1898
}

1899/* 1900 * scrub consumers 1901 */ 1902
1903static void
| 3386static void
|
1904count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
1905{ 1906 int i; 1907
{
	int i;

	/* update the spa's stats on how many bytes we have issued */
	for (i = 0; i < BP_GET_NDVAS(bp); i++) {
		atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
		    DVA_GET_ASIZE(&bp->blk_dva[i]));
	}

1908 /* 1909 * If we resume after a reboot, zab will be NULL; don't record 1910 * incomplete stats in that case. 1911 */ 1912 if (zab == NULL) 1913 return; 1914
	/*
	 * If we resume after a reboot, zab will be NULL; don't record
	 * incomplete stats in that case.
	 */
	if (zab == NULL)
		return;

	mutex_enter(&zab->zab_lock);

1915 for (i = 0; i < 4; i++) { 1916 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; 1917 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; 1918 if (t & DMU_OT_NEWTYPE) 1919 t = DMU_OT_OTHER; 1920 zfs_blkstat_t *zb = &zab->zab_type[l][t]; 1921 int equal; 1922 1923 zb->zb_count++; 1924 zb->zb_asize += BP_GET_ASIZE(bp); 1925 zb->zb_lsize += BP_GET_LSIZE(bp); 1926 zb->zb_psize += BP_GET_PSIZE(bp); 1927 zb->zb_gangs += BP_COUNT_GANG(bp); 1928 1929 switch (BP_GET_NDVAS(bp)) { 1930 case 2: 1931 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 1932 DVA_GET_VDEV(&bp->blk_dva[1])) 1933 zb->zb_ditto_2_of_2_samevdev++; 1934 break; 1935 case 3: 1936 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == 1937 DVA_GET_VDEV(&bp->blk_dva[1])) + 1938 (DVA_GET_VDEV(&bp->blk_dva[0]) == 1939 DVA_GET_VDEV(&bp->blk_dva[2])) + 1940 (DVA_GET_VDEV(&bp->blk_dva[1]) == 1941 DVA_GET_VDEV(&bp->blk_dva[2])); 1942 if (equal == 1) 1943 zb->zb_ditto_2_of_3_samevdev++; 1944 else if (equal == 3) 1945 zb->zb_ditto_3_of_3_samevdev++; 1946 break; 1947 } 1948 }
| 3406 for (i = 0; i < 4; i++) { 3407 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; 3408 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; 3409 if (t & DMU_OT_NEWTYPE) 3410 t = DMU_OT_OTHER; 3411 zfs_blkstat_t *zb = &zab->zab_type[l][t]; 3412 int equal; 3413 3414 zb->zb_count++; 3415 zb->zb_asize += BP_GET_ASIZE(bp); 3416 zb->zb_lsize += BP_GET_LSIZE(bp); 3417 zb->zb_psize += BP_GET_PSIZE(bp); 3418 zb->zb_gangs += BP_COUNT_GANG(bp); 3419 3420 switch (BP_GET_NDVAS(bp)) { 3421 case 2: 3422 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 3423 DVA_GET_VDEV(&bp->blk_dva[1])) 3424 zb->zb_ditto_2_of_2_samevdev++; 3425 break; 3426 case 3: 3427 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == 3428 DVA_GET_VDEV(&bp->blk_dva[1])) + 3429 (DVA_GET_VDEV(&bp->blk_dva[0]) == 3430 DVA_GET_VDEV(&bp->blk_dva[2])) + 3431 (DVA_GET_VDEV(&bp->blk_dva[1]) == 3432 DVA_GET_VDEV(&bp->blk_dva[2])); 3433 if (equal == 1) 3434 zb->zb_ditto_2_of_3_samevdev++; 3435 else if (equal == 3) 3436 zb->zb_ditto_3_of_3_samevdev++; 3437 break; 3438 } 3439 }
|

	mutex_exit(&zab->zab_lock);
1949} 1950 1951static void
}

static void
1952dsl_scan_scrub_done(zio_t *zio)
| 3445scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
|
1953{
| 3446{
|
1954 spa_t *spa = zio->io_spa;
| 3447 avl_index_t idx; 3448 int64_t asize = sio->sio_asize; 3449 dsl_scan_t *scn = queue->q_scn;
|
1955
| 3450
|
1956 abd_free(zio->io_abd);
| 3451 ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
|
1957
| 3452
|
1958 mutex_enter(&spa->spa_scrub_lock); 1959 spa->spa_scrub_inflight--; 1960 cv_broadcast(&spa->spa_scrub_io_cv);
	if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
		/* block is already scheduled for reading */
		atomic_add_64(&scn->scn_bytes_pending, -asize);
		kmem_free(sio, sizeof (*sio));
		return;
	}
	avl_insert(&queue->q_sios_by_addr, sio, idx);
	range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
}
1961
| 3462
|
1962 if (zio->io_error && (zio->io_error != ECKSUM || 1963 !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { 1964 spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
/*
 * Given all the info we got from our metadata scanning process, we
 * construct a scan_io_t and insert it into the scan sorting queue. The
 * I/O must already be suitable for us to process. This is controlled
 * by dsl_scan_enqueue().
 */
static void
scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
    int zio_flags, const zbookmark_phys_t *zb)
{
	dsl_scan_t *scn = queue->q_scn;
	scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP);

	ASSERT0(BP_IS_GANG(bp));
	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));

	bp2sio(bp, sio, dva_i);
	sio->sio_flags = zio_flags;
	sio->sio_zb = *zb;

	/*
	 * Increment the bytes pending counter now so that we can't
	 * get an integer underflow in case the worker processes the
	 * zio before we get to incrementing this counter.
	 */
	atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);

	scan_io_queue_insert_impl(queue, sio);
}

/*
 * Given a set of I/O parameters as discovered by the metadata traversal
 * process, attempts to place the I/O into the sorted queues (if allowed),
 * or immediately executes the I/O.
 */
static void
dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
    const zbookmark_phys_t *zb)
{
	spa_t *spa = dp->dp_spa;

	ASSERT(!BP_IS_EMBEDDED(bp));

	/*
	 * Gang blocks are hard to issue sequentially, so we just issue them
	 * here immediately instead of queuing them.
	 */
	if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
		scan_exec_io(dp, bp, zio_flags, zb, NULL);
		return;
1965 }
	}
1966 mutex_exit(&spa->spa_scrub_lock);
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		dva_t dva;
		vdev_t *vdev;

		dva = bp->blk_dva[i];
		vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
		ASSERT(vdev != NULL);

		mutex_enter(&vdev->vdev_scan_io_queue_lock);
		if (vdev->vdev_scan_io_queue == NULL)
			vdev->vdev_scan_io_queue = scan_io_queue_create(vdev);
		ASSERT(dp->dp_scan != NULL);
		scan_io_queue_insert(vdev->vdev_scan_io_queue, bp,
		    i, zio_flags, zb);
		mutex_exit(&vdev->vdev_scan_io_queue_lock);
	}
1967} 1968 1969static int 1970dsl_scan_scrub_cb(dsl_pool_t *dp, 1971 const blkptr_t *bp, const zbookmark_phys_t *zb) 1972{ 1973 dsl_scan_t *scn = dp->dp_scan;
}

static int
dsl_scan_scrub_cb(dsl_pool_t *dp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	dsl_scan_t *scn = dp->dp_scan;
1974 size_t size = BP_GET_PSIZE(bp);
1975 spa_t *spa = dp->dp_spa; 1976 uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
| 3537 spa_t *spa = dp->dp_spa; 3538 uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
|
| 3539 size_t psize = BP_GET_PSIZE(bp);
|
1977 boolean_t needs_io; 1978 int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
| 3540 boolean_t needs_io; 3541 int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
|
1979 unsigned int scan_delay = 0; 1980
| 3542 int d; 3543
|
1981 if (phys_birth <= scn->scn_phys.scn_min_txg || 1982 phys_birth >= scn->scn_phys.scn_max_txg) 1983 return (0); 1984
| 3544 if (phys_birth <= scn->scn_phys.scn_min_txg || 3545 phys_birth >= scn->scn_phys.scn_max_txg) 3546 return (0); 3547
|
1985 count_block(dp->dp_blkstats, bp); 1986 1987 if (BP_IS_EMBEDDED(bp))
| 3548 if (BP_IS_EMBEDDED(bp)) { 3549 count_block(scn, dp->dp_blkstats, bp);
|
1988 return (0);
| 3550 return (0);
|
| 3551 }
|
1989 1990 ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); 1991 if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { 1992 zio_flags |= ZIO_FLAG_SCRUB; 1993 needs_io = B_TRUE;
| 3552 3553 ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); 3554 if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { 3555 zio_flags |= ZIO_FLAG_SCRUB; 3556 needs_io = B_TRUE;
|
1994 scan_delay = zfs_scrub_delay;
1995 } else { 1996 ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); 1997 zio_flags |= ZIO_FLAG_RESILVER; 1998 needs_io = B_FALSE;
| 3557 } else { 3558 ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); 3559 zio_flags |= ZIO_FLAG_RESILVER; 3560 needs_io = B_FALSE;
|
1999 scan_delay = zfs_resilver_delay;
2000 } 2001 2002 /* If it's an intent log block, failure is expected. */ 2003 if (zb->zb_level == ZB_ZIL_LEVEL) 2004 zio_flags |= ZIO_FLAG_SPECULATIVE; 2005
| 3561 } 3562 3563 /* If it's an intent log block, failure is expected. */ 3564 if (zb->zb_level == ZB_ZIL_LEVEL) 3565 zio_flags |= ZIO_FLAG_SPECULATIVE; 3566
|
2006 for (int d = 0; d < BP_GET_NDVAS(bp); d++) { 2007 vdev_t *vd = vdev_lookup_top(spa, 2008 DVA_GET_VDEV(&bp->blk_dva[d]));
| 3567 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 3568 const dva_t *dva = &bp->blk_dva[d];
|
2009 2010 /* 2011 * Keep track of how much data we've examined so that 2012 * zpool(1M) status can make useful progress reports. 2013 */
| 3569 3570 /* 3571 * Keep track of how much data we've examined so that 3572 * zpool(1M) status can make useful progress reports. 3573 */
|
2014 scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); 2015 spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
| 3574 scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); 3575 spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
|
2016 2017 /* if it's a resilver, this may not be in the target range */
| 3576 3577 /* if it's a resilver, this may not be in the target range */
|
2018 if (!needs_io) { 2019 if (DVA_GET_GANG(&bp->blk_dva[d])) { 2020 /* 2021 * Gang members may be spread across multiple 2022 * vdevs, so the best estimate we have is the 2023 * scrub range, which has already been checked. 2024 * XXX -- it would be better to change our 2025 * allocation policy to ensure that all 2026 * gang members reside on the same vdev. 2027 */ 2028 needs_io = B_TRUE; 2029 } else { 2030 needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, 2031 phys_birth, 1); 2032 } 2033 }
		if (!needs_io)
			needs_io = dsl_scan_need_resilver(spa, dva, psize,
			    phys_birth);
2034 } 2035 2036 if (needs_io && !zfs_no_scrub_io) {
| 3581 } 3582 3583 if (needs_io && !zfs_no_scrub_io) {
|
2037 vdev_t *rvd = spa->spa_root_vdev; 2038 uint64_t maxinflight = rvd->vdev_children * 2039 MAX(zfs_top_maxinflight, 1);
		dsl_scan_enqueue(dp, bp, zio_flags, zb);
	} else {
		count_block(scn, dp->dp_blkstats, bp);
	}
2040
| 3588
|
	/* do not relocate this block */
	return (0);
}

static void
dsl_scan_scrub_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	dsl_scan_io_queue_t *queue = zio->io_private;

	abd_free(zio->io_abd);

	if (queue == NULL) {
2041 mutex_enter(&spa->spa_scrub_lock);
| 3603 mutex_enter(&spa->spa_scrub_lock);
|
2042 while (spa->spa_scrub_inflight >= maxinflight)
		ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
		spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
		cv_broadcast(&spa->spa_scrub_io_cv);
		mutex_exit(&spa->spa_scrub_lock);
	} else {
		mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
		ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
		queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
		cv_broadcast(&queue->q_zio_cv);
		mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
	}

	if (zio->io_error && (zio->io_error != ECKSUM ||
	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
		atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
	}
}

/*
 * Given a scanning zio's information, executes the zio. The zio need
 * not necessarily be sortable; this function simply executes the zio,
 * no matter what it is. The optional queue argument allows the caller
 * to specify that they want per top level vdev IO rate limiting
 * instead of the legacy global limiting.
 */
static void
scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
    const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
{
	spa_t *spa = dp->dp_spa;
	dsl_scan_t *scn = dp->dp_scan;
	size_t size = BP_GET_PSIZE(bp);
	abd_t *data = abd_alloc_for_io(size, B_FALSE);
	unsigned int scan_delay = 0;

	if (queue == NULL) {
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
2043 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
| 3642 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
|
2044 spa->spa_scrub_inflight++;
| 3643 spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
|
2045 mutex_exit(&spa->spa_scrub_lock);
| 3644 mutex_exit(&spa->spa_scrub_lock);
|
| 3645 } else { 3646 kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
|
2046
| 3647
|
2047 /* 2048 * If we're seeing recent (zfs_scan_idle) "important" I/Os 2049 * then throttle our workload to limit the impact of a scan. 2050 */ 2051 if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) 2052 delay(MAX((int)scan_delay, 0));
		mutex_enter(q_lock);
		while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
			cv_wait(&queue->q_zio_cv, q_lock);
		queue->q_inflight_bytes += BP_GET_PSIZE(bp);
		mutex_exit(q_lock);
	}
2053
| 3654
|
2054 zio_nowait(zio_read(NULL, spa, bp, 2055 abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done, 2056 NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
	if (zio_flags & ZIO_FLAG_RESILVER) {
		scan_delay = zfs_resilver_delay;
	} else {
		ASSERT(zio_flags & ZIO_FLAG_SCRUB);
		scan_delay = zfs_scrub_delay;
2057 } 2058
| 3660 } 3661
|
2059 /* do not relocate this block */ 2060 return (0);
	if (scan_delay &&
	    (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle))
		delay(MAX((int)scan_delay, 0));

	count_block(dp->dp_scan, dp->dp_blkstats, bp);
	zio_nowait(zio_read(dp->dp_scan->scn_zio_root, spa, bp, data, size,
	    dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
2061} 2062 2063/*
}

/*
2064 * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. 2065 * Can also be called to resume a paused scrub.
 * This is the primary extent sorting algorithm. We balance two parameters:
 * 1) how many bytes of I/O are in an extent
 * 2) how well the extent is filled with I/O (as a fraction of its total size)
 * Since we allow extents to have gaps between their constituent I/Os, it's
 * possible to have a fairly large extent that contains the same amount of
 * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
 * The algorithm sorts based on a score calculated from the extent's size,
 * the relative fill volume (in %) and a "fill weight" parameter that controls
 * the split between whether we prefer larger extents or more well populated
 * extents:
 *
 * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
 *
 * Example:
 * 1) assume extsz = 64 MiB
 * 2) assume fill = 32 MiB (extent is half full)
 * 3) assume fill_weight = 3
 * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
 *    SCORE = 32M + (50 * 3 * 32M) / 100
 *    SCORE = 32M + (4800M / 100)
 *    SCORE = 32M + 48M
 *             ^     ^
 *             |     +--- final total relative fill-based score
 *             +--------- final total fill-based score
 *    SCORE = 80M
 *
 * As can be seen, at fill_weight=3, the algorithm is slightly biased towards
 * extents that are more completely filled (in a 3:2 ratio) vs just larger.
 * Note that as an optimization, we replace multiplication and division by
 * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
2066 */
 */
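
/*
 * For illustration only: a hypothetical standalone helper (not part of
 * this file's interfaces) showing how the score above can be computed
 * with the shift-by-7 approximation of multiplying and dividing by 100.
 * The real comparator, ext_size_compare() below, applies the same
 * arithmetic to a range_seg_t's fill and size. With ext_size = 64 MiB,
 * fill = 32 MiB and fill_weight = 3 this returns 80 MiB, matching the
 * worked example.
 */
static uint64_t
example_ext_score(uint64_t ext_size, uint64_t fill, uint64_t fill_weight)
{
	/*
	 * fill + (fill-fraction-of-extent * fill * fill_weight), with the
	 * multiply/divide by 100 approximated by << 7 and >> 7.
	 */
	return (fill + ((((fill << 7) / ext_size) * fill_weight * fill) >> 7));
}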
2067int 2068dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
| 3702static int 3703ext_size_compare(const void *x, const void *y)
|
2069{
| 3704{
|
2070 spa_t *spa = dp->dp_spa;
| 3705 const range_seg_t *rsa = x, *rsb = y; 3706 uint64_t sa = rsa->rs_end - rsa->rs_start, 3707 sb = rsb->rs_end - rsb->rs_start; 3708 uint64_t score_a, score_b; 3709 3710 score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * 3711 fill_weight * rsa->rs_fill) >> 7); 3712 score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) * 3713 fill_weight * rsb->rs_fill) >> 7); 3714 3715 if (score_a > score_b) 3716 return (-1); 3717 if (score_a == score_b) { 3718 if (rsa->rs_start < rsb->rs_start) 3719 return (-1); 3720 if (rsa->rs_start == rsb->rs_start) 3721 return (0); 3722 return (1); 3723 } 3724 return (1); 3725} 3726 3727/* 3728 * Comparator for the q_sios_by_addr tree. Sorting is simply performed 3729 * based on LBA-order (from lowest to highest). 3730 */ 3731static int 3732io_addr_compare(const void *x, const void *y) 3733{ 3734 const scan_io_t *a = x, *b = y; 3735 3736 if (a->sio_offset < b->sio_offset) 3737 return (-1); 3738 if (a->sio_offset == b->sio_offset) 3739 return (0); 3740 return (1); 3741} 3742 3743/* IO queues are created on demand when they are needed. */ 3744static dsl_scan_io_queue_t * 3745scan_io_queue_create(vdev_t *vd) 3746{ 3747 dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; 3748 dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); 3749 3750 q->q_scn = scn; 3751 q->q_vd = vd; 3752 cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); 3753 q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, 3754 &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); 3755 avl_create(&q->q_sios_by_addr, io_addr_compare, 3756 sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); 3757 3758 return (q); 3759} 3760 3761/* 3762 * Destroys a scan queue and all segments and scan_io_t's contained in it. 3763 * No further execution of I/O occurs, anything pending in the queue is 3764 * simply freed without being executed. 3765 */ 3766void 3767dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) 3768{ 3769 dsl_scan_t *scn = queue->q_scn; 3770 scan_io_t *sio; 3771 void *cookie = NULL; 3772 int64_t bytes_dequeued = 0; 3773 3774 ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); 3775 3776 while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != 3777 NULL) { 3778 ASSERT(range_tree_contains(queue->q_exts_by_addr, 3779 sio->sio_offset, sio->sio_asize)); 3780 bytes_dequeued += sio->sio_asize; 3781 kmem_free(sio, sizeof (*sio)); 3782 } 3783 3784 atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); 3785 range_tree_vacate(queue->q_exts_by_addr, NULL, queue); 3786 range_tree_destroy(queue->q_exts_by_addr); 3787 avl_destroy(&queue->q_sios_by_addr); 3788 cv_destroy(&queue->q_zio_cv); 3789 3790 kmem_free(queue, sizeof (*queue)); 3791} 3792 3793/* 3794 * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is 3795 * called on behalf of vdev_top_transfer when creating or destroying 3796 * a mirror vdev due to zpool attach/detach. 
3797 */ 3798void 3799dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) 3800{ 3801 mutex_enter(&svd->vdev_scan_io_queue_lock); 3802 mutex_enter(&tvd->vdev_scan_io_queue_lock); 3803 3804 VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); 3805 tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue; 3806 svd->vdev_scan_io_queue = NULL; 3807 if (tvd->vdev_scan_io_queue != NULL) 3808 tvd->vdev_scan_io_queue->q_vd = tvd; 3809 3810 mutex_exit(&tvd->vdev_scan_io_queue_lock); 3811 mutex_exit(&svd->vdev_scan_io_queue_lock); 3812} 3813 3814static void 3815scan_io_queues_destroy(dsl_scan_t *scn) 3816{ 3817 vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; 3818 3819 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 3820 vdev_t *tvd = rvd->vdev_child[i]; 3821 3822 mutex_enter(&tvd->vdev_scan_io_queue_lock); 3823 if (tvd->vdev_scan_io_queue != NULL) 3824 dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue); 3825 tvd->vdev_scan_io_queue = NULL; 3826 mutex_exit(&tvd->vdev_scan_io_queue_lock); 3827 } 3828} 3829 3830static void 3831dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) 3832{ 3833 dsl_pool_t *dp = spa->spa_dsl_pool;
|
2071 dsl_scan_t *scn = dp->dp_scan;
| 3834 dsl_scan_t *scn = dp->dp_scan;
|
| 3835 vdev_t *vdev; 3836 kmutex_t *q_lock; 3837 dsl_scan_io_queue_t *queue; 3838 scan_io_t srch, *sio; 3839 avl_index_t idx; 3840 uint64_t start, size;
|
2072
| 3841
|
| 3842 vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i])); 3843 ASSERT(vdev != NULL); 3844 q_lock = &vdev->vdev_scan_io_queue_lock; 3845 queue = vdev->vdev_scan_io_queue; 3846 3847 mutex_enter(q_lock); 3848 if (queue == NULL) { 3849 mutex_exit(q_lock); 3850 return; 3851 } 3852 3853 bp2sio(bp, &srch, dva_i); 3854 start = srch.sio_offset; 3855 size = srch.sio_asize; 3856
|
2073 /*
| 3857 /*
|
2074 * Purge all vdev caches and probe all devices. We do this here 2075 * rather than in sync context because this requires a writer lock 2076 * on the spa_config lock, which we can't do from sync context. The 2077 * spa_scrub_reopen flag indicates that vdev_open() should not 2078 * attempt to start another scrub.
	 * We can find the zio in two states:
	 * 1) Cold, just sitting in the queue of zio's to be issued at
	 *	some point in the future. In this case, all we do is
	 *	remove the zio from the q_sios_by_addr tree, decrement
	 *	its data volume from the containing range_seg_t and
	 *	resort the q_exts_by_size tree to reflect that the
	 *	range_seg_t has lost some of its 'fill'. We don't shorten
	 *	the range_seg_t - this is usually rare enough not to be
	 *	worth the extra hassle of trying to keep track of precise
	 *	extent boundaries.
	 * 2) Hot, where the zio is currently in-flight in
	 *	dsl_scan_issue_ios. In this case, we can't simply
	 *	reach in and stop the in-flight zio's, so we instead
	 *	block the caller. Eventually, dsl_scan_issue_ios will
	 *	be done with issuing the zio's it gathered and will
	 *	signal us.
2079 */
| 3874 */
|
2080 spa_vdev_state_enter(spa, SCL_NONE); 2081 spa->spa_scrub_reopen = B_TRUE; 2082 vdev_reopen(spa->spa_root_vdev); 2083 spa->spa_scrub_reopen = B_FALSE; 2084 (void) spa_vdev_state_exit(spa, NULL, 0);
| 3875 sio = avl_find(&queue->q_sios_by_addr, &srch, &idx); 3876 if (sio != NULL) { 3877 int64_t asize = sio->sio_asize; 3878 blkptr_t tmpbp;
|
2085
| 3879
|
2086 if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { 2087 /* got scrub start cmd, resume paused scrub */ 2088 int err = dsl_scrub_set_pause_resume(scn->scn_dp, 2089 POOL_SCRUB_NORMAL); 2090 if (err == 0) { 2091 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); 2092 return (ECANCELED); 2093 }
| 3880 /* Got it while it was cold in the queue */ 3881 ASSERT3U(start, ==, sio->sio_offset); 3882 ASSERT3U(size, ==, asize); 3883 avl_remove(&queue->q_sios_by_addr, sio);
|
2094
| 3884
|
2095 return (SET_ERROR(err)); 2096 }
| 3885 ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); 3886 range_tree_remove_fill(queue->q_exts_by_addr, start, size);
|
2097
| 3887
|
2098 return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, 2099 dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
| 3888 /* 3889 * We only update scn_bytes_pending in the cold path, 3890 * otherwise it will already have been accounted for as 3891 * part of the zio's execution. 3892 */ 3893 atomic_add_64(&scn->scn_bytes_pending, -asize); 3894 3895 /* count the block as though we issued it */ 3896 sio2bp(sio, &tmpbp, dva_i); 3897 count_block(scn, dp->dp_blkstats, &tmpbp); 3898 3899 kmem_free(sio, sizeof (*sio)); 3900 } 3901 mutex_exit(q_lock);
|
2100} 2101
| 3902} 3903
|
2102static boolean_t 2103dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
/*
 * Callback invoked when a zio_free() zio is executing. This needs to be
 * intercepted to prevent the zio from deallocating a particular portion
 * of disk space and it then getting reallocated and written to, while we
 * still have it queued up for processing.
 */
void
dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
2104{
| 3912{
|
2105 return (scn->scn_restart_txg != 0 && 2106 scn->scn_restart_txg <= tx->tx_txg);
	dsl_pool_t *dp = spa->spa_dsl_pool;
	dsl_scan_t *scn = dp->dp_scan;

	ASSERT(!BP_IS_EMBEDDED(bp));
	ASSERT(scn != NULL);
	if (!dsl_scan_is_running(scn))
		return;

	for (int i = 0; i < BP_GET_NDVAS(bp); i++)
		dsl_scan_freed_dva(spa, bp, i);
2107}
}
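
/*
 * For illustration only: a hypothetical sketch (this function does not
 * exist in ZFS) of how the free path is expected to use the callback
 * above, purging any queued scan I/O for a block before it is actually
 * freed and its space becomes eligible for reallocation.
 */
static void
example_free_block(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	/* drop any scan I/O still queued against this block's DVAs */
	dsl_scan_freed(spa, bp);
	/* then let the normal free path proceed */
	zio_free(spa, txg, bp);
}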