dsl_pool.c revision 325911
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_scan.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dsl_userhold.h>

#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif

/*
 * ZFS Write Throttle
 * ------------------
 *
 * ZFS must limit the rate of incoming writes to the rate at which it is able
 * to sync data modifications to the backend storage. Throttling by too much
 * creates an artificial limit; throttling by too little can only be sustained
 * for short periods and would lead to highly lumpy performance. On a per-pool
 * basis, ZFS tracks the amount of modified (dirty) data. As operations change
 * data, the amount of dirty data increases; as ZFS syncs out data, the amount
 * of dirty data decreases. When the amount of dirty data exceeds a
 * predetermined threshold further modifications are blocked until the amount
 * of dirty data decreases (as data is synced out).
 *
 * The limit on dirty data is tunable, and should be adjusted according to
 * both the IO capacity and available memory of the system. The larger the
 * window, the more ZFS is able to aggregate and amortize metadata (and data)
 * changes. However, memory is a limited resource, and allowing for more dirty
 * data comes at the cost of keeping other useful data in memory (for example
 * ZFS data cached by the ARC).
 *
 * Implementation
 *
 * As buffers are modified dsl_pool_dirty_space() increments both the per-
 * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
 * dirty space used; dsl_pool_undirty_space() decrements those values as data
 * is synced out from dsl_pool_sync(). While only the poolwide value is
 * relevant, the per-txg value is useful for debugging. The tunable
 * zfs_dirty_data_max determines the dirty space limit. Once that value is
 * exceeded, new writes are halted until space frees up.
 *
 * The zfs_dirty_data_sync tunable dictates the threshold at which we
 * ensure that there is a txg syncing (see the comment in txg.c for a full
 * description of transaction group stages).
 *
 * The IO scheduler uses both the dirty space limit and current amount of
 * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
 * issues. See the comment in vdev_queue.c for details of the IO scheduler.
 *
 * The delay is also calculated based on the amount of dirty data. See the
 * comment above dmu_tx_delay() for details.
 */

/*
 * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
 * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system.
 */
uint64_t zfs_dirty_data_max;
uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
int zfs_dirty_data_max_percent = 10;

/*
 * If there is at least this much dirty data, push out a txg.
 */
uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;

/*
 * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
 * and delay each transaction.
 * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
 */
int zfs_delay_min_dirty_percent = 60;

/*
 * This controls how quickly the delay approaches infinity.
 * Larger values cause it to delay more for a given amount of dirty data.
 * Therefore larger values will cause there to be less dirty data for a
 * given throughput.
 *
 * For the smoothest delay, this value should be about 1 billion divided
 * by the maximum number of operations per second. This will smoothly
 * handle between 10x and 1/10th this number.
 *
 * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
 * multiply in dmu_tx_delay().
 */
uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;

/*
 * This determines the number of threads used by the dp_sync_taskq.
 */
int zfs_sync_taskq_batch_pct = 75;

/*
 * These tunables determine the behavior of how zil_itxg_clean() is
 * called via zil_clean() in the context of spa_sync(). When an itxg
 * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
 * If the dispatch fails, the call to zil_itxg_clean() will occur
 * synchronously in the context of spa_sync(), which can negatively
 * impact the performance of spa_sync() (e.g. in the case of the itxg
 * list having a large number of itxs that needs to be cleaned).
 *
 * Thus, these tunables can be used to manipulate the behavior of the
 * taskq used by zil_clean(); they determine the number of taskq entries
 * that are pre-populated when the taskq is first created (via the
 * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
 * taskq entries that are cached after an on-demand allocation (via the
 * "zfs_zil_clean_taskq_maxalloc").
 *
 * The idea being, we want to try reasonably hard to ensure there will
 * already be a taskq entry pre-allocated by the time that it is needed
 * by zil_clean(). This way, we can avoid the possibility of an
 * on-demand allocation of a new taskq entry from failing, which would
 * result in zil_itxg_clean() being called synchronously from zil_clean()
 * (which can adversely affect performance of spa_sync()).
 *
 * Additionally, the number of threads used by the taskq can be
 * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
 */
int zfs_zil_clean_taskq_nthr_pct = 100;
int zfs_zil_clean_taskq_minalloc = 1024;
int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;

#if defined(__FreeBSD__) && defined(_KERNEL)

extern int zfs_vdev_async_write_active_max_dirty_percent;

SYSCTL_DECL(_vfs_zfs);

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
    &zfs_dirty_data_max, 0,
    "The maximum amount of dirty data in bytes after which new writes are "
    "halted until space becomes available");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
    &zfs_dirty_data_max_max, 0,
    "The absolute cap on dirty_data_max when auto calculating");

static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
    sysctl_zfs_dirty_data_max_percent, "I",
    "The percent of physical memory used to auto calculate dirty_data_max");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
    &zfs_dirty_data_sync, 0,
    "Force a txg if the number of dirty buffer bytes exceed this value");

static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
    sysctl_zfs_delay_min_dirty_percent, "I",
    "The limit of outstanding dirty data before transactions are delayed");

static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
/* No zfs_delay_scale tunable due to limit requirements */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_zfs_delay_scale, "QU",
    "Controls how quickly the delay approaches infinity");

static int
sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
{
	int val, err;

	val = zfs_dirty_data_max_percent;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < 0 || val > 100)
		return (EINVAL);

	zfs_dirty_data_max_percent = val;

	return (0);
}

static int
sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
{
	int val, err;

	val = zfs_delay_min_dirty_percent;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < zfs_vdev_async_write_active_max_dirty_percent)
		return (EINVAL);

	zfs_delay_min_dirty_percent = val;

	return (0);
}

static int
sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_delay_scale;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val > UINT64_MAX / zfs_dirty_data_max)
		return (EINVAL);

	zfs_delay_scale = val;

	return (0);
}
#endif

int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
}
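
/*
 * Worked example (illustrative only, not compiled into the pool code): how
 * the write-throttle tunables above relate to one another.  The 1 GB value
 * for the dirty limit is hypothetical; the other numbers are the defaults
 * defined in this file, and the delay threshold computed here mirrors what
 * dsl_pool_need_dirty_delay() derives further below.
 */
#if 0
static void
dsl_pool_throttle_example(void)
{
	uint64_t dirty_max = 1ULL << 30;	/* hypothetical 1 GB limit */

	/* Delay begins once 60% of the dirty limit is in use. */
	uint64_t delay_min_bytes =
	    dirty_max * zfs_delay_min_dirty_percent / 100;

	/* A txg is pushed out once 64 MB of dirty data accumulates. */
	ASSERT3U(zfs_dirty_data_sync, ==, 64ULL * 1024 * 1024);

	/* Default scale: 1 billion / 2000 ops per second = 500000. */
	ASSERT3U(zfs_delay_scale, ==, 500000);

	/* The overflow limit enforced by sysctl_zfs_delay_scale() above. */
	ASSERT3U(zfs_delay_scale, <=, UINT64_MAX / dirty_max);

	(void) delay_min_bytes;
}
#endif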

static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rrw_init(&dp->dp_config_rwlock, B_TRUE);
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets, spa,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_zilogs, spa,
	    offsetof(zilog_t, zl_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs, spa,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks, spa,
	    offsetof(dsl_sync_task_t, dst_node));

	dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
	    zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
	    TASKQ_THREADS_CPU_PCT);

	dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
	    zfs_zil_clean_taskq_nthr_pct, minclsyspri,
	    zfs_zil_clean_taskq_minalloc,
	    zfs_zil_clean_taskq_maxalloc,
	    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
	    1, 4, 0);

	return (dp);
}

int
dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
	    &dp->dp_meta_objset);
	if (err != 0)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}

int
dsl_pool_open(dsl_pool_t *dp)
{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_rele(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	/*
	 * Note: errors ignored, because the leak dir will not exist if we
	 * have not encountered a leak yet.
	 */
	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
	    &dp->dp_leak_dir);

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
		    &dp->dp_bptree_obj);
		if (err != 0)
			goto out;
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj);
		if (err != 0)
			goto out;
	}

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

out:
	rrw_exit(&dp->dp_config_rwlock, FTAG);
	return (err);
}

void
dsl_pool_close(dsl_pool_t *dp)
{
	/*
	 * Drop our references from dsl_pool_open().
	 *
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap)
		dsl_dataset_rele(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_rele(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir)
		dsl_dir_rele(dp->dp_free_dir, dp);
	if (dp->dp_leak_dir)
		dsl_dir_rele(dp->dp_leak_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_rele(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	taskq_destroy(dp->dp_zil_clean_taskq);
	taskq_destroy(dp->dp_sync_taskq);

	/*
	 * We can't set retry to TRUE since we're explicitly specifying
	 * a spa to flush. This is good enough; any missed buffers for
	 * this spa won't cause trouble, and they'll eventually fall
	 * out of the ARC just like any other unused buffer.
	 */
	arc_flush(dp->dp_spa, FALSE);

	txg_fini(dp);
	dsl_scan_fini(dp);
	dmu_buf_user_evict_wait();

	rrw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_vnrele_taskq);
	if (dp->dp_blkstats)
		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}

dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT0(err);

	/* Initialize scan structures */
	VERIFY0(dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
		    FREE_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
#ifdef _KERNEL
	{
		objset_t *os;
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		os = dmu_objset_create_impl(dp->dp_spa, ds,
		    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);
		zfs_create_fs(os, kcred, zplprops, tx);
	}
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	rrw_exit(&dp->dp_config_rwlock, FTAG);

	return (dp);
}

/*
 * Account for the meta-objset space in its placeholder dsl_dir.
 */
void
dsl_pool_mos_diduse_space(dsl_pool_t *dp,
    int64_t used, int64_t comp, int64_t uncomp)
{
	ASSERT3U(comp, ==, uncomp);	/* it's all metadata */
	mutex_enter(&dp->dp_lock);
	dp->dp_mos_used_delta += used;
	dp->dp_mos_compressed_delta += comp;
	dp->dp_mos_uncompressed_delta += uncomp;
	mutex_exit(&dp->dp_lock);
}

static void
dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
	VERIFY0(zio_wait(zio));
	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}

static void
dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
{
	ASSERT(MUTEX_HELD(&dp->dp_lock));

	if (delta < 0)
		ASSERT3U(-delta, <=, dp->dp_dirty_total);

	dp->dp_dirty_total += delta;

	/*
	 * Note: we signal even when increasing dp_dirty_total.
	 * This ensures forward progress -- each thread wakes the next waiter.
	 */
	if (dp->dp_dirty_total < zfs_dirty_data_max)
		cv_signal(&dp->dp_spaceavail_cv);
}

void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_t *mos = dp->dp_meta_objset;
	list_t synced_datasets;

	list_create(&synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Write out all dirty blocks of dirty datasets.
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		/*
		 * We must not sync any non-MOS datasets twice, because
		 * we may have taken a snapshot of them.  However, we
		 * may sync newly-created datasets on pass 2.
		 */
		ASSERT(!list_link_active(&ds->ds_synced_link));
		list_insert_tail(&synced_datasets, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * We have written all of the accounted dirty data, so our
	 * dp_space_towrite should now be zero.  However, some seldom-used
	 * code paths do not adhere to this (e.g. dbuf_undirty(), also
	 * rounding error in dbuf_write_physdone).
	 * Shore up the accounting of any dirtied space now.
	 */
	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

	/*
	 * Update the long range free counter after
	 * we're done syncing user data
	 */
	mutex_enter(&dp->dp_lock);
	ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
	    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
	dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
	mutex_exit(&dp->dp_lock);

	/*
	 * After the data blocks have been written (ensured by the zio_wait()
	 * above), update the user/group space accounting.  This happens
	 * in tasks dispatched to dp_sync_taskq, so wait for them before
	 * continuing.
	 */
	for (ds = list_head(&synced_datasets); ds != NULL;
	    ds = list_next(&synced_datasets, ds)) {
		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
	}
	taskq_wait(dp->dp_sync_taskq);

	/*
	 * Sync the datasets again to push out the changes due to
	 * userspace updates.  This must be done before we process the
	 * sync tasks, so that any snapshots will have the correct
	 * user accounting information (and we won't get confused
	 * about which blocks are part of the snapshot).
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		ASSERT(list_link_active(&ds->ds_synced_link));
		dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * Now that the datasets have been completely synced, we can
	 * clean up our in-memory structures accumulated while syncing:
	 *
	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
	 *  - release hold from dsl_dataset_dirty()
	 */
	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
		dsl_dataset_sync_done(ds, tx);
	}
	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
		dsl_dir_sync(dd, tx);
	}

	/*
	 * The MOS's space is accounted for in the pool/$MOS
	 * (dp_mos_dir).  We can't modify the mos while we're syncing
	 * it, so we remember the deltas and apply them here.
	 */
	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
	    dp->dp_mos_uncompressed_delta != 0) {
		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
		    dp->dp_mos_used_delta,
		    dp->dp_mos_compressed_delta,
		    dp->dp_mos_uncompressed_delta, tx);
		dp->dp_mos_used_delta = 0;
		dp->dp_mos_compressed_delta = 0;
		dp->dp_mos_uncompressed_delta = 0;
	}

	if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) {
		dsl_pool_sync_mos(dp, tx);
	}

	/*
	 * If we modify a dataset in the same txg that we want to destroy it,
	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
	 * and clearing the hold on it) before we process the sync_tasks.
	 * The MOS data dirtied by the sync_tasks will be synced on the next
	 * pass.
	 */
	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
		dsl_sync_task_t *dst;
		/*
		 * No more sync tasks should have been added while we
		 * were syncing.
		 */
		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
			dsl_sync_task_sync(dst, tx);
	}

	dmu_tx_commit(tx);

	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
}

void
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
	zilog_t *zilog;

	while (zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) {
		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
		/*
		 * We don't remove the zilog from the dp_dirty_zilogs
		 * list until after we've cleaned it. This ensures that
		 * callers of zilog_is_dirty() receive an accurate
		 * answer when they are racing with the spa sync thread.
		 */
		zil_clean(zilog, txg);
		(void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
		dmu_buf_rele(ds->ds_dbuf, zilog);
	}
	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_is_initializing(dp->dp_spa) ||
	    taskq_member(dp->dp_sync_taskq, curthread));
}

uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_dspace(dp->dp_spa);
	resv = spa_get_slop_space(dp->dp_spa);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}

boolean_t
dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	boolean_t rv;

	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_total > zfs_dirty_data_sync)
		txg_kick(dp);
	rv = (dp->dp_dirty_total > delay_min_bytes);
	mutex_exit(&dp->dp_lock);
	return (rv);
}

void
dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
		dsl_pool_dirty_delta(dp, space);
		mutex_exit(&dp->dp_lock);
	}
}

void
dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
{
	ASSERT3S(space, >=, 0);
	if (space == 0)
		return;
	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
		/* XXX writing something we didn't dirty? */
		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
	}
	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
	ASSERT3U(dp->dp_dirty_total, >=, space);
	dsl_pool_dirty_delta(dp, -space);
	mutex_exit(&dp->dp_lock);
}

/* ARGSUSED */
static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
		dsl_dataset_phys(ds)->ds_prev_snap_txg =
		    dsl_dataset_phys(prev)->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_num_children++;

		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
			    ds, &ds->ds_prev));
		}
	}

	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);

	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY0(zap_add_int(dp->dp_meta_objset,
	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

/* ARGSUSED */
static int
upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	dmu_tx_t *tx = arg;
	objset_t *mos = dp->dp_meta_objset;

	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
		dsl_dataset_t *origin;

		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));

		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
			dsl_dir_phys(origin->ds_dir)->dd_clones =
			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
			    0, tx);
		}

		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(origin->ds_dir)->dd_clones,
		    ds->ds_object, tx));

		dsl_dataset_rele(origin, FTAG);
	}
	return (0);
}

void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	uint64_t obj;

	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    FREE_DIR_NAME, &dp->dp_free_dir));

	/*
	 * We can't use bpobj_alloc(), because spa_version() still
	 * returns the old version, and we need a new-version bpobj with
	 * subobj support.  So call dmu_object_alloc() directly.
	 */
	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);
	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));

	/* create the origin dir, ds, & snap-ds */
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
}

taskq_t *
dsl_pool_vnrele_taskq(dsl_pool_t *dp)
{
	return (dp->dp_vnrele_taskq);
}

/*
 * Walk through the pool-wide zap object of temporary snapshot user holds
 * and release them.
 */
void
dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
{
	zap_attribute_t za;
	zap_cursor_t zc;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	nvlist_t *holds;

	if (zapobj == 0)
		return;
	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);

	holds = fnvlist_alloc();

	for (zap_cursor_init(&zc, mos, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		char *htag;
		nvlist_t *tags;

		htag = strchr(za.za_name, '-');
		*htag = '\0';
		++htag;
		if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
			tags = fnvlist_alloc();
			fnvlist_add_boolean(tags, htag);
			fnvlist_add_nvlist(holds, za.za_name, tags);
			fnvlist_free(tags);
		} else {
			fnvlist_add_boolean(tags, htag);
		}
	}
	dsl_dataset_user_release_tmp(dp, holds);
	fnvlist_free(holds);
	zap_cursor_fini(&zc);
}

/*
 * Create the pool-wide zap object for storing temporary snapshot holds.
 */
void
dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(dp->dp_tmp_userrefs_obj == 0);
	ASSERT(dmu_tx_is_syncing(tx));

	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
}

static int
dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
    const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	char *name;
	int error;

	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
	ASSERT(dmu_tx_is_syncing(tx));

	/*
	 * If the pool was created prior to SPA_VERSION_USERREFS, the
	 * zap object for temporary holds might not exist yet.
	 */
	if (zapobj == 0) {
		if (holding) {
			dsl_pool_user_hold_create_obj(dp, tx);
			zapobj = dp->dp_tmp_userrefs_obj;
		} else {
			return (SET_ERROR(ENOENT));
		}
	}

	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
	if (holding)
		error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
	else
		error = zap_remove(mos, zapobj, name, tx);
	strfree(name);

	return (error);
}

/*
 * Add a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    uint64_t now, dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
}

/*
 * Release a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
	    tx, B_FALSE));
}

/*
 * DSL Pool Configuration Lock
 *
 * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 * creation / destruction / rename / property setting).  It must be held for
 * read to hold a dataset or dsl_dir.  I.e. you must call
 * dsl_pool_config_enter() or dsl_pool_hold() before calling
 * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
 * must be held continuously until all datasets and dsl_dirs are released.
 *
 * The only exception to this rule is that if a "long hold" is placed on
 * a dataset, then the dp_config_rwlock may be dropped while the dataset
 * is still held.  The long hold will prevent the dataset from being
 * destroyed -- the destroy will fail with EBUSY.  A long hold can be
 * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 *
 * Legitimate long-holders (including owners) should be long-running, cancelable
 * tasks that should cause "zfs destroy" to fail.  This includes DMU
 * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 * "zfs send", and "zfs diff".  There are several other long-holders whose
 * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 *
 * The usual formula for long-holding would be:
 *	dsl_pool_hold()
 *	dsl_dataset_hold()
 *	... perform checks ...
 *	dsl_dataset_long_hold()
 *	dsl_pool_rele()
 *	... perform long-running task ...
 *	dsl_dataset_long_rele()
 *	dsl_dataset_rele()
 *
 * Note that when the long hold is released, the dataset is still held but
 * the pool is not held.  The dataset may change arbitrarily during this time
 * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
 * dataset except release it.
 *
 * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
 * or modifying operations.
 *
 * Modifying operations should generally use dsl_sync_task().  The synctask
 * infrastructure enforces proper locking strategy with respect to the
 * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
 *
 * Read-only operations will manually hold the pool, then the dataset, obtain
 * information from the dataset, then release the pool and dataset.
 * dmu_objset_{hold,rele}() are convenience routines that also do the pool
 * hold/rele.
 */

int
dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
{
	spa_t *spa;
	int error;

	error = spa_open(name, &spa, tag);
	if (error == 0) {
		*dp = spa_get_dsl(spa);
		dsl_pool_config_enter(*dp, tag);
	}
	return (error);
}

void
dsl_pool_rele(dsl_pool_t *dp, void *tag)
{
	dsl_pool_config_exit(dp, tag);
	spa_close(dp->dp_spa, tag);
}

void
dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
{
	/*
	 * We use a "reentrant" reader-writer lock, but not reentrantly.
	 *
	 * The rrwlock can (with the track_all flag) track all reading threads,
	 * which is very useful for debugging which code path failed to release
	 * the lock, and for verifying that the *current* thread does hold
	 * the lock.
	 *
	 * (Unlike a rwlock, which knows that N threads hold it for
	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
	 * if any thread holds it for read, even if this thread doesn't).
	 */
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
}

void
dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
{
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
}

void
dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
{
	rrw_exit(&dp->dp_config_rwlock, tag);
}

boolean_t
dsl_pool_config_held(dsl_pool_t *dp)
{
	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
}

boolean_t
dsl_pool_config_held_writer(dsl_pool_t *dp)
{
	return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
}
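
/*
 * A minimal sketch (illustrative only, not compiled) of the long-hold
 * formula described in the "DSL Pool Configuration Lock" comment above.
 * The function name and its "dsname" argument are hypothetical; the dsl_*
 * calls are the ones documented in that comment and defined in this file
 * or in dsl_dataset.h.
 */
#if 0
static int
my_long_running_task(const char *dsname)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	error = dsl_pool_hold(dsname, FTAG, &dp);
	if (error != 0)
		return (error);
	error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
	if (error != 0) {
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	/* ... perform checks while the pool config lock is held ... */

	dsl_dataset_long_hold(ds, FTAG);
	dsl_pool_rele(dp, FTAG);	/* config lock dropped, dataset pinned */

	/* ... long-running task; "zfs destroy" of dsname now fails EBUSY ... */

	dsl_dataset_long_rele(ds, FTAG);
	dsl_dataset_rele(ds, FTAG);	/* only safe action left is the release */
	return (0);
}
#endif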