/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_scan.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dsl_userhold.h>

#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif

/*
 * ZFS Write Throttle
 * ------------------
 *
 * ZFS must limit the rate of incoming writes to the rate at which it is able
 * to sync data modifications to the backend storage. Throttling by too much
 * creates an artificial limit; throttling by too little can only be sustained
 * for short periods and would lead to highly lumpy performance. On a per-pool
 * basis, ZFS tracks the amount of modified (dirty) data. As operations change
 * data, the amount of dirty data increases; as ZFS syncs out data, the amount
 * of dirty data decreases. When the amount of dirty data exceeds a
 * predetermined threshold, further modifications are blocked until the amount
 * of dirty data decreases (as data is synced out).
 *
 * The limit on dirty data is tunable, and should be adjusted according to
 * both the IO capacity and available memory of the system. The larger the
 * window, the more ZFS is able to aggregate and amortize metadata (and data)
 * changes. However, memory is a limited resource, and allowing for more dirty
 * data comes at the cost of keeping other useful data in memory (for example
 * ZFS data cached by the ARC).
 *
 * Implementation
 *
 * As buffers are modified, dsl_pool_dirty_space() increments both the per-
 * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
 * dirty space used; dsl_pool_undirty_space() decrements those values as data
 * is synced out from dsl_pool_sync(). While only the poolwide value is
 * relevant, the per-txg value is useful for debugging. The tunable
 * zfs_dirty_data_max determines the dirty space limit. Once that value is
 * exceeded, new writes are halted until space frees up.
 *
 * The zfs_dirty_data_sync tunable dictates the threshold at which we
 * ensure that there is a txg syncing (see the comment in txg.c for a full
 * description of transaction group stages).
 *
 * The IO scheduler uses both the dirty space limit and current amount of
 * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
 * issues. See the comment in vdev_queue.c for details of the IO scheduler.
 *
 * The delay is also calculated based on the amount of dirty data. See the
 * comment above dmu_tx_delay() for details.
 */

/*
 * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
 * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system.
 */
uint64_t zfs_dirty_data_max;
uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
int zfs_dirty_data_max_percent = 10;
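
/*
 * Worked example (illustrative arithmetic, not code from this file): on a
 * machine with 16 GB of physical memory, the auto-computed limit would be
 * 10% of 16 GB = 1.6 GB, which is below the cap; on a 64 GB machine, 10%
 * would be 6.4 GB, so the limit is instead clamped to
 * zfs_dirty_data_max_max (4 GB by default).
 */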

/*
 * If there is at least this much dirty data, push out a txg.
 */
uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;

/*
 * Once there is this amount of dirty data, dmu_tx_delay() will kick in
 * and delay each transaction.
 * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
 */
int zfs_delay_min_dirty_percent = 60;

/*
 * This controls how quickly the delay approaches infinity.
 * Larger values cause it to delay more for a given amount of dirty data.
 * Therefore larger values will cause there to be less dirty data for a
 * given throughput.
 *
 * For the smoothest delay, this value should be about 1 billion divided
 * by the maximum number of operations per second. This will smoothly
 * handle between 10x and 1/10th this number.
 *
 * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
 * multiply in dmu_tx_delay().
 */
uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
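
/*
 * For example, the default of 1000 * 1000 * 1000 / 2000 = 500,000 targets a
 * backend capable of roughly 2000 operations per second, and per the note
 * above should behave smoothly from roughly 200 to 20,000 ops/sec. With the
 * default 4 GB cap on zfs_dirty_data_max, the overflow constraint holds
 * comfortably: 500,000 * 2^32 is about 2^51, well under 2^64.
 */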

/*
 * This determines the number of threads used by the dp_sync_taskq.
 */
int zfs_sync_taskq_batch_pct = 75;

/*
 * These tunables determine the behavior of how zil_itxg_clean() is
 * called via zil_clean() in the context of spa_sync(). When an itxg
 * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
 * If the dispatch fails, the call to zil_itxg_clean() will occur
 * synchronously in the context of spa_sync(), which can negatively
 * impact the performance of spa_sync() (e.g. in the case of the itxg
 * list having a large number of itxs that need to be cleaned).
 *
 * Thus, these tunables can be used to manipulate the behavior of the
 * taskq used by zil_clean(); they determine the number of taskq entries
 * that are pre-populated when the taskq is first created (via the
 * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
 * taskq entries that are cached after an on-demand allocation (via the
 * "zfs_zil_clean_taskq_maxalloc" tunable).
 *
 * The idea is that we want to try reasonably hard to ensure there will
 * already be a taskq entry pre-allocated by the time that it is needed
 * by zil_clean(). This way, we can avoid the possibility of an
 * on-demand allocation of a new taskq entry from failing, which would
 * result in zil_itxg_clean() being called synchronously from zil_clean()
 * (which can adversely affect performance of spa_sync()).
 *
 * Additionally, the number of threads used by the taskq can be
 * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
 */
int zfs_zil_clean_taskq_nthr_pct = 100;
int zfs_zil_clean_taskq_minalloc = 1024;
int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;

#if defined(__FreeBSD__) && defined(_KERNEL)

extern int zfs_vdev_async_write_active_max_dirty_percent;

SYSCTL_DECL(_vfs_zfs);

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
    &zfs_dirty_data_max, 0,
    "The maximum amount of dirty data in bytes after which new writes are "
    "halted until space becomes available");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
    &zfs_dirty_data_max_max, 0,
    "The absolute cap on dirty_data_max when auto-calculating");

static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
    sysctl_zfs_dirty_data_max_percent, "I",
    "The percent of physical memory used to auto calculate dirty_data_max");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
    &zfs_dirty_data_sync, 0,
    "Force a txg if the number of dirty buffer bytes exceeds this value");

static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
    sysctl_zfs_delay_min_dirty_percent, "I",
    "The limit of outstanding dirty data before transactions are delayed");

static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
/* No zfs_delay_scale tunable due to limit requirements */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_zfs_delay_scale, "QU",
    "Controls how quickly the delay approaches infinity");

static int
sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
{
	int val, err;

	val = zfs_dirty_data_max_percent;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < 0 || val > 100)
		return (EINVAL);

	zfs_dirty_data_max_percent = val;

	return (0);
}

static int
sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
{
	int val, err;

	val = zfs_delay_min_dirty_percent;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < zfs_vdev_async_write_active_max_dirty_percent)
		return (EINVAL);

	zfs_delay_min_dirty_percent = val;

	return (0);
}

static int
sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_delay_scale;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val > UINT64_MAX / zfs_dirty_data_max)
		return (EINVAL);

	zfs_delay_scale = val;

	return (0);
}
#endif
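
/*
 * For example, on FreeBSD the handlers above can be exercised from userland
 * with sysctl(8) (illustrative values):
 *
 *	# sysctl vfs.zfs.dirty_data_max_percent=20
 *	# sysctl vfs.zfs.delay_scale=250000
 *
 * Out-of-range values are rejected with EINVAL by the corresponding
 * sysctl_zfs_* handler rather than silently clamped.
 */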

int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
}

static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rrw_init(&dp->dp_config_rwlock, B_TRUE);
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets, spa,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_zilogs, spa,
	    offsetof(zilog_t, zl_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs, spa,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks, spa,
	    offsetof(dsl_sync_task_t, dst_node));
	txg_list_create(&dp->dp_early_sync_tasks, spa,
	    offsetof(dsl_sync_task_t, dst_node));

	dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
	    zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
	    TASKQ_THREADS_CPU_PCT);

	dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
	    zfs_zil_clean_taskq_nthr_pct, minclsyspri,
	    zfs_zil_clean_taskq_minalloc,
	    zfs_zil_clean_taskq_maxalloc,
	    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
	    1, 4, 0);

	return (dp);
}
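
/*
 * Note on taskq sizing (illustrative): with TASKQ_THREADS_CPU_PCT set, the
 * nthreads argument to taskq_create() is interpreted as a percentage of the
 * number of online CPUs, so zfs_sync_taskq_batch_pct = 75 yields 6 threads
 * on an 8-CPU system, while zfs_zil_clean_taskq_nthr_pct = 100 yields one
 * thread per CPU.
 */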

int
dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
	    &dp->dp_meta_objset);
	if (err != 0)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}

int
dsl_pool_open(dsl_pool_t *dp)
{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_rele(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err == 0) {
			VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj,
			    dp->dp_meta_objset, obj));
		} else if (err == ENOENT) {
			/*
			 * We might not have created the remap bpobj yet.
			 */
			err = 0;
		} else {
			goto out;
		}
	}

	/*
	 * Note: errors ignored, because these special dirs, used for
	 * space accounting, are only created on demand.
	 */
	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
	    &dp->dp_leak_dir);

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
		    &dp->dp_bptree_obj);
		if (err != 0)
			goto out;
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj);
		if (err != 0)
			goto out;
	}

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

out:
	rrw_exit(&dp->dp_config_rwlock, FTAG);
	return (err);
}

void
dsl_pool_close(dsl_pool_t *dp)
{
	/*
	 * Drop our references from dsl_pool_open().
	 *
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap != NULL)
		dsl_dataset_rele(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir != NULL)
		dsl_dir_rele(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir != NULL)
		dsl_dir_rele(dp->dp_free_dir, dp);
	if (dp->dp_leak_dir != NULL)
		dsl_dir_rele(dp->dp_leak_dir, dp);
	if (dp->dp_root_dir != NULL)
		dsl_dir_rele(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);
	bpobj_close(&dp->dp_obsolete_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset != NULL)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_early_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	taskq_destroy(dp->dp_zil_clean_taskq);
	taskq_destroy(dp->dp_sync_taskq);

	/*
	 * We can't set retry to TRUE since we're explicitly specifying
	 * a spa to flush. This is good enough; any missed buffers for
	 * this spa won't cause trouble, and they'll eventually fall
	 * out of the ARC just like any other unused buffer.
	 */
	arc_flush(dp->dp_spa, FALSE);

	txg_fini(dp);
	dsl_scan_fini(dp);
	dmu_buf_user_evict_wait();

	rrw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_vnrele_taskq);
	if (dp->dp_blkstats != NULL) {
		mutex_destroy(&dp->dp_blkstats->zab_lock);
		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	}
	kmem_free(dp, sizeof (dsl_pool_t));
}

void
dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t obj;
	/*
	 * Currently, we only create the obsolete_bpobj where there are
	 * indirect vdevs with referenced mappings.
	 */
	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL));
	/* create and open the obsolete_bpobj */
	obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
	VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj));
	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
}

void
dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	VERIFY0(zap_remove(dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_OBSOLETE_BPOBJ, tx));
	bpobj_free(dp->dp_meta_objset,
	    dp->dp_obsolete_bpobj.bpo_object, tx);
	bpobj_close(&dp->dp_obsolete_bpobj);
}

dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT0(err);

	/* Initialize scan structures */
	VERIFY0(dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
		    FREE_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
#ifdef _KERNEL
	{
		objset_t *os;
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		os = dmu_objset_create_impl(dp->dp_spa, ds,
		    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);
		zfs_create_fs(os, kcred, zplprops, tx);
	}
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	rrw_exit(&dp->dp_config_rwlock, FTAG);

	return (dp);
}
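
/*
 * Illustrative lifecycle (a sketch of how the SPA layer drives these entry
 * points, not code from this file): a new pool gets its DSL state from
 * dsl_pool_create(); an existing pool is brought up in two steps,
 * dsl_pool_init() to instantiate the in-memory structures and open the MOS,
 * then dsl_pool_open() to hold the root dir, special dirs, and
 * feature-dependent objects. dsl_pool_close() undoes all of the above.
 */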

/*
 * Account for the meta-objset space in its placeholder dsl_dir.
 */
void
dsl_pool_mos_diduse_space(dsl_pool_t *dp,
    int64_t used, int64_t comp, int64_t uncomp)
{
	ASSERT3U(comp, ==, uncomp);	/* it's all metadata */
	mutex_enter(&dp->dp_lock);
	dp->dp_mos_used_delta += used;
	dp->dp_mos_compressed_delta += comp;
	dp->dp_mos_uncompressed_delta += uncomp;
	mutex_exit(&dp->dp_lock);
}

static void
dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
	VERIFY0(zio_wait(zio));
	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}

static void
dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
{
	ASSERT(MUTEX_HELD(&dp->dp_lock));

	if (delta < 0)
		ASSERT3U(-delta, <=, dp->dp_dirty_total);

	dp->dp_dirty_total += delta;

	/*
	 * Note: we signal even when increasing dp_dirty_total.
	 * This ensures forward progress -- each thread wakes the next waiter.
	 */
	if (dp->dp_dirty_total < zfs_dirty_data_max)
		cv_signal(&dp->dp_spaceavail_cv);
}

static boolean_t
dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
{
	spa_t *spa = dp->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;

	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];
		txg_list_t *tl = &vd->vdev_ms_list;
		metaslab_t *ms;

		for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
		    ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
			VERIFY(range_tree_is_empty(ms->ms_freeing));
			VERIFY(range_tree_is_empty(ms->ms_checkpointing));
		}
	}

	return (B_TRUE);
}

void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_t *mos = dp->dp_meta_objset;
	list_t synced_datasets;

	list_create(&synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Run all early sync tasks before writing out any dirty blocks.
	 * For more info on early sync tasks see block comment in
	 * dsl_early_sync_task().
	 */
	if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
		dsl_sync_task_t *dst;

		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
		while ((dst =
		    txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
			ASSERT(dsl_early_sync_task_verify(dp, txg));
			dsl_sync_task_sync(dst, tx);
		}
		ASSERT(dsl_early_sync_task_verify(dp, txg));
	}

	/*
	 * Write out all dirty blocks of dirty datasets.
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		/*
		 * We must not sync any non-MOS datasets twice, because
		 * we may have taken a snapshot of them. However, we
		 * may sync newly-created datasets on pass 2.
		 */
		ASSERT(!list_link_active(&ds->ds_synced_link));
		list_insert_tail(&synced_datasets, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * We have written all of the accounted dirty data, so our
	 * dp_space_towrite should now be zero. However, some seldom-used
	 * code paths do not adhere to this (e.g. dbuf_undirty(); also
	 * rounding error in dbuf_write_physdone).
	 * Shore up the accounting of any dirtied space now.
	 */
	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

	/*
	 * Update the long range free counter after
	 * we're done syncing user data
	 */
	mutex_enter(&dp->dp_lock);
	ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
	    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
	dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
	mutex_exit(&dp->dp_lock);

	/*
	 * After the data blocks have been written (ensured by the zio_wait()
	 * above), update the user/group space accounting. This happens
	 * in tasks dispatched to dp_sync_taskq, so wait for them before
	 * continuing.
	 */
	for (ds = list_head(&synced_datasets); ds != NULL;
	    ds = list_next(&synced_datasets, ds)) {
		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
	}
	taskq_wait(dp->dp_sync_taskq);

	/*
	 * Sync the datasets again to push out the changes due to
	 * userspace updates. This must be done before we process the
	 * sync tasks, so that any snapshots will have the correct
	 * user accounting information (and we won't get confused
	 * about which blocks are part of the snapshot).
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		ASSERT(list_link_active(&ds->ds_synced_link));
		dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * Now that the datasets have been completely synced, we can
	 * clean up our in-memory structures accumulated while syncing:
	 *
	 *  - move dead blocks from the pending deadlist to the on-disk
	 *    deadlist
	 *  - release hold from dsl_dataset_dirty()
	 */
	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
		dsl_dataset_sync_done(ds, tx);
	}
	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
		dsl_dir_sync(dd, tx);
	}

	/*
	 * The MOS's space is accounted for in the pool/$MOS
	 * (dp_mos_dir). We can't modify the MOS while we're syncing
	 * it, so we remember the deltas and apply them here.
	 */
	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
	    dp->dp_mos_uncompressed_delta != 0) {
		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
		    dp->dp_mos_used_delta,
		    dp->dp_mos_compressed_delta,
		    dp->dp_mos_uncompressed_delta, tx);
		dp->dp_mos_used_delta = 0;
		dp->dp_mos_compressed_delta = 0;
		dp->dp_mos_uncompressed_delta = 0;
	}

	if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) {
		dsl_pool_sync_mos(dp, tx);
	}

	/*
	 * If we modify a dataset in the same txg that we want to destroy it,
	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
	 * and clearing the hold on it) before we process the sync_tasks.
	 * The MOS data dirtied by the sync_tasks will be synced on the next
	 * pass.
	 */
	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
		dsl_sync_task_t *dst;
		/*
		 * No more sync tasks should have been added while we
		 * were syncing.
		 */
		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
			dsl_sync_task_sync(dst, tx);
	}

	dmu_tx_commit(tx);

	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
}

void
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
	zilog_t *zilog;

	while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) != NULL) {
		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
		/*
		 * We don't remove the zilog from the dp_dirty_zilogs
		 * list until after we've cleaned it. This ensures that
		 * callers of zilog_is_dirty() receive an accurate
		 * answer when they are racing with the spa sync thread.
		 */
		zil_clean(zilog, txg);
		(void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
		dmu_buf_rele(ds->ds_dbuf, zilog);
	}
	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_is_initializing(dp->dp_spa) ||
	    taskq_member(dp->dp_sync_taskq, curthread));
}
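
/*
 * Illustrative use (an assumption about callers, not code from this file):
 * functions that must only run from syncing context typically guard
 * themselves with
 *
 *	ASSERT(dsl_pool_sync_context(dp));
 *
 * which also accepts the dp_sync_taskq worker threads, since userquota
 * updates and similar work are dispatched there during dsl_pool_sync().
 */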

/*
 * This function returns the amount of allocatable space in the pool
 * minus whatever space is currently reserved by ZFS for specific
 * purposes. Specifically:
 *
 * 1] Any reserved SLOP space
 * 2] Any space used by the checkpoint
 * 3] Any space used for deferred frees
 *
 * The latter 2 are especially important because they are needed to
 * rectify the SPA's and DMU's different understanding of how much space
 * is used. Now the DMU is aware of that extra space tracked by the SPA
 * without having to maintain a separate special dir (e.g. similar to
 * $MOS, $FREEING, and $LEAKED).
 *
 * Note: by deferred frees here, we mean the frees that were deferred
 * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
 * segments placed in ms_defer trees during metaslab_sync_done().
 */
uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
{
	spa_t *spa = dp->dp_spa;
	uint64_t space, resv, adjustedsize;
	uint64_t spa_deferred_frees =
	    spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;

	space = spa_get_dspace(spa)
	    - spa_get_checkpoint_space(spa) - spa_deferred_frees;
	resv = spa_get_slop_space(spa);

	switch (slop_policy) {
	case ZFS_SPACE_CHECK_NORMAL:
		break;
	case ZFS_SPACE_CHECK_RESERVED:
		resv >>= 1;
		break;
	case ZFS_SPACE_CHECK_EXTRA_RESERVED:
		resv >>= 2;
		break;
	case ZFS_SPACE_CHECK_NONE:
		resv = 0;
		break;
	default:
		panic("invalid slop policy value: %d", slop_policy);
		break;
	}
	adjustedsize = (space >= resv) ? (space - resv) : 0;

	return (adjustedsize);
}

uint64_t
dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
{
	uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
	uint64_t deferred =
	    metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
	uint64_t quota = (poolsize >= deferred) ?
	    (poolsize - deferred) : 0;
	return (quota);
}
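
/*
 * Worked example (illustrative numbers): suppose spa_get_dspace() reports
 * 1000 GB, the checkpoint holds 10 GB, deferred frees total 2 GB, and
 * spa_get_slop_space() returns 32 GB. Then space = 1000 - 10 - 2 = 988 GB,
 * and the adjusted size is 988 - 32 = 956 GB under ZFS_SPACE_CHECK_NORMAL,
 * 988 - 16 = 972 GB under ZFS_SPACE_CHECK_RESERVED (half the slop), and
 * 988 - 8 = 980 GB under ZFS_SPACE_CHECK_EXTRA_RESERVED (a quarter of it).
 */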

boolean_t
dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	boolean_t rv;

	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_total > zfs_dirty_data_sync)
		txg_kick(dp);
	rv = (dp->dp_dirty_total > delay_min_bytes);
	mutex_exit(&dp->dp_lock);
	return (rv);
}

void
dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
		dsl_pool_dirty_delta(dp, space);
		mutex_exit(&dp->dp_lock);
	}
}

void
dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
{
	ASSERT3S(space, >=, 0);
	if (space == 0)
		return;
	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
		/* XXX writing something we didn't dirty? */
		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
	}
	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
	ASSERT3U(dp->dp_dirty_total, >=, space);
	dsl_pool_dirty_delta(dp, -space);
	mutex_exit(&dp->dp_lock);
}

/* ARGSUSED */
static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
		dsl_dataset_phys(ds)->ds_prev_snap_txg =
		    dsl_dataset_phys(prev)->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_num_children++;

		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
			    ds, &ds->ds_prev));
		}
	}

	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);

	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY0(zap_add_int(dp->dp_meta_objset,
	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

/* ARGSUSED */
static int
upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	dmu_tx_t *tx = arg;
	objset_t *mos = dp->dp_meta_objset;

	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
		dsl_dataset_t *origin;

		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));

		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
			dsl_dir_phys(origin->ds_dir)->dd_clones =
			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
			    0, tx);
		}

		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(origin->ds_dir)->dd_clones,
		    ds->ds_object, tx));

		dsl_dataset_rele(origin, FTAG);
	}
	return (0);
}

void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	uint64_t obj;

	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    FREE_DIR_NAME, &dp->dp_free_dir));

	/*
	 * We can't use bpobj_alloc(), because spa_version() still
	 * returns the old version, and we need a new-version bpobj with
	 * subobj support. So call dmu_object_alloc() directly.
	 */
	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);
	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));

	/* create the origin dir, ds, & snap-ds */
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
}

taskq_t *
dsl_pool_vnrele_taskq(dsl_pool_t *dp)
{
	return (dp->dp_vnrele_taskq);
}

/*
 * Walk through the pool-wide zap object of temporary snapshot user holds
 * and release them.
 */
void
dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
{
	zap_attribute_t za;
	zap_cursor_t zc;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	nvlist_t *holds;

	if (zapobj == 0)
		return;
	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);

	holds = fnvlist_alloc();

	for (zap_cursor_init(&zc, mos, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		char *htag;
		nvlist_t *tags;

		htag = strchr(za.za_name, '-');
		*htag = '\0';
		++htag;
		if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
			tags = fnvlist_alloc();
			fnvlist_add_boolean(tags, htag);
			fnvlist_add_nvlist(holds, za.za_name, tags);
			fnvlist_free(tags);
		} else {
			fnvlist_add_boolean(tags, htag);
		}
	}
	dsl_dataset_user_release_tmp(dp, holds);
	fnvlist_free(holds);
	zap_cursor_fini(&zc);
}

/*
 * Create the pool-wide zap object for storing temporary snapshot holds.
 */
void
dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(dp->dp_tmp_userrefs_obj == 0);
	ASSERT(dmu_tx_is_syncing(tx));

	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
}

static int
dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
    const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	char *name;
	int error;

	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
	ASSERT(dmu_tx_is_syncing(tx));

	/*
	 * If the pool was created prior to SPA_VERSION_USERREFS, the
	 * zap object for temporary holds might not exist yet.
	 */
	if (zapobj == 0) {
		if (holding) {
			dsl_pool_user_hold_create_obj(dp, tx);
			zapobj = dp->dp_tmp_userrefs_obj;
		} else {
			return (SET_ERROR(ENOENT));
		}
	}

	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
	if (holding)
		error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
	else
		error = zap_remove(mos, zapobj, name, tx);
	strfree(name);

	return (error);
}
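
/*
 * ZAP naming example (illustrative): a temporary hold on dataset object
 * 0x36 with tag "send" is stored under the key "36-send", with the hold's
 * creation time as the 8-byte value. dsl_pool_clean_tmp_userrefs() above
 * recovers the object number and tag by splitting the key at the first '-'.
 */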

/*
 * Add a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    uint64_t now, dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
}

/*
 * Release a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE));
}

/*
 * DSL Pool Configuration Lock
 *
 * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 * creation / destruction / rename / property setting). It must be held for
 * read to hold a dataset or dsl_dir. I.e. you must call
 * dsl_pool_config_enter() or dsl_pool_hold() before calling
 * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock
 * must be held continuously until all datasets and dsl_dirs are released.
 *
 * The only exception to this rule is that if a "long hold" is placed on
 * a dataset, then the dp_config_rwlock may be dropped while the dataset
 * is still held. The long hold will prevent the dataset from being
 * destroyed -- the destroy will fail with EBUSY. A long hold can be
 * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 *
 * Legitimate long-holders (including owners) should be long-running,
 * cancelable tasks that should cause "zfs destroy" to fail. This includes
 * DMU consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 * "zfs send", and "zfs diff". There are several other long-holders whose
 * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 *
 * The usual formula for long-holding would be:
 * dsl_pool_hold()
 * dsl_dataset_hold()
 * ... perform checks ...
 * dsl_dataset_long_hold()
 * dsl_pool_rele()
 * ... perform long-running task ...
 * dsl_dataset_long_rele()
 * dsl_dataset_rele()
 *
 * Note that when the long hold is released, the dataset is still held but
 * the pool is not held. The dataset may change arbitrarily during this time
 * (e.g. it could be destroyed). Therefore you shouldn't do anything to the
 * dataset except release it.
 *
 * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
 * or modifying operations.
 *
 * Modifying operations should generally use dsl_sync_task(). The synctask
 * infrastructure enforces proper locking strategy with respect to the
 * dp_config_rwlock. See the comment above dsl_sync_task() for details.
 *
 * Read-only operations will manually hold the pool, then the dataset, obtain
 * information from the dataset, then release the pool and dataset.
 * dmu_objset_{hold,rele}() are convenience routines that also do the pool
 * hold/rele.
 */
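
/*
 * Concrete sketch of the read-only pattern described above (illustrative;
 * error handling elided and "tank/fs" is a hypothetical dataset name):
 *
 *	dsl_pool_t *dp;
 *	dsl_dataset_t *ds;
 *
 *	VERIFY0(dsl_pool_hold("tank/fs", FTAG, &dp));
 *	VERIFY0(dsl_dataset_hold(dp, "tank/fs", FTAG, &ds));
 *	... read information from ds ...
 *	dsl_dataset_rele(ds, FTAG);
 *	dsl_pool_rele(dp, FTAG);
 */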

int
dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
{
	spa_t *spa;
	int error;

	error = spa_open(name, &spa, tag);
	if (error == 0) {
		*dp = spa_get_dsl(spa);
		dsl_pool_config_enter(*dp, tag);
	}
	return (error);
}

void
dsl_pool_rele(dsl_pool_t *dp, void *tag)
{
	dsl_pool_config_exit(dp, tag);
	spa_close(dp->dp_spa, tag);
}

void
dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
{
	/*
	 * We use a "reentrant" reader-writer lock, but not reentrantly.
	 *
	 * The rrwlock can (with the track_all flag) track all reading threads,
	 * which is very useful for debugging which code path failed to release
	 * the lock, and for verifying that the *current* thread does hold
	 * the lock.
	 *
	 * (Unlike a rwlock, which knows that N threads hold it for
	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
	 * if any thread holds it for read, even if this thread doesn't).
	 */
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
}

void
dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
{
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
}

void
dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
{
	rrw_exit(&dp->dp_config_rwlock, tag);
}

boolean_t
dsl_pool_config_held(dsl_pool_t *dp)
{
	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
}

boolean_t
dsl_pool_config_held_writer(dsl_pool_t *dp)
{
	return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
}