dsl_pool.c revision 310511
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_scan.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dsl_userhold.h>

#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif

/*
 * ZFS Write Throttle
 * ------------------
 *
 * ZFS must limit the rate of incoming writes to the rate at which it is able
 * to sync data modifications to the backend storage. Throttling by too much
 * creates an artificial limit; throttling by too little can only be sustained
 * for short periods and would lead to highly lumpy performance. On a per-pool
 * basis, ZFS tracks the amount of modified (dirty) data. As operations change
 * data, the amount of dirty data increases; as ZFS syncs out data, the amount
 * of dirty data decreases. When the amount of dirty data exceeds a
 * predetermined threshold, further modifications are blocked until the amount
 * of dirty data decreases (as data is synced out).
 *
 * The limit on dirty data is tunable, and should be adjusted according to
 * both the IO capacity and available memory of the system. The larger the
 * window, the more ZFS is able to aggregate and amortize metadata (and data)
 * changes. However, memory is a limited resource, and allowing for more dirty
 * data comes at the cost of keeping other useful data in memory (for example,
 * ZFS data cached by the ARC).
 *
 * Implementation
 *
 * As buffers are modified, dsl_pool_dirty_space() increments both the per-
 * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
 * dirty space used; dsl_pool_undirty_space() decrements those values as
 * data is synced out in dsl_pool_sync(). While only the poolwide value is
 * relevant, the per-txg value is useful for debugging. The tunable
 * zfs_dirty_data_max determines the dirty space limit. Once that value is
 * exceeded, new writes are halted until space frees up.
 *
 * The zfs_dirty_data_sync tunable dictates the threshold at which we
 * ensure that there is a txg syncing (see the comment in txg.c for a full
 * description of transaction group stages).
 *
 * The IO scheduler uses both the dirty space limit and current amount of
 * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
 * issues. See the comment in vdev_queue.c for details of the IO scheduler.
 *
 * The delay is also calculated based on the amount of dirty data. See the
 * comment above dmu_tx_delay() for details.
 */
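
/*
 * For example, with the defaults below on a machine with 16GB of physical
 * memory, zfs_dirty_data_max is 10% of memory = 1.6GB (under the 4GB
 * zfs_dirty_data_max_max cap). A txg sync is ensured once 64MB is dirty
 * (zfs_dirty_data_sync), transactions begin to be delayed once dirty data
 * exceeds 60% of the limit, i.e. ~983MB (zfs_delay_min_dirty_percent),
 * and new writes block outright at the full 1.6GB limit.
 */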

/*
 * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
 * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system.
 */
uint64_t zfs_dirty_data_max;
uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
int zfs_dirty_data_max_percent = 10;

/*
 * If there is at least this much dirty data, push out a txg.
 */
uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;

/*
 * Once there is this much dirty data, dmu_tx_delay() will kick in
 * and delay each transaction.
 * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
 */
int zfs_delay_min_dirty_percent = 60;

/*
 * This controls how quickly the delay approaches infinity.
 * Larger values cause it to delay more for a given amount of dirty data.
 * Therefore larger values will cause there to be less dirty data for a
 * given throughput.
 *
 * For the smoothest delay, this value should be about 1 billion divided
 * by the maximum number of operations per second. This will smoothly
 * handle between 10x and 1/10th this number.
 *
 * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
 * multiply in dmu_tx_delay().
 */
uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
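
/*
 * For example, the default zfs_delay_scale of 10^9 / 2000 = 500,000
 * targets a pool that can sustain roughly 2000 operations per second;
 * per the note above, it behaves smoothly from roughly 200 to 20,000
 * operations per second. A pool expected to sustain 10,000 ops/sec would
 * instead want zfs_delay_scale = 10^9 / 10,000 = 100,000.
 */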

#if defined(__FreeBSD__) && defined(_KERNEL)

extern int zfs_vdev_async_write_active_max_dirty_percent;

SYSCTL_DECL(_vfs_zfs);

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
    &zfs_dirty_data_max, 0,
    "The maximum amount of dirty data in bytes after which new writes are "
    "halted until space becomes available");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
    &zfs_dirty_data_max_max, 0,
    "The absolute cap on dirty_data_max when auto-calculating");

static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
    sysctl_zfs_dirty_data_max_percent, "I",
    "The percent of physical memory used to auto-calculate dirty_data_max");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
    &zfs_dirty_data_sync, 0,
    "Force a txg if the number of dirty buffer bytes exceeds this value");

static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
    sysctl_zfs_delay_min_dirty_percent, "I",
    "The limit of outstanding dirty data before transactions are delayed");

static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
/* No zfs_delay_scale tunable due to limit requirements */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_zfs_delay_scale, "QU",
    "Controls how quickly the delay approaches infinity");

static int
sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
{
	int val, err;

	val = zfs_dirty_data_max_percent;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < 0 || val > 100)
		return (EINVAL);

	zfs_dirty_data_max_percent = val;

	return (0);
}

static int
sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
{
	int val, err;

	val = zfs_delay_min_dirty_percent;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < zfs_vdev_async_write_active_max_dirty_percent)
		return (EINVAL);

	zfs_delay_min_dirty_percent = val;

	return (0);
}

static int
sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_delay_scale;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val > UINT64_MAX / zfs_dirty_data_max)
		return (EINVAL);

	zfs_delay_scale = val;

	return (0);
}
#endif

hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);

int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
}

static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rrw_init(&dp->dp_config_rwlock, B_TRUE);
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_zilogs,
	    offsetof(zilog_t, zl_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks,
	    offsetof(dsl_sync_task_t, dst_node));

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
	    1, 4, 0);

	return (dp);
}

int
dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
	    &dp->dp_meta_objset);
	if (err != 0)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}

int
dsl_pool_open(dsl_pool_t *dp)
{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_rele(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	/*
	 * Note: errors ignored, because the leak dir will not exist if we
	 * have not encountered a leak yet.
	 */
	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
	    &dp->dp_leak_dir);

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
		    &dp->dp_bptree_obj);
		if (err != 0)
			goto out;
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj);
		if (err != 0)
			goto out;
	}

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

out:
	rrw_exit(&dp->dp_config_rwlock, FTAG);
	return (err);
}

void
dsl_pool_close(dsl_pool_t *dp)
{
	/*
	 * Drop our references from dsl_pool_open().
	 *
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap)
		dsl_dataset_rele(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_rele(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir)
		dsl_dir_rele(dp->dp_free_dir, dp);
	if (dp->dp_leak_dir)
		dsl_dir_rele(dp->dp_leak_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_rele(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	/*
	 * We can't set retry to TRUE since we're explicitly specifying
	 * a spa to flush. This is good enough; any missed buffers for
	 * this spa won't cause trouble, and they'll eventually fall
	 * out of the ARC just like any other unused buffer.
	 */
	arc_flush(dp->dp_spa, FALSE);

	txg_fini(dp);
	dsl_scan_fini(dp);
	dmu_buf_user_evict_wait();

	rrw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_vnrele_taskq);
	if (dp->dp_blkstats)
		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}

dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	objset_t *os;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT0(err);

	/* Initialize scan structures */
	VERIFY0(dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
		    FREE_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	os = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
#ifdef _KERNEL
	zfs_create_fs(os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	rrw_exit(&dp->dp_config_rwlock, FTAG);

	return (dp);
}

/*
 * Account for the meta-objset space in its placeholder dsl_dir.
 */
void
dsl_pool_mos_diduse_space(dsl_pool_t *dp,
    int64_t used, int64_t comp, int64_t uncomp)
{
	ASSERT3U(comp, ==, uncomp);	/* it's all metadata */
	mutex_enter(&dp->dp_lock);
	dp->dp_mos_used_delta += used;
	dp->dp_mos_compressed_delta += comp;
	dp->dp_mos_uncompressed_delta += uncomp;
	mutex_exit(&dp->dp_lock);
}

static void
dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
	VERIFY0(zio_wait(zio));
	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}

static void
dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
{
	ASSERT(MUTEX_HELD(&dp->dp_lock));

	if (delta < 0)
		ASSERT3U(-delta, <=, dp->dp_dirty_total);

	dp->dp_dirty_total += delta;

	/*
	 * Note: we signal even when increasing dp_dirty_total.
	 * This ensures forward progress -- each thread wakes the next waiter.
	 */
	if (dp->dp_dirty_total <= zfs_dirty_data_max)
		cv_signal(&dp->dp_spaceavail_cv);
}

void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_t *mos = dp->dp_meta_objset;
	list_t synced_datasets;

	list_create(&synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Write out all dirty blocks of dirty datasets.
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		/*
		 * We must not sync any non-MOS datasets twice, because
		 * we may have taken a snapshot of them. However, we
		 * may sync newly-created datasets on pass 2.
		 */
		ASSERT(!list_link_active(&ds->ds_synced_link));
		list_insert_tail(&synced_datasets, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * We have written all of the accounted dirty data, so our
	 * dp_space_towrite should now be zero. However, some seldom-used
	 * code paths do not adhere to this (e.g. dbuf_undirty(); there is
	 * also rounding error in dbuf_write_physdone()).
	 * Shore up the accounting of any dirtied space now.
	 */
	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

	/*
	 * After the data blocks have been written (ensured by the zio_wait()
	 * above), update the user/group space accounting.
	 */
	for (ds = list_head(&synced_datasets); ds != NULL;
	    ds = list_next(&synced_datasets, ds)) {
		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
	}

	/*
	 * Sync the datasets again to push out the changes due to
	 * userspace updates. This must be done before we process the
	 * sync tasks, so that any snapshots will have the correct
	 * user accounting information (and we won't get confused
	 * about which blocks are part of the snapshot).
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		ASSERT(list_link_active(&ds->ds_synced_link));
		dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * Now that the datasets have been completely synced, we can
	 * clean up our in-memory structures accumulated while syncing:
	 *
	 * - move dead blocks from the pending deadlist to the on-disk deadlist
	 * - release hold from dsl_dataset_dirty()
	 */
	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
		dsl_dataset_sync_done(ds, tx);
	}
	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
		dsl_dir_sync(dd, tx);
	}

	/*
	 * The MOS's space is accounted for in the pool/$MOS
	 * (dp_mos_dir). We can't modify the mos while we're syncing
	 * it, so we remember the deltas and apply them here.
	 */
	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
	    dp->dp_mos_uncompressed_delta != 0) {
		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
		    dp->dp_mos_used_delta,
		    dp->dp_mos_compressed_delta,
		    dp->dp_mos_uncompressed_delta, tx);
		dp->dp_mos_used_delta = 0;
		dp->dp_mos_compressed_delta = 0;
		dp->dp_mos_uncompressed_delta = 0;
	}

	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		dsl_pool_sync_mos(dp, tx);
	}

	/*
	 * If we modify a dataset in the same txg that we want to destroy it,
	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
	 * and clearing the hold on it) before we process the sync_tasks.
	 * The MOS data dirtied by the sync_tasks will be synced on the next
	 * pass.
	 */
	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
		dsl_sync_task_t *dst;
		/*
		 * No more sync tasks should have been added while we
		 * were syncing.
		 */
		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
			dsl_sync_task_sync(dst, tx);
	}

	dmu_tx_commit(tx);

	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
}

void
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
	zilog_t *zilog;

	while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) != NULL) {
		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
		zil_clean(zilog, txg);
		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
		dmu_buf_rele(ds->ds_dbuf, zilog);
	}
	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_is_initializing(dp->dp_spa));
}

uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_dspace(dp->dp_spa);
	resv = spa_get_slop_space(dp->dp_spa);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}
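
/*
 * For example (illustrative numbers), on a pool with 100GB of dspace and
 * a 3.2GB slop reservation (see spa_get_slop_space()), ordinary allocations
 * see an adjusted size of 96.8GB, while a net-free operation (netfree ==
 * B_TRUE) sees 98.4GB: the reservation is halved so that frees can make
 * progress on a nearly full pool.
 */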

boolean_t
dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	boolean_t rv;

	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_total > zfs_dirty_data_sync)
		txg_kick(dp);
	rv = (dp->dp_dirty_total > delay_min_bytes);
	mutex_exit(&dp->dp_lock);
	return (rv);
}

void
dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
		dsl_pool_dirty_delta(dp, space);
		mutex_exit(&dp->dp_lock);
	}
}

void
dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
{
	ASSERT3S(space, >=, 0);
	if (space == 0)
		return;
	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
		/* XXX writing something we didn't dirty? */
		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
	}
	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
	ASSERT3U(dp->dp_dirty_total, >=, space);
	dsl_pool_dirty_delta(dp, -space);
	mutex_exit(&dp->dp_lock);
}
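
/*
 * upgrade_clones_cb() walks from a head dataset back through its snapshot
 * chain to the point where the chain branches off another filesystem's
 * snapshot (i.e. the dataset is a clone). If there is no such branch point,
 * the chain is attached to the $ORIGIN snapshot instead. Either way, the
 * oldest dataset in the chain is then recorded in the origin snapshot's
 * ds_next_clones_obj ZAP, which is created here if it does not yet exist.
 * Invoked per dataset from dsl_pool_upgrade_clones() below.
 */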
/* ARGSUSED */
static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
		dsl_dataset_phys(ds)->ds_prev_snap_txg =
		    dsl_dataset_phys(prev)->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_num_children++;

		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
			    ds, &ds->ds_prev));
		}
	}

	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);

	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY0(zap_add_int(dp->dp_meta_objset,
	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}
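
/*
 * upgrade_dir_clones_cb() registers each clone in its origin's dd_clones
 * ZAP, creating that ZAP if this is the first clone registered against
 * the origin. Invoked per dataset from dsl_pool_upgrade_dir_clones() below.
 */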
/* ARGSUSED */
static int
upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	dmu_tx_t *tx = arg;
	objset_t *mos = dp->dp_meta_objset;

	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
		dsl_dataset_t *origin;

		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));

		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
			dsl_dir_phys(origin->ds_dir)->dd_clones =
			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
			    0, tx);
		}

		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(origin->ds_dir)->dd_clones,
		    ds->ds_object, tx));

		dsl_dataset_rele(origin, FTAG);
	}
	return (0);
}

void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	uint64_t obj;

	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    FREE_DIR_NAME, &dp->dp_free_dir));

	/*
	 * We can't use bpobj_alloc(), because spa_version() still
	 * returns the old version, and we need a new-version bpobj with
	 * subobj support. So call dmu_object_alloc() directly.
	 */
	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);
	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));

	/* create the origin dir, ds, & snap-ds */
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
}

taskq_t *
dsl_pool_vnrele_taskq(dsl_pool_t *dp)
{
	return (dp->dp_vnrele_taskq);
}

/*
 * Walk through the pool-wide zap object of temporary snapshot user holds
 * and release them.
 */
void
dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
{
	zap_attribute_t za;
	zap_cursor_t zc;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	nvlist_t *holds;

	if (zapobj == 0)
		return;
	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);

	holds = fnvlist_alloc();

	for (zap_cursor_init(&zc, mos, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		char *htag;
		nvlist_t *tags;

		htag = strchr(za.za_name, '-');
		*htag = '\0';
		++htag;
		if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
			tags = fnvlist_alloc();
			fnvlist_add_boolean(tags, htag);
			fnvlist_add_nvlist(holds, za.za_name, tags);
			fnvlist_free(tags);
		} else {
			fnvlist_add_boolean(tags, htag);
		}
	}
	dsl_dataset_user_release_tmp(dp, holds);
	fnvlist_free(holds);
	zap_cursor_fini(&zc);
}

/*
 * Create the pool-wide zap object for storing temporary snapshot holds.
 */
void
dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(dp->dp_tmp_userrefs_obj == 0);
	ASSERT(dmu_tx_is_syncing(tx));

	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
}

static int
dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
    const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	char *name;
	int error;

	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
	ASSERT(dmu_tx_is_syncing(tx));

	/*
	 * If the pool was created prior to SPA_VERSION_USERREFS, the
	 * zap object for temporary holds might not exist yet.
	 */
	if (zapobj == 0) {
		if (holding) {
			dsl_pool_user_hold_create_obj(dp, tx);
			zapobj = dp->dp_tmp_userrefs_obj;
		} else {
			return (SET_ERROR(ENOENT));
		}
	}
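
	/*
	 * Hold entries are keyed as "<dsobj in hex>-<tag>"; e.g. a hold
	 * with the (purely illustrative) tag "recv" on dataset object
	 * 0x54 becomes the ZAP entry "54-recv".
	 * dsl_pool_clean_tmp_userrefs() relies on this format, splitting
	 * at the '-' to recover the dataset and tag.
	 */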
	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
	if (holding)
		error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
	else
		error = zap_remove(mos, zapobj, name, tx);
	strfree(name);

	return (error);
}

/*
 * Add a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    uint64_t now, dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
}

/*
 * Release a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
	    tx, B_FALSE));
}

/*
 * DSL Pool Configuration Lock
 *
 * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 * creation / destruction / rename / property setting). It must be held for
 * read to hold a dataset or dsl_dir. I.e. you must call
 * dsl_pool_config_enter() or dsl_pool_hold() before calling
 * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock
 * must be held continuously until all datasets and dsl_dirs are released.
 *
 * The only exception to this rule is that if a "long hold" is placed on
 * a dataset, then the dp_config_rwlock may be dropped while the dataset
 * is still held. The long hold will prevent the dataset from being
 * destroyed -- the destroy will fail with EBUSY. A long hold can be
 * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 *
 * Legitimate long-holders (including owners) should be long-running,
 * cancelable tasks that should cause "zfs destroy" to fail. This includes
 * DMU consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 * "zfs send", and "zfs diff". There are several other long-holders whose
 * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 *
 * The usual formula for long-holding would be:
 *	dsl_pool_hold()
 *	dsl_dataset_hold()
 *	... perform checks ...
 *	dsl_dataset_long_hold()
 *	dsl_pool_rele()
 *	... perform long-running task ...
 *	dsl_dataset_long_rele()
 *	dsl_dataset_rele()
 *
 * Note that when the long hold is released, the dataset is still held but
 * the pool is not held. The dataset may change arbitrarily during this time
 * (e.g. it could be destroyed). Therefore you shouldn't do anything to the
 * dataset except release it.
 *
 * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
 * or modifying operations.
 *
 * Modifying operations should generally use dsl_sync_task(). The synctask
 * infrastructure enforces proper locking strategy with respect to the
 * dp_config_rwlock. See the comment above dsl_sync_task() for details.
 *
 * Read-only operations will manually hold the pool, then the dataset, obtain
 * information from the dataset, then release the pool and dataset.
 * dmu_objset_{hold,rele}() are convenience routines that also do the pool
 * hold/rele.
 */
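
/*
 * For example, a read-only operation using the routines below might be
 * structured as follows (a sketch; error handling omitted):
 *
 *	dsl_pool_hold(name, FTAG, &dp)
 *	dsl_dataset_hold(dp, name, FTAG, &ds)
 *	... read information from ds ...
 *	dsl_dataset_rele(ds, FTAG)
 *	dsl_pool_rele(dp, FTAG)
 */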

int
dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
{
	spa_t *spa;
	int error;

	error = spa_open(name, &spa, tag);
	if (error == 0) {
		*dp = spa_get_dsl(spa);
		dsl_pool_config_enter(*dp, tag);
	}
	return (error);
}

void
dsl_pool_rele(dsl_pool_t *dp, void *tag)
{
	dsl_pool_config_exit(dp, tag);
	spa_close(dp->dp_spa, tag);
}

void
dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
{
	/*
	 * We use a "reentrant" reader-writer lock, but not reentrantly.
	 *
	 * The rrwlock can (with the track_all flag) track all reading threads,
	 * which is very useful for debugging which code path failed to release
	 * the lock, and for verifying that the *current* thread does hold
	 * the lock.
	 *
	 * (Unlike a rwlock, which knows that N threads hold it for
	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
	 * if any thread holds it for read, even if this thread doesn't.)
	 */
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
}

void
dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
{
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
}

void
dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
{
	rrw_exit(&dp->dp_config_rwlock, tag);
}

boolean_t
dsl_pool_config_held(dsl_pool_t *dp)
{
	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
}

boolean_t
dsl_pool_config_held_writer(dsl_pool_t *dp)
{
	return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
}